我想监控最后一级缓存中的缓存请求数。我根据教程 here 编写了一个 Linux 模块来获取该信息。
可以编译运行,但是输出结果总是0。也就是说,我在使用rdmsr
的时候,总是给我edx=0,eax=0。我什至尝试了 tutorial 中的演示代码,输出仍然是 0。
我被这个问题困了整整一周。谁能帮我指出我在程序中犯的错误?
我知道有一些现有的程序在做同样的事情,但我必须知道如何自己编写代码,因为我想在 Xen hypervisor 中监控缓存请求。我不能在 Xen 中使用这些工具,除非我将这些工具合并到 Xen 的虚拟机管理程序中,这似乎更有效。
/*
* Record the cache miss rate of Intel Sandybridge cpu
* To confirm the event is correctly set!
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
/*4 Performance Counters Selector for %ecx in insn wrmsr*/
#define PERFEVTSEL0 0x186
#define PERFEVTSEL1 0x187
#define PERFEVTSEL2 0x188
#define PERFEVTSEL3 0x189
/*4 MSR Performance Counter for the above selector*/
#define PMC0 0xc1
#define PMC1 0xc2
#define PMC2 0xc2
#define PMC3 0xc3
/*Intel Software Developer Manual Page 2549*/ /*L1I L1D cache events has not been confirmed!*/
/*L1 Instruction Cache Performance Tuning Events*/
#define L1I_ALLHIT_EVENT 0x80
#define L1I_ALLHIT_MASK 0x01
#define L1I_ALLMISS_EVENT 0x80 /*confirmed*/
#define L1I_ALLMISS_MASK 0x02 /*confirmed*/
/*L1 Data Cache Performance Tuning Events*/
/*Intel does not have the ALLREQ Miss mask; have to add LD_miss and ST_miss*/
#define L1D_ALLREQ_EVENT 0x43
#define L1D_ALLREQ_MASK 0x01
#define L1D_LDMISS_EVENT 0x40
#define L1D_LDMISS_MASK 0x01
#define L1D_STMISS_EVENT 0x28
#define L1D_STMISS_MASK 0x01
/*L2 private cache for each core*/ /*confirmed*/
#define L2_ALLREQ_EVENT 0x24
#define L2_ALLREQ_MASK L2_ALLCODEREQ_MASK /*0xFF*/
#define L2_ALLMISS_EVENT 0x24
#define L2_ALLMISS_MASK L2_ALLCODEMISS_MASK /*0xAA*/
#define L2_ALLCODEREQ_MASK 0x30
#define L2_ALLCODEMISS_MASK 0x20
/*L3 shared cache*/ /*confirmed*/
/*Use the last level cache event and mask*/
#define L3_ALLREQ_EVENT 0x2E
#define L3_ALLREQ_MASK 0x4F
#define L3_ALLMISS_EVENT 0x2E
#define L3_ALLMISS_MASK 0x41
#define USR_BIT (0x01UL << 16)
#define OS_BIT (0x01UL << 17)
#define SET_MSR_USR_BIT(eax) eax |= USR_BIT
#define CLEAR_MSR_USR_BIT(exa) eax &= (~USR_BIT)
#define SET_MSR_OS_BIT(eax) eax |= OS_BIT
#define CLEAR_MSR_OS_BIT(eax) eax &= (~OS_BIT)
#define SET_EVENT_MASK(eax, event, umask) eax |= (event | (umask << 8))
/*MSR EN flag: when set start the counter!*/
//#define MSR_ENFLAG (0x1<<22)
#define MSR_ENFLAG (0x1<<22)
/* 32bit insn v3*/
static inline void rtxen_write_msr(uint32_t eax, uint32_t ecx)
{
/*clear counter first*/
__asm__ __volatile__ ("movl %0, %%ecx\n\t"
"xorl %%edx, %%edx\n\t"
"xorl %%eax, %%eax\n\t"
"wrmsr\n\t"
: /* no outputs */
: "m" (ecx)
: "eax", "ecx", "edx" /* all clobbered */);
eax |= MSR_ENFLAG;
__asm__("movl %0, %%ecx\n\t" /* ecx contains the number of the MSR to set */
"xorl %%edx, %%edx\n\t"/* edx contains the high bits to set the MSR to */
"movl %1, %%eax\n\t" /* eax contains the log bits to set the MSR to */
"wrmsr\n\t"
: /* no outputs */
: "m" (ecx), "m" (eax)
: "eax", "ecx", "edx" /* clobbered */);
}
static inline void rtxen_read_msr(uint32_t* ecx, uint32_t *eax, uint32_t* edx)
{ __asm__ __volatile__(\
"rdmsr"\
:"=d" (*edx), "=a" (*eax)\
:"c"(*ecx)
);
}
static inline void delay(void )
{
char tmp[1000];
int i;
for( i = 0; i < 1000; i++ )
{
tmp[i] = i * 2;
}
}
enum cache_level
{
UOPS,
L1I,
L1D,
L2,
L3
};
int init_module(void)
{
enum cache_level op;
uint32_t eax, edx, ecx;
uint64_t l3_all;
op = UOPS;
switch(op)
{
case UOPS:
eax = 0x0001010E;
eax |= MSR_ENFLAG;
ecx = 0x187;
printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
ecx = 0xc2;
eax = 1;
edx = 2;
rtxen_read_msr(&ecx, &eax, &edx);
printk(KERN_INFO "UOPS Demo: read_msr: edx=%#010x, eax=%#010x\n", edx, eax);
break;
case L3:
eax = 0;
SET_MSR_USR_BIT(eax);
SET_MSR_OS_BIT(eax);
SET_EVENT_MASK(eax, L3_ALLREQ_EVENT, L3_ALLREQ_MASK);
eax |= MSR_ENFLAG;
ecx = PERFEVTSEL2;
printk(KERN_INFO "before wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
printk(KERN_INFO "after wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
printk(KERN_INFO "L3 all request set MSR PMC2\n");
printk(KERN_INFO "delay by access an array\n");
delay();
ecx = PMC2;
eax = 1;
edx = 2;
printk(KERN_INFO "rdmsr: ecx=%#010x\n", ecx);
rtxen_read_msr(&ecx, &eax, &edx); /*need to pass into address!*/
l3_all = ( ((uint64_t) edx << 32) | eax );
printk(KERN_INFO "rdmsr: L3 all request is %llu (%#010lx)\n", l3_all, (unsigned long)l3_all);
break;
default:
printk(KERN_INFO "operation not implemented yet\n");
}
/*
* A non 0 return means init_module failed; module can't be loaded.
*/
return 0;
}
void cleanup_module(void)
{
printk(KERN_INFO "Goodbye world 1.\n");
}
我的结果是:
[ 1780.946584] UOPS Demo: write_msr: eax=0x0001010e, ecx=0x00000187
[ 1780.946590] UOPS Demo: read_msr: edx=0x00000000, eax=0x00000000
[ 1818.595055] Goodbye world 1.
[ 1821.153947] UOPS Demo: write_msr: eax=0x0041010e, ecx=0x00000187
[ 1821.153950] UOPS Demo: read_msr: edx=0x00000000, eax=0x00000000
最佳答案
我终于在@Manuel Selva 的帮助下解决了它!
设置性能的正确流程。计数器是:
步骤1:设置msr并通过设置eax中的EN位使能计数器;
第 2 步:通过写入 msr 来停止计数器
第 3 步:读取计数器
我错过了第 2 步,这就是为什么它总是给我 0。如果我想在停止之前读取计数器,报告 0 是有意义的。
switch语句的正确代码如下:
switch(op)
{
case UOPS:
eax = 0x0051010E;
eax |= MSR_ENFLAG;
ecx = 0x187;
printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
//stop counting
eax = 0x0011010E;
rtxen_write_msr(eax,ecx);
ecx = 0xc2;
eax = 1;
edx = 2;
rtxen_read_msr(&ecx, &eax, &edx);
printk(KERN_INFO "UOPS Demo: read_msr: edx=%#010x, eax=%#010x\n", edx, eax);
break;
case L3:
eax = 0;
SET_MSR_USR_BIT(eax);
SET_MSR_OS_BIT(eax);
SET_EVENT_MASK(eax, L3_ALLREQ_EVENT, L3_ALLREQ_MASK);
eax |= MSR_ENFLAG;
eax |= (1<<20); //INT bit: counter overflow
ecx = PERFEVTSEL2;
printk(KERN_INFO "before wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
printk(KERN_INFO "after wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
printk(KERN_INFO "L3 all request set MSR PMC2\n");
printk(KERN_INFO "delay by access an array\n");
delay();
eax &= (~MSR_ENFLAG);
rtxen_write_msr(eax, ecx);
printk(KERN_INFO "stop the counter, eax=%#010x\n", eax);
ecx = PMC2;
eax = 1;
edx = 2;
printk(KERN_INFO "rdmsr: ecx=%#010x\n", ecx);
rtxen_read_msr(&ecx, &eax, &edx); /*need to pass into address!*/
l3_all = ( ((uint64_t) edx << 32) | eax );
printk(KERN_INFO "rdmsr: L3 all request is %llu (%#010lx)\n", l3_all, (unsigned long)l3_all);
break;
default:
printk(KERN_INFO "operation not implemented yet\n");
}
关于c - Linux 模块 : performance counter does not work,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/21652256/