我想测量访问一个表条目和在clflush之后访问另一个条目之间的时间差。
下面是我的尝试。我几乎观察不到上述两种操作之间的时间差(惩罚)。表的长度为256,每个条目为8位。我怀疑我的刷新(clflush)没有正常工作。我使用gcc的-O3选项进行编译。

            #include <stdio.h>
            #include <stdlib.h>
            #include <stdint.h>
            /* Element count of a true array (the argument must not be a pointer).
             * NOTE(review): not referenced anywhere in the code shown here. */
            #define ARRAYSIZE(arr) (sizeof(arr)/sizeof(arr[0]))

            /* Number of table lookups done by func() per call, and the length of
             * the temp buffers in test()/testflush(). */
            #define REPEAT 10000

            /* 256-entry byte lookup table. func() reads it at index rand()%256 and
             * testflush() passes it to flushCache().
             * NOTE(review): no alignment is requested, so the table is not
             * guaranteed to start on a cache-line boundary. */
            unsigned char table[256]={103,198,105,115,81,255,74,236,41,205,186,171,242,251,227,70,124,194,84,248,27,232,231,141,118,90,46,99,51,159,201,154,102,50,13,183,49,88,163,90,37,93,5,23,88,233,94,212,171,178,205,198,155,180,84,17,14,130,116,65,33,61,220,135,112,233,62,161,65,225,252,103,62,1,126,151,234,220,107,150,143,56,92,42,236,176,59,251,50,175,60,84,236,24,219,92,2,26,254,67,251,250,170,58,251,41,209,230,5,60,124,148,117,216,190,97,137,249,92,187,168,153,15,149,177,235,241,179,5,239,247,0,233,161,58,229,202,11,203,208,72,71,100,189,31,35,30,168,28,123,100,197,20,115,90,197,94,75,121,99,59,112,100,36,17,158,9,220,170,212,172,242,27,16,175,59,51,205,227,80,72,71,21,92,187,111,34,25,186,155,125,245,11,225,26,28,127,35,248,41,248,164,27,19,181,202,78,232,152,50,56,224,121,77,61,52,188,95,78,119,250,203,108,5,172,134,33,43,170,26,85,162,190,112,181,115,59,4,92,211,54,148,179,175,226,240,228,158,79,50,21,73,253,130,78,169};



            /*
             * Flush the cache line containing *p (x86 CLFLUSH instruction).
             *
             * p: any address inside the line to flush; the stored value is not
             *    changed.
             *
             * Spelled with __asm__/__volatile__ so it also builds in strict ISO
             * modes (-std=c99/-std=c11), where plain `asm` is not a keyword.
             * `static inline` gives the function a real definition in this
             * translation unit, so a call that is not inlined (e.g. at -O0) does
             * not become an undefined external reference.
             */
            static inline void clflush(volatile void *p)
            {
                /* The "+m" operand tells the compiler that the pointed-to memory
                 * is touched, so accesses to it are not moved across the flush. */
                __asm__ __volatile__ ("clflush %0" : "+m" (*(volatile char *)p));
            }

            /*
             * Read the CPU time-stamp counter after a serializing CPUID.
             *
             * Returns the 64-bit TSC value (EDX:EAX as produced by RDTSC).
             *
             * Changes from the naive form:
             *  - CPUID is given a fixed leaf (EAX = 0) instead of whatever happened
             *    to be in EAX, so every call executes the same CPUID request.
             *  - The "memory" clobber stops the compiler from moving loads/stores
             *    of the code being timed across the timestamp.
             *  - __asm__/__volatile__ and `static inline` keep the function
             *    buildable in strict ISO modes and linkable when not inlined.
             */
            static inline uint64_t rdtsc(void)
            {
                uint32_t lo, hi;

                __asm__ __volatile__ ("cpuid\n\t"
                                      "rdtsc"
                                      : "=a" (lo), "=d" (hi)
                                      : "0" (0)            /* CPUID leaf 0 in EAX */
                                      : "ebx", "ecx", "memory");
                return (uint64_t)lo | ((uint64_t)hi << 32);
            }

            /*
             * Perform REPEAT lookups into table at rand()%256 and store each value
             * read into a[0..REPEAT-1].
             *
             * a: output buffer with room for at least REPEAT ints.
             *
             * Returns the number of lookups performed (REPEAT). The original
             * version was declared int but fell off the end without returning a
             * value, which is undefined behavior as soon as a caller uses the
             * result.
             */
            static inline int func(int *a)
            {
                int i;

                for (i = 0; i < REPEAT; i++) {
                    a[i] = (int)table[rand() % 256];
                }
                return i;
            }
            /*
             * Issue clflush on five addresses spaced 64 bytes apart:
             * start, start+64, start+128, start+192 and start+256.
             *
             * start: first byte of the region to flush.
             *
             * NOTE(review): the 64-byte stride is presumably the cache-line size of
             * the target CPU; the last address (start+256) lies one past a
             * 256-byte table -- confirm that this is intended.
             */
            void flushCache(unsigned char *start)
            {
                unsigned int offset;

                for (offset = 0; offset <= 256; offset += 64) {
                    clflush(start + offset);
                }
            }


            /*
             * Time one call to func() (REPEAT table lookups) with rdtsc() and print
             * the elapsed tick count.
             *
             * Fixes relative to the original:
             *  - the post-processing loop started with the no-op expression `i-0`
             *    instead of `i=0`;
             *  - that loop only rewrote temp[], which is never read again, so the
             *    optimizer was free to drop it together with the table loads; the
             *    values are now folded into a volatile sink, which must be written;
             *  - the tick count is printed with a format that matches its type;
             *  - the unused local `c` is gone and the function is `static inline`
             *    so it links even when the call is not inlined.
             */
            static inline void test(void)
            {
                int i;
                int checksum = 0;
                uint64_t start, end;
                int temp[REPEAT];
                volatile int sink;

                start = rdtsc();

                func(temp);

                end = rdtsc();

                /* Consume every value that was looked up so the lookups cannot be
                 * optimized away. Each entry is at most 255, so the sum fits. */
                for (i = 0; i < REPEAT; i++) {
                    checksum += temp[i];
                }
                sink = checksum;
                (void)sink;

                printf("%llu ticks\n", (unsigned long long)(end - start));
            }

            /*
             * Time one call to func() (REPEAT table lookups) with rdtsc(), print
             * the elapsed tick count, and flush table from the cache.
             *
             * Fixes relative to the original:
             *  - the end timestamp is taken BEFORE flushCache(), so the cost of the
             *    clflush instructions is no longer counted as lookup time; the
             *    flush happens once per call (not after every read) and affects
             *    whatever accesses table next;
             *  - the post-processing loop started with the no-op expression `i-0`
             *    instead of `i=0`, and only rewrote temp[], which is never read
             *    again; the values are now folded into a volatile sink;
             *  - the tick count is printed with a format that matches its type;
             *  - the unused local `c` is gone and the function is `static inline`
             *    so it links even when the call is not inlined.
             */
            static inline void testflush(void)
            {
                int i;
                int checksum = 0;
                uint64_t start, end;
                int temp[REPEAT];
                volatile int sink;

                start = rdtsc();

                func(temp);

                end = rdtsc();

                /* Evict the table outside the timed region. */
                flushCache(table);

                /* Consume every value that was looked up so the lookups cannot be
                 * optimized away. Each entry is at most 255, so the sum fits. */
                for (i = 0; i < REPEAT; i++) {
                    checksum += temp[i];
                }
                sink = checksum;
                (void)sink;

                printf("%llu ticks\n", (unsigned long long)(end - start));
            }



            /*
             * Driver: run test(), then testflush(), then test() again, printing a
             * status line after each of the first two runs.
             * The command-line arguments are not used.
             */
            int main(int ac, char **av)
            {
                (void)ac;
                (void)av;

                test();
                fputs("Tables in cache!\n", stdout);

                testflush();
                fputs("Tables evicted from cache.\n", stdout);

                test();

                return 0;
            }

更新:我知道这可能是表访问方式造成的问题。下面是另一段代码,它只逐出单个变量而不是整张表。这段代码显示,使用clflush()时时钟周期数有显著变化。这是否意味着clflush()工作正常,而增加的时间是由于需要从内存访问该变量造成的?
            #include <stdint.h>
            #include <stdio.h>
            #define REPEAT 100000
            /*
             * Flush the cache line containing *p (x86 CLFLUSH instruction).
             *
             * p: any address inside the line to flush; the stored value is not
             *    changed.
             *
             * Spelled with __asm__/__volatile__ so it also builds in strict ISO
             * modes (-std=c99/-std=c11), where plain `asm` is not a keyword.
             * `static inline` gives the function a real definition in this
             * translation unit, so a call that is not inlined (e.g. at -O0) does
             * not become an undefined external reference.
             */
            static inline void clflush(volatile void *p)
            {
                /* The "+m" operand tells the compiler that the pointed-to memory
                 * is touched, so accesses to it are not moved across the flush. */
                __asm__ __volatile__ ("clflush %0" : "+m" (*(volatile char *)p));
            }

            /*
             * Read the CPU time-stamp counter.
             *
             * Returns the 64-bit TSC value (EDX:EAX as produced by RDTSC).
             *
             * An LFENCE is issued first so that RDTSC does not execute until the
             * preceding instructions (in particular the load being timed) have
             * completed; without it the CPU may read the counter early and hide
             * part of a cache-miss latency. The "memory" clobber keeps the
             * compiler from moving memory accesses across the timestamp.
             * __asm__/__volatile__ and `static inline` keep the function
             * buildable in strict ISO modes and linkable when not inlined.
             */
            static inline uint64_t rdtsc(void)
            {
                uint32_t lo, hi;

                __asm__ __volatile__ ("lfence\n\t"
                                      "rdtsc"
                                      : "=a" (lo), "=d" (hi)
                                      :
                                      : "memory");
                return (uint64_t)lo | ((uint64_t)hi << 32);
            }

            /* Variable whose load is timed in test() and testflush(); testflush()
             * also flushes its cache line. volatile so that each `i+1` in the
             * timing loops performs an actual read. */
            volatile int i;

            /*
             * Time REPEAT individual loads of the global `i` and print the summed
             * tick count. Nothing is flushed here.
             *
             * Fixes relative to the original:
             *  - `static inline` so the function links even when the call is not
             *    inlined (a plain `inline` definition provides no external symbol);
             *  - the total is printed with a format that matches its type on every
             *    platform (`%lu` only matches uint64_t where long is 64 bits);
             *  - the dead local `k` is removed (the read-back of j is kept);
             *  - the accumulator no longer reuses the name of the standard
             *    library function clock().
             */
            static inline void test(void)
            {
                uint64_t start, end, total;
                volatile int j;
                long int rep;

                total = 0;
                for (rep = 0; rep < REPEAT; rep++) {
                    start = rdtsc();
                    j = i + 1;          /* the timed access: one load of i */
                    end = rdtsc();
                    total += end - start;
                    (void)j;            /* read j back, outside the timed region */
                }
                printf("took %llu ticks\n", (unsigned long long)total);
            }

            /*
             * Time REPEAT individual loads of the global `i`, flushing i's cache
             * line after every timed load, and print the summed tick count.
             * The flush is issued after the end timestamp, so its own cost is not
             * part of the measured interval.
             *
             * Fixes relative to the original:
             *  - `static inline` so the function links even when the call is not
             *    inlined (a plain `inline` definition provides no external symbol);
             *  - the total is printed with a format that matches its type on every
             *    platform (`%lu` only matches uint64_t where long is 64 bits);
             *  - the dead local `k` is removed (the read-back of j is kept);
             *  - the accumulator no longer reuses the name of the standard
             *    library function clock().
             */
            static inline void testflush(void)
            {
                uint64_t start, end, total;
                volatile int j;
                long int rep;

                total = 0;
                for (rep = 0; rep < REPEAT; rep++) {
                    start = rdtsc();
                    j = i + 1;          /* the timed access: one load of i */
                    end = rdtsc();
                    clflush(&i);        /* evict i before the next iteration */
                    total += end - start;
                    (void)j;            /* read j back, outside the timed region */
                }
                printf("took %llu ticks\n", (unsigned long long)total);
            }


            /*
             * Driver: set i to 5, then run test(), testflush() and test() again,
             * printing a separator line before each run.
             * The command-line arguments are not used.
             */
            int main(int ac, char **av)
            {
                static const char separator[] = "------------------------------------------\n";

                (void)ac;
                (void)av;

                i = 5;

                fputs(separator, stdout);
                test();

                fputs(separator, stdout);
                testflush();

                fputs(separator, stdout);
                test();

                return 0;
            }

最佳答案

我在代码中看到了一些问题。
在testflush中,计时器是在调用clflush之后才停止的,因此您也把执行这些clflush指令所需的周期计算在内了。我认为这不是您的本意。
在测试函数中,有一个迭代10000次的循环。每次迭代都可能引用一个新的缓存线,但是table中只有4个缓存线,因此至少有9996次迭代不会发生缓存未命中。
因此,您实际计时的是10000次rand()%256加上4次缓存线加载。即使每次缓存线加载需要几百个周期,10000次rand()%256的开销仍然会把它掩盖掉。
生成的10000个整数也必须写回。我不确定L1->L2缓存带宽是否是一个限制因素,但它可能是。
您还需要运行几千次左右的测试,然后取平均值,否则样本方差太高。
此外,在您请求缓存线之前,CPU可能会通过推测执行再次预取该缓存线。这种行为是允许的,但我不知道现在的CPU在这方面有多聪明。

08-16 02:57