我做了一个做矩阵乘法的程序(没有优化)。

for(i=0; i<a_r; i++)
 for(j=0;j<b_c; j++)
  for(k=0;k<a_c; k++)
   c[i][j]=c[i][j]+a[i][k]*b[k][j];

根据我分配内存的方式,计算时间有所不同。

请注意以下三点很重要:
  • 我只记录计算时间,而不记录分配时间
    (也就是说,没有把分配时间和计算时间
    加在一起统计)。
  • 我在计算之前用随机数初始化矩阵。我
    使用巨大的矩阵(2500 int × 2500 int)。我选择这个尺寸是为了用到
    RAM内存,但不会用到交换空间。
  • 如果没有用满RAM,该现象将消失。

  • 我用三种不同的配置来测试该程序:

    测试1 :全局分配
    Int a[2500][2500], b[2500][2500], c[2500][2500];
    Main {matrix multiplication a*b=c}
    

    计算时间非常恒定(大约41s)。

    测试2 :动态分配(数组)
    Main{
    int **a,**b,**c;
    a=(int **) malloc(sizeof(int *)*2500);
    for( i=0;i<a_c; i++)
     a[i]=(int *) malloc(sizeof(int)*2500);
    …
    matrix multiplication a*b=c
    }
    

    当我连续多次执行该程序时,会得到以下运行时间:260秒、180秒、110秒、110秒…
    如果我等待约5秒钟,然后再次启动该程序,我将获得相同的结果。

    测试3 :动态分配(单个大数组)
    Main{
    int *a, *b, *c;
    a=(int *) malloc(sizeof(int)*2500*2500);
    … (same for b and c)
    matrix multiplication a*b=c
    }
    

    计算时间非常恒定(大约44s)。

    我认为测试2效率较低的原因是数据在内存中的存储方式,就像这篇article(网站已关闭)或这个question中解释的那样:以某些方式遍历内存更高效。因此,以某种方式分配内存可以使程序更高效。

    但是(在测试2中),我不知道为什么该程序会随着时间的推移变得更快。有谁有办法解释这种现象?提前致谢。

    P.S.我在具有CentOS 6.3和内核Linux 3.11.1的Intel Xeon E5-2625上进行了这些测试。
    编辑:我的计算机上禁用了频率缩放。频率是恒定的。

    这是代码:
    #include <stdlib.h>
    #include <stdio.h>
    #include <time.h>
    #include <float.h>
    #include <sys/time.h>
    #include <sys/mman.h>
    #include <sys/resource.h>
    
    /****************************************************
     * Random: generate a pseudo-random int in [min, max],
     * both bounds inclusive.  Uses rand(), so call srand()
     * first to get a different sequence each run.
     *
     * BUG FIX: the original parameter names _iMin/_iMax
     * (underscore followed by an uppercase letter) are
     * reserved identifiers per C11 7.1.3; renamed.
     * Also guards against an inverted range, where
     * (max - min + 1) <= 0 would make the modulo UB.
     *
     * Note: the modulo mapping is slightly biased unless
     * (max - min + 1) divides RAND_MAX + 1 evenly; that is
     * fine for this benchmark's matrix initialization.
     */
    int Random(int min, int max)
    {
        if (max < min)  /* swap so the range below is always positive */
        {
            int tmp = min;
            min = max;
            max = tmp;
        }
        return (min + (rand() % (max - min + 1)));
    }
    
    /****************************************************
     * Return the struct of getrusage for the calling
     * process (user/system CPU time, page faults, ...).
     *
     * BUG FIX: the original ignored getrusage()'s return
     * value and left `ru` uninitialized, so a failing call
     * returned indeterminate memory to the caller.  The
     * struct is now zero-initialized and the error is
     * reported; timing stays best-effort.
     */
    struct rusage give_ru()
    {
        struct rusage ru = {0};  /* all-zero fields if the call below fails */
        if (getrusage(RUSAGE_SELF, &ru) != 0)
            perror("getrusage");
        //getrusage(RUSAGE_CHILDREN, &ru);

        return ru;
    }
    
    /****************************************************
     * Do a matrix multiplication a*b=c
     * This program isn't optimized.
     */

    /* Allocate a rows x cols matrix as an array of row pointers.
     * `name` is only used in the failure message.  On allocation
     * failure prints a message and exits with a failure status
     * (BUG FIX: the original called exit(0), reporting success). */
    static int **alloc_matrix(int rows, int cols, const char *name)
    {
        int **m = malloc(sizeof *m * rows);
        if (m == NULL)
        {
            printf("Allocation of matrix %s failed \n", name);
            exit(EXIT_FAILURE);
        }
        for (int i = 0; i < rows; i++)
        {
            m[i] = malloc(sizeof *m[i] * cols);
            if (m[i] == NULL)
            {
                printf("Allocation of matrix %s[%d] failed \n", name, i);
                exit(EXIT_FAILURE);
            }
        }
        return m;
    }

    /* Fill an already-allocated rows x cols matrix with random ints in [0,100]. */
    static void fill_matrix(int **m, int rows, int cols)
    {
        for (int i = 0; i < rows; i++)
            for (int j = 0; j < cols; j++)
                m[i][j] = Random(0, 100);
    }

    /* Release a matrix allocated by alloc_matrix (rows row arrays, then
     * the pointer array itself). */
    static void free_matrix(int **m, int rows)
    {
        for (int i = 0; i < rows; i++)
            free(m[i]);
        free(m);
    }

    int main()
    {
        /*
         * Initialize variables and array sizes.
         */
        int **a, **b, **c;
        printf("\nenter Taille de la matrice : 2500");
        int a_r = 2500, a_c = 2500, b_r = 2500, b_c = 2500;
        clock_t start, end;          /* coarse CPU-clock timing */
        struct rusage start1, end1;  /* getrusage snapshots around the multiply */
        double dif;                  /* elapsed seconds between start and end */
        int i, j, k;

        /* a*b is only defined when a's column count equals b's row count.
         * BUG FIX: the original did "goto again" to a label that does not
         * exist anywhere in the file (a compile error); fail cleanly. */
        if (a_c != b_r)
        {
            printf("\ncan not multiply");
            return 1;
        }

        /*
         * Memory allocation.
         * BUG FIX: the original sized the row-pointer loops with the column
         * counts (a_c, b_c) — harmless for square matrices but wrong in
         * general.  Rows now use *_r and columns use *_c, and the product
         * matrix c gets a_r rows x b_c columns.
         */
        start = clock();
        a = alloc_matrix(a_r, a_c, "a");
        b = alloc_matrix(b_r, b_c, "b");
        c = alloc_matrix(a_r, b_c, "c");

        /* Memory is locked so paging cannot distort the measurement. */
        printf("Starting mlockall.\n");
        if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0) {
            perror("mlockall");
            return 1;
        }

        /*
         * Matrix initialization (with random integer).
         */
        printf("Initializing matrices...\n");

        //initializing first matrix
        srand(time(NULL));
        fill_matrix(a, a_r, a_c);
        //initializing second matrix
        srand(time(NULL));
        fill_matrix(b, b_r, b_c);
        /* initialize product matrix with random values, as the original did;
         * the multiply below accumulates on top of them */
        srand(time(NULL));
        fill_matrix(c, a_r, b_c);

        end = clock(); //end the timer
        dif = ((double) (end - start)) / CLOCKS_PER_SEC; //store the difference
        printf("Malloc and initialization took %f sec. time.\n", dif);


        /*
         * Matrix Multiplication.
         */
        start = clock();    //start the timer
        start1 = give_ru(); //start the resource-usage snapshot
        /* multiply matrix one and matrix two.
         * BUG FIX: the original looped j over a_c and k over b_c — the middle
         * and inner bounds were swapped (only coincidentally correct for
         * square matrices).  Correct is j over b_c (result columns) and
         * k over a_c (the shared inner dimension). */
        for (i = 0; i < a_r; i++)
            for (j = 0; j < b_c; j++)
                for (k = 0; k < a_c; k++)
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];

        end1 = give_ru(); //end the resource-usage snapshot
        end = clock();    //end the timer

        dif = ((double) (end - start)) / CLOCKS_PER_SEC; //store the difference

        /* Convert the getrusage timevals to fractional seconds. */
        struct timeval timeUserStart = start1.ru_utime;
        double utimeStart = (double)timeUserStart.tv_sec + (double)timeUserStart.tv_usec / 1000000.0;
        struct timeval timeSystemStart = start1.ru_stime;
        double stimeStart = (double)timeSystemStart.tv_sec + (double)timeSystemStart.tv_usec / 1000000.0;

        struct timeval timeUserEnd = end1.ru_utime;
        double utimeEnd = (double)timeUserEnd.tv_sec + (double)timeUserEnd.tv_usec / 1000000.0;
        struct timeval timeSystemEnd = end1.ru_stime;
        double stimeEnd = (double)timeSystemEnd.tv_sec + (double)timeSystemEnd.tv_usec / 1000000.0;

        double difuTime = utimeEnd - utimeStart;
        double difsTime = stimeEnd - stimeStart;

        /* Process-lifetime fault counts (not deltas), as in the original.
         * BUG FIX: these locals were previously computed but never used;
         * the printfs below now print through them. */
        long pageReclaims = end1.ru_minflt;
        long pageFaults = end1.ru_majflt;

        printf("Parallelization took %f sec. time.\n", dif);
        printf("Time User : %f .\n", difuTime);
        printf("Time System : %f .\n", difsTime);
        printf("Page reclaims : %ld .\n", pageReclaims);
        printf("Page faults : %ld .\n", pageFaults);

        /*
         * free memory.
         */
        printf("Finished, cleaning up \n");
        munlockall();

        /* BUG FIX: the original free loops also used mismatched bounds
         * (a_c rows freed for b, b_c rows for c); each matrix is now
         * freed with its own row count. */
        free_matrix(a, a_r);
        a = NULL;
        free_matrix(b, b_r);
        b = NULL;
        free_matrix(c, a_r);
        c = NULL;

        return 0;
    }
    

    最佳答案

    指针数组需要额外的间接级别,因此会更慢。此外,这可能会导致更多的高速缓存未命中。

    所以我尝试了1000 * 1000矩阵:

    测试2(int **)

    $ perf stat ./test2
    Performance counter stats for './test1':
    
       8561,688348 task-clock (msec)         #    1,000 CPUs utilized
                13 context-switches          #    0,002 K/sec
                 9 cpu-migrations            #    0,001 K/sec
             3 058 page-faults               #    0,357 K/sec
    24 844 304 630 cycles                    #    2,902 GHz                     [83,32%]
    21 302 837 742 stalled-cycles-frontend   #   85,75% frontend cycles idle    [83,32%]
     2 110 745 845 stalled-cycles-backend    #    8,50% backend  cycles idle    [66,68%]
     7 030 427 722 instructions              #    0,28  insns per cycle
                                             #    3,03  stalled cycles per insn [83,36%]
     1 004 889 984 branches                  #  117,371 M/sec                   [83,37%]
         1 077 360 branch-misses             #    0,11% of all branches         [83,32%]
    
       8,561758966 seconds time elapsed
    

    测试3(int *)
    $ perf stat ./test3
     Performance counter stats for './test2':
    
       1367,856713 task-clock (msec)         #    0,995 CPUs utilized
               195 context-switches          #    0,143 K/sec
                 3 cpu-migrations            #    0,002 K/sec
             3 001 page-faults               #    0,002 M/sec
     3 977 759 335 cycles                    #    2,908 GHz                     [83,41%]
       975 477 913 stalled-cycles-frontend   #   24,52% frontend cycles idle    [83,41%]
       179 003 530 stalled-cycles-backend    #    4,50% backend  cycles idle    [66,81%]
     7 017 803 848 instructions              #    1,76  insns per cycle
                                             #    0,14  stalled cycles per insn [83,41%]
     1 002 321 021 branches                  #  732,768 M/sec                   [83,42%]
         1 046 066 branch-misses             #    0,10% of all branches         [82,97%]
    
       1,374613079 seconds time elapsed
    

    如我们所见,测试2(int **)消耗了多得多的CPU周期(cycles)。

    我还检查了缓存未命中(cache miss)的情况:

    测试2(int **)
    $ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses'  ./test2
    Performance counter stats for './test1':
    
     3 009 028 415 L1-dcache-loads
     1 259 622 058 L1-dcache-load-misses     #   41,86% of all L1-dcache hits
         6 427 561 L1-dcache-stores
         1 141 461 L1-dcache-store-misses
    
       8,650905202 seconds time elapsed
    

    测试3(int *)
    $ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses'  ./test3
    Performance counter stats for './test2':
    
     2 004 807 223 L1-dcache-loads
       596 388 192 L1-dcache-load-misses     #   29,75% of all L1-dcache hits
         2 111 264 L1-dcache-stores
           384 198 L1-dcache-store-misses
    
       1,384352257 seconds time elapsed
    

    10-07 13:25
    查看更多