c - 而(n> 1)比(n)快25％？

我有两个逻辑上等效的函数：

long ipow1(int base, int exp) {
    // HISTORICAL NOTE:
    // This wasn't here in the original question, I edited it in,
    if (exp == 0) return 1;

    long result = 1;

    while (exp > 1) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
    }

    return result * base;
}

long ipow2(int base, int exp) {
    long result = 1;

    while (exp) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
    }

    return result;
}

注意：

这些循环是等效的，因为在前一种情况下，我们返回result * base（处理在exp为或已被简化为1的情况下），但在第二种情况下，我们将返回result。

奇怪的是，-O3和-O0 ipow1的结果因此都比ipow2高出约25％。这怎么可能？

我在Windows 7，x64，gcc 4.5.2上，并使用gcc ipow.c -O0 -std=c99进行编译。

这是我的分析代码：

int main(int argc, char *argv[]) {
    LARGE_INTEGER ticksPerSecond;
    LARGE_INTEGER tick;
    LARGE_INTEGER start_ticks, end_ticks, cputime;

    double totaltime = 0;
    int repetitions = 10000;
    int rep = 0;
    int nopti = 0;

    for (rep = 0; rep < repetitions; rep++) {
        if (!QueryPerformanceFrequency(&ticksPerSecond)) printf("\tno go QueryPerformance not present");
        if (!QueryPerformanceCounter(&tick)) printf("no go counter not installed");
        QueryPerformanceCounter(&start_ticks);

        /* start real code */

        for (int i = 0; i < 55; i++) {
            for (int j = 0; j < 11; j++) {
                nopti = ipow1(i, j); // or ipow2
            }
        }

        /* end code */

        QueryPerformanceCounter(&end_ticks);
        cputime.QuadPart = end_ticks.QuadPart - start_ticks.QuadPart;
        totaltime += (double)cputime.QuadPart / (double)ticksPerSecond.QuadPart;
    }

    printf("\tTotal elapsed CPU time:   %.9f  sec  with %d repetitions - %ld:\n", totaltime, repetitions, nopti);

    return 0;
}

最佳答案

如果您不想阅读所有这些跳到底部的内容，仅通过分析代码，我就会得出21％的差异。

不同的系统，不同的编译器版本，由不同的人/发行者构建的相同的编译器版本将给出不同的指令组合，这只是您可能会得到的一个示例。

long ipow1(int base, int exp) {
    long result = 1;

    while (exp > 1) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
    }

    return result * base;
}

long ipow2(int base, int exp) {
    long result = 1;

    while (exp) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
    }

    return result;
}

0000000000000000 <ipow1>:
   0:   83 fe 01                cmp    $0x1,%esi
   3:   ba 01 00 00 00          mov    $0x1,%edx
   8:   7e 1d                   jle    27 <ipow1+0x27>
   a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  10:   40 f6 c6 01             test   $0x1,%sil
  14:   74 07                   je     1d <ipow1+0x1d>
  16:   48 63 c7                movslq %edi,%rax
  19:   48 0f af d0             imul   %rax,%rdx
  1d:   d1 fe                   sar    %esi
  1f:   0f af ff                imul   %edi,%edi
  22:   83 fe 01                cmp    $0x1,%esi
  25:   7f e9                   jg     10 <ipow1+0x10>
  27:   48 63 c7                movslq %edi,%rax
  2a:   48 0f af c2             imul   %rdx,%rax
  2e:   c3                      retq
  2f:   90                      nop

0000000000000030 <ipow2>:
  30:   85 f6                   test   %esi,%esi
  32:   b8 01 00 00 00          mov    $0x1,%eax
  37:   75 0a                   jne    43 <ipow2+0x13>
  39:   eb 19                   jmp    54 <ipow2+0x24>
  3b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  40:   0f af ff                imul   %edi,%edi
  43:   40 f6 c6 01             test   $0x1,%sil
  47:   74 07                   je     50 <ipow2+0x20>
  49:   48 63 d7                movslq %edi,%rdx
  4c:   48 0f af c2             imul   %rdx,%rax
  50:   d1 fe                   sar    %esi
  52:   75 ec                   jne    40 <ipow2+0x10>
  54:   f3 c3                   repz retq

隔离循环：

    while (exp > 1) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
    }


//if exp & 1 not true jump to 1d to skip
  10:   40 f6 c6 01             test   $0x1,%sil
  14:   74 07                   je     1d <ipow1+0x1d>
//result *= base
  16:   48 63 c7                movslq %edi,%rax
  19:   48 0f af d0             imul   %rax,%rdx
//exp>>=1
  1d:   d1 fe                   sar    %esi
//base *= base
  1f:   0f af ff                imul   %edi,%edi
//while(exp>1) stayin the loop
  22:   83 fe 01                cmp    $0x1,%esi
  25:   7f e9                   jg     10 <ipow1+0x10>

比较零通常可以为您节省指令，您可以在这里看到

    while (exp) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
    }


//base *= base
  40:   0f af ff                imul   %edi,%edi
//if exp & 1 not true jump to skip
  43:   40 f6 c6 01             test   $0x1,%sil
  47:   74 07                   je     50 <ipow2+0x20>
//result *= base
  49:   48 63 d7                movslq %edi,%rdx
  4c:   48 0f af c2             imul   %rdx,%rax
//exp>>=1
  50:   d1 fe                   sar    %esi
//no need for a compare
  52:   75 ec                   jne    40 <ipow2+0x10>

您的计时方法将产生很多错误/混乱。根据循环的拍频和计时器的精度，您可以在一个中产生很多增益，而在另一个中产生很多损耗。此方法通常可以提供更好的准确性：

开始时间= ...
for（rep = bignumber; rep; rep--）
{
//正在测试的代码
...
}
结束时间= ...
总计=结束时间-开始时间；

当然，如果您在操作系统定时上运行此程序，则无论如何都会有相当大的错误。

另外，您还想对计时器变量使用易失性变量，这有助于编译器不重新安排执行顺序。（在那里看到了）。

如果我们从基乘的角度来看这个：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

unsigned int mults;

long ipow1(int base, int exp) {
    long result = 1;

    while (exp > 1) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
        mults++;
    }

    result *= base;

    return result;
}

long ipow2(int base, int exp) {
    long result = 1;

    while (exp) {
        if (exp & 1) result *= base;
        exp >>= 1;
        base *= base;
        mults++;
    }

    return result;
}


int main ( void )
{
    int i;
    int j;

    mults = 0;
        for (i = 0; i < 55; i++) {
            for (j = 0; j < 11; j++) {
                ipow1(i, j); // or ipow2
            }
        }
    printf("mults %u\n",mults);

    mults=0;

        for (i = 0; i < 55; i++) {
            for (j = 0; j < 11; j++) {
                ipow2(i, j); // or ipow2
            }
        }
    printf("mults %u\n",mults);

}

有

mults 1045
mults 1595

ipow2（）多50％。实际上，这不只是乘数，还在于您将循环执行50％以上。

ipow1（）在其他乘数上略有增加：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

unsigned int mults;

long ipow1(int base, int exp) {
    long result = 1;

    while (exp > 1) {
        if (exp & 1) mults++;
        exp >>= 1;
        base *= base;
    }
    mults++;

    return result;
}

long ipow2(int base, int exp) {
    long result = 1;

    while (exp) {
        if (exp & 1) mults++;
        exp >>= 1;
        base *= base;
    }

    return result;
}


int main ( void )
{
    int i;
    int j;

    mults = 0;
        for (i = 0; i < 55; i++) {
            for (j = 0; j < 11; j++) {
                ipow1(i, j); // or ipow2
            }
        }
    printf("mults %u\n",mults);

    mults=0;
        for (i = 0; i < 55; i++) {
            for (j = 0; j < 11; j++) {
                ipow2(i, j); // or ipow2
            }
        }
    printf("mults %u\n",mults);

}

ipow1（）执行结果* =基数比ipow2（）大（更多）次

mults 990
mults 935

一个long * int会使这些更昂贵。不足以弥补ipow2（）中循环的损失。

即使不反汇编，也要对编译器希望使用的操作/指令进行粗略的猜测。这里考虑的处理器通常不一定是x86，某些处理器将比其他处理器更好地运行此代码（从执行的指令数量出发，不计算所有其他因素）。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

unsigned int ops;

long ipow1(int base, int exp) {
    long result = 1;
    ops++; //result = immediate
    while (exp > 1) {
        ops++; // compare exp - 1
        ops++; // conditional jump
            //if (exp & 1)
        ops++; //exp&1
        ops++; //conditional jump
        if (exp & 1)
        {
            result *= base;
            ops++;
        }
        exp >>= 1;
        ops++;
        //ops+=?; //using a signed number can cost you this on some systems
        //always use unsigned unless you have a specific reason to use signed.
        //if this had been a short or char variable it might cost you even more
        //operations
        //if this needs to be signed it is what it is, just be aware of
        //the cost
        base *= base;
        ops++;
    }
    result *= base;
    ops++;
    return result;
}

long ipow2(int base, int exp) {
    long result = 1;
    ops++;
    while (exp) {
        //ops++; //cmp exp-0, often optimizes out;
        ops++; //conditional jump
        //if (exp & 1)
        ops++;
        ops++;
        if (exp & 1)
        {
            result *= base;
            ops++;
        }
        exp >>= 1;
        ops++;
        //ops+=?; //right shifting a signed number
        base *= base;
        ops++;
    }
    return result;
}



int main ( void )
{
    int i;
    int j;

    ops = 0;
        for (i = 0; i < 55; i++) {
            for (j = 0; j < 11; j++) {
                ipow1(i, j); // or ipow2
            }
        }
    printf("ops %u\n",ops);

    ops=0;
        for (i = 0; i < 55; i++) {
            for (j = 0; j < 11; j++) {
                ipow2(i, j); // or ipow2
            }
        }
    printf("ops %u\n",ops);

}

假设我计算了所有主要操作，并且没有不公平地给一个功能多于另一个功能：

ops 7865
ops 9515

使用此分析，ipow2慢21％。

我认为最大的杀手是循环中多出50％的时间。如果它取决于数据，那么您可能会在基准测试中找到使功能之间的差异大于或小于所看到的25％的输入。

关于c - 而(n> 1)比(n)快25％？，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/7452028/