我有两个逻辑上等效的函数:
long ipow1(int base, int exp) {
// HISTORICAL NOTE:
// This wasn't here in the original question, I edited it in,
if (exp == 0) return 1;
long result = 1;
while (exp > 1) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
}
return result * base;
}
long ipow2(int base, int exp) {
long result = 1;
while (exp) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
}
return result;
}
注意:
这些循环是等效的,因为在前一种情况下,我们返回
result * base
(处理在exp
为或已被简化为1
的情况下),但在第二种情况下,我们将返回result
。奇怪的是,
-O3
和-O0
ipow1
的结果因此都比ipow2
高出约25%。这怎么可能?我在Windows 7,x64,gcc 4.5.2上,并使用
gcc ipow.c -O0 -std=c99
进行编译。这是我的分析代码:
int main(int argc, char *argv[]) {
LARGE_INTEGER ticksPerSecond;
LARGE_INTEGER tick;
LARGE_INTEGER start_ticks, end_ticks, cputime;
double totaltime = 0;
int repetitions = 10000;
int rep = 0;
int nopti = 0;
for (rep = 0; rep < repetitions; rep++) {
if (!QueryPerformanceFrequency(&ticksPerSecond)) printf("\tno go QueryPerformance not present");
if (!QueryPerformanceCounter(&tick)) printf("no go counter not installed");
QueryPerformanceCounter(&start_ticks);
/* start real code */
for (int i = 0; i < 55; i++) {
for (int j = 0; j < 11; j++) {
nopti = ipow1(i, j); // or ipow2
}
}
/* end code */
QueryPerformanceCounter(&end_ticks);
cputime.QuadPart = end_ticks.QuadPart - start_ticks.QuadPart;
totaltime += (double)cputime.QuadPart / (double)ticksPerSecond.QuadPart;
}
printf("\tTotal elapsed CPU time: %.9f sec with %d repetitions - %ld:\n", totaltime, repetitions, nopti);
return 0;
}
最佳答案
如果您不想阅读所有这些跳到底部的内容,仅通过分析代码,我就会得出21%的差异。
不同的系统,不同的编译器版本,由不同的人/发行者构建的相同的编译器版本将给出不同的指令组合,这只是您可能会得到的一个示例。
long ipow1(int base, int exp) {
long result = 1;
while (exp > 1) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
}
return result * base;
}
long ipow2(int base, int exp) {
long result = 1;
while (exp) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
}
return result;
}
0000000000000000 <ipow1>:
0: 83 fe 01 cmp $0x1,%esi
3: ba 01 00 00 00 mov $0x1,%edx
8: 7e 1d jle 27 <ipow1+0x27>
a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
10: 40 f6 c6 01 test $0x1,%sil
14: 74 07 je 1d <ipow1+0x1d>
16: 48 63 c7 movslq %edi,%rax
19: 48 0f af d0 imul %rax,%rdx
1d: d1 fe sar %esi
1f: 0f af ff imul %edi,%edi
22: 83 fe 01 cmp $0x1,%esi
25: 7f e9 jg 10 <ipow1+0x10>
27: 48 63 c7 movslq %edi,%rax
2a: 48 0f af c2 imul %rdx,%rax
2e: c3 retq
2f: 90 nop
0000000000000030 <ipow2>:
30: 85 f6 test %esi,%esi
32: b8 01 00 00 00 mov $0x1,%eax
37: 75 0a jne 43 <ipow2+0x13>
39: eb 19 jmp 54 <ipow2+0x24>
3b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
40: 0f af ff imul %edi,%edi
43: 40 f6 c6 01 test $0x1,%sil
47: 74 07 je 50 <ipow2+0x20>
49: 48 63 d7 movslq %edi,%rdx
4c: 48 0f af c2 imul %rdx,%rax
50: d1 fe sar %esi
52: 75 ec jne 40 <ipow2+0x10>
54: f3 c3 repz retq
隔离循环:
while (exp > 1) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
}
//if exp & 1 not true jump to 1d to skip
10: 40 f6 c6 01 test $0x1,%sil
14: 74 07 je 1d <ipow1+0x1d>
//result *= base
16: 48 63 c7 movslq %edi,%rax
19: 48 0f af d0 imul %rax,%rdx
//exp>>=1
1d: d1 fe sar %esi
//base *= base
1f: 0f af ff imul %edi,%edi
//while(exp>1) stayin the loop
22: 83 fe 01 cmp $0x1,%esi
25: 7f e9 jg 10 <ipow1+0x10>
比较零通常可以为您节省指令,您可以在这里看到
while (exp) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
}
//base *= base
40: 0f af ff imul %edi,%edi
//if exp & 1 not true jump to skip
43: 40 f6 c6 01 test $0x1,%sil
47: 74 07 je 50 <ipow2+0x20>
//result *= base
49: 48 63 d7 movslq %edi,%rdx
4c: 48 0f af c2 imul %rdx,%rax
//exp>>=1
50: d1 fe sar %esi
//no need for a compare
52: 75 ec jne 40 <ipow2+0x10>
您的计时方法将产生很多错误/混乱。根据循环的拍频和计时器的精度,您可以在一个中产生很多增益,而在另一个中产生很多损耗。此方法通常可以提供更好的准确性:
开始时间= ...
for(rep = bignumber; rep; rep--)
{
//正在测试的代码
...
}
结束时间= ...
总计=结束时间-开始时间;
当然,如果您在操作系统定时上运行此程序,则无论如何都会有相当大的错误。
另外,您还想对计时器变量使用易失性变量,这有助于编译器不重新安排执行顺序。 (在那里看到了)。
如果我们从基乘的角度来看这个:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
unsigned int mults;
long ipow1(int base, int exp) {
long result = 1;
while (exp > 1) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
mults++;
}
result *= base;
return result;
}
long ipow2(int base, int exp) {
long result = 1;
while (exp) {
if (exp & 1) result *= base;
exp >>= 1;
base *= base;
mults++;
}
return result;
}
int main ( void )
{
int i;
int j;
mults = 0;
for (i = 0; i < 55; i++) {
for (j = 0; j < 11; j++) {
ipow1(i, j); // or ipow2
}
}
printf("mults %u\n",mults);
mults=0;
for (i = 0; i < 55; i++) {
for (j = 0; j < 11; j++) {
ipow2(i, j); // or ipow2
}
}
printf("mults %u\n",mults);
}
有
mults 1045
mults 1595
ipow2()多50%。实际上,这不只是乘数,还在于您将循环执行50%以上。
ipow1()在其他乘数上略有增加:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
unsigned int mults;
long ipow1(int base, int exp) {
long result = 1;
while (exp > 1) {
if (exp & 1) mults++;
exp >>= 1;
base *= base;
}
mults++;
return result;
}
long ipow2(int base, int exp) {
long result = 1;
while (exp) {
if (exp & 1) mults++;
exp >>= 1;
base *= base;
}
return result;
}
int main ( void )
{
int i;
int j;
mults = 0;
for (i = 0; i < 55; i++) {
for (j = 0; j < 11; j++) {
ipow1(i, j); // or ipow2
}
}
printf("mults %u\n",mults);
mults=0;
for (i = 0; i < 55; i++) {
for (j = 0; j < 11; j++) {
ipow2(i, j); // or ipow2
}
}
printf("mults %u\n",mults);
}
ipow1()执行结果* =基数比ipow2()大(更多)次
mults 990
mults 935
一个long * int会使这些更昂贵。不足以弥补ipow2()中循环的损失。
即使不反汇编,也要对编译器希望使用的操作/指令进行粗略的猜测。这里考虑的处理器通常不一定是x86,某些处理器将比其他处理器更好地运行此代码(从执行的指令数量出发,不计算所有其他因素)。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
unsigned int ops;
long ipow1(int base, int exp) {
long result = 1;
ops++; //result = immediate
while (exp > 1) {
ops++; // compare exp - 1
ops++; // conditional jump
//if (exp & 1)
ops++; //exp&1
ops++; //conditional jump
if (exp & 1)
{
result *= base;
ops++;
}
exp >>= 1;
ops++;
//ops+=?; //using a signed number can cost you this on some systems
//always use unsigned unless you have a specific reason to use signed.
//if this had been a short or char variable it might cost you even more
//operations
//if this needs to be signed it is what it is, just be aware of
//the cost
base *= base;
ops++;
}
result *= base;
ops++;
return result;
}
long ipow2(int base, int exp) {
long result = 1;
ops++;
while (exp) {
//ops++; //cmp exp-0, often optimizes out;
ops++; //conditional jump
//if (exp & 1)
ops++;
ops++;
if (exp & 1)
{
result *= base;
ops++;
}
exp >>= 1;
ops++;
//ops+=?; //right shifting a signed number
base *= base;
ops++;
}
return result;
}
int main ( void )
{
int i;
int j;
ops = 0;
for (i = 0; i < 55; i++) {
for (j = 0; j < 11; j++) {
ipow1(i, j); // or ipow2
}
}
printf("ops %u\n",ops);
ops=0;
for (i = 0; i < 55; i++) {
for (j = 0; j < 11; j++) {
ipow2(i, j); // or ipow2
}
}
printf("ops %u\n",ops);
}
假设我计算了所有主要操作,并且没有不公平地给一个功能多于另一个功能:
ops 7865
ops 9515
使用此分析,ipow2慢21%。
我认为最大的杀手是循环中多出50%的时间。如果它取决于数据,那么您可能会在基准测试中找到使功能之间的差异大于或小于所看到的25%的输入。
关于c - 而(n> 1)比(n)快25%?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/7452028/