我一直在阅读有关 4K 别名(4K aliasing)的资料:在英特尔 CPU 上,由于只比较地址的第 6 到 11 位而产生歧义,加载和存储会被误判为重叠。所以我正在尝试编写各种简单的测试(在 i7-3770k、Win7、64 位、VS2017 上)来专门触发这个问题,以确保我在实践中理解它。

我一直在尝试但未能表现出行为的第一个测试是:

// Micro-benchmark intended to provoke 4K aliasing between two arrays.
// For each offset in [0, OffsetCount) it times TestCount passes of
//     a[i] += b[j] * 3.142f,  j = (offset + i) % ValueCount
// and prints (elapsed / TestCount) next to the offset.
// Takes no parameters and returns nothing; results go to stdout only.
void Test4KAliasing1()
{
    typedef float Value;// Also tried with double


    const uint32_t ValueCount = 1024;   // elements in each array
    const uint32_t OffsetCount = 256;   // number of different b-offsets that get timed
    const uint32_t TestCount = 512;     // passes over the arrays per timed offset


    // Both buffers are requested with 4096-byte alignment, so a[k] and b[k]
    // sit at the same offset from a 4096-byte boundary.
    // NOTE(review): neither result is checked for NULL before use.
    Value* a = (Value*)_aligned_malloc(ValueCount * sizeof(Value), 4096);
    Value* b = (Value*)_aligned_malloc(ValueCount * sizeof(Value), 4096);


    // Seed both arrays with identical pseudo-random values.
    for (uint32_t i = 0; i < ValueCount; ++i)
        a[i] = b[i] = (Value)rand();

    for (uint32_t offset = 0; offset < OffsetCount; ++offset)
    {
        // StartCPUCycles/EndCPUCycles are defined outside this block;
        // presumably they return a CPU cycle counter -- confirm.
        uint64_t startTime = StartCPUCycles();


        for (uint32_t test = 0; test < TestCount; ++test)
        {
            for (uint32_t i = 0; i < ValueCount; ++i)
            {
                // Read index into b: i shifted by the current offset,
                // wrapped so it stays inside the array.
                uint32_t j = (offset + i) % ValueCount;


                // Load b[j], then read-modify-write a[i].
                a[i] += b[j] * 3.142f;
            }
        }


        uint64_t duration = EndCPUCycles() - startTime;


        // Report the elapsed count averaged over the TestCount passes.
        printf("time: %llu\toffset: %u ", duration / TestCount, offset);
        // a and b are passed as extra arguments that the "\n" format never
        // consumes -- presumably to keep the arrays referenced; confirm intent.
        printf("\n", a, b);
    }


    // Release in reverse order of allocation.
    _aligned_free(b);
    _aligned_free(a);
}

灵感来自:http://richardstartin.uk/the-much-aligned-garbage-collector/

所以我不太确定为什么从计时结果中看不出问题?我原本以为,由于乱序执行,循环各次迭代中先存储、后加载的访问会落在有歧义的地址上?

生成的程序集是:
000000013F2510E4  cpuid
000000013F2510E6  rdtsc
000000013F2510E8  shl         rdx,20h
000000013F2510EC  mov         r9d,200h
000000013F2510F2  or          rax,rdx
000000013F2510F5  mov         r10,rax
000000013F2510F8  nop         dword ptr [rax+rax]
000000013F251100  lea         ebx,[rsi+1]
000000013F251103  mov         r8d,80h
000000013F251109  lea         rdx,[r14+8]
000000013F25110D  nop         dword ptr [rax]
000000013F251110  mov         rax,rbx
000000013F251113  lea         ecx,[rbx-1]
000000013F251116  and         eax,3FFh
000000013F25111B  lea         rdx,[rdx+20h]
000000013F25111F  and         ecx,3FFh
000000013F251125  vmulss      xmm1,xmm6,dword ptr [rdi+rcx*4]
000000013F25112A  vaddss      xmm2,xmm1,dword ptr [rdx-28h]
000000013F25112F  vmovss      dword ptr [rdx-28h],xmm2
000000013F251134  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]
000000013F251139  vaddss      xmm2,xmm1,dword ptr [rdx-24h]
000000013F25113E  vmovss      dword ptr [rdx-24h],xmm2
000000013F251143  lea         eax,[rbx+1]
000000013F251146  and         eax,3FFh
000000013F25114B  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]
000000013F251150  vaddss      xmm2,xmm1,dword ptr [rdx-20h]
000000013F251155  vmovss      dword ptr [rdx-20h],xmm2
000000013F25115A  lea         eax,[rbx+2]
000000013F25115D  and         eax,3FFh
000000013F251162  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]
000000013F251167  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]
000000013F25116C  vmovss      dword ptr [rdx-1Ch],xmm2
000000013F251171  lea         eax,[rbx+3]
000000013F251174  and         eax,3FFh
000000013F251179  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]
000000013F25117E  vaddss      xmm2,xmm1,dword ptr [rdx-18h]
000000013F251183  vmovss      dword ptr [rdx-18h],xmm2
000000013F251188  lea         eax,[rbx+4]
000000013F25118B  and         eax,3FFh
000000013F251190  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]
000000013F251195  vaddss      xmm2,xmm1,dword ptr [rdx-14h]
000000013F25119A  vmovss      dword ptr [rdx-14h],xmm2
000000013F25119F  lea         eax,[rbx+5]
000000013F2511A2  and         eax,3FFh
000000013F2511A7  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]
000000013F2511AC  vaddss      xmm2,xmm1,dword ptr [rdx-10h]
000000013F2511B1  lea         eax,[rbx+6]
000000013F2511B4  add         ebx,8
000000013F2511B7  vmovss      dword ptr [rdx-10h],xmm2
000000013F2511BC  and         eax,3FFh
000000013F2511C1  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]
000000013F2511C6  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]
000000013F2511CB  vmovss      dword ptr [rdx-0Ch],xmm2
000000013F2511D0  sub         r8,1
000000013F2511D4  jne         Test4KAliasing1+0B0h (013F251110h)
000000013F2511DA  sub         r9,1
000000013F2511DE  jne         Test4KAliasing1+0A0h (013F251100h)
000000013F2511E4  rdtsc

同样在网络上,我看到了各种描述,说底部 12 位必须匹配才能发生这种混叠,而在其他地方只有 6 到 11 位?由于最低 6 位是缓存行中的字节索引,而且一切都是基于缓存行的,那么我会认为它只需要第 6 位到第 11 位来匹配?

编辑:

同样根据彼得的回答,我尝试过:
a[i] *= 1.234f;
b[j] += 4.321f;

这似乎没有显示问题并产生:
000000013F6C10E8  cpuid
000000013F6C10EA  rdtsc
000000013F6C10EC  shl         rdx,20h
000000013F6C10F0  mov         ebx,200h
000000013F6C10F5  or          rax,rdx
000000013F6C10F8  mov         r9,rax
000000013F6C10FB  nop         dword ptr [rax+rax]
000000013F6C1100  lea         edx,[rsi+1]
000000013F6C1103  mov         r8d,80h
000000013F6C1109  lea         rcx,[r14+8]
000000013F6C110D  nop         dword ptr [rax]
000000013F6C1110  vmulss      xmm1,xmm6,dword ptr [rcx-8]
000000013F6C1115  vmovss      dword ptr [rcx-8],xmm1
000000013F6C111A  lea         eax,[rdx-1]
000000013F6C111D  and         eax,3FFh
000000013F6C1122  lea         rcx,[rcx+20h]
000000013F6C1126  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C112B  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C1130  vmulss      xmm1,xmm6,dword ptr [rcx-24h]
000000013F6C1135  vmovss      dword ptr [rcx-24h],xmm1
000000013F6C113A  mov         rax,rdx
000000013F6C113D  and         eax,3FFh
000000013F6C1142  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C1147  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C114C  vmulss      xmm0,xmm6,dword ptr [rcx-20h]
000000013F6C1151  lea         eax,[rdx+1]
000000013F6C1154  and         eax,3FFh
000000013F6C1159  vmovss      dword ptr [rcx-20h],xmm0
000000013F6C115E  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C1163  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C1168  vmulss      xmm1,xmm6,dword ptr [rcx-1Ch]
000000013F6C116D  vmovss      dword ptr [rcx-1Ch],xmm1
000000013F6C1172  lea         eax,[rdx+2]
000000013F6C1175  and         eax,3FFh
000000013F6C117A  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C117F  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C1184  vmulss      xmm1,xmm6,dword ptr [rcx-18h]
000000013F6C1189  vmovss      dword ptr [rcx-18h],xmm1
000000013F6C118E  lea         eax,[rdx+3]
000000013F6C1191  and         eax,3FFh
000000013F6C1196  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C119B  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C11A0  vmulss      xmm1,xmm6,dword ptr [rcx-14h]
000000013F6C11A5  vmovss      dword ptr [rcx-14h],xmm1
000000013F6C11AA  lea         eax,[rdx+4]
000000013F6C11AD  and         eax,3FFh
000000013F6C11B2  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C11B7  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C11BC  vmulss      xmm1,xmm6,dword ptr [rcx-10h]
000000013F6C11C1  lea         eax,[rdx+5]
000000013F6C11C4  and         eax,3FFh
000000013F6C11C9  vmovss      dword ptr [rcx-10h],xmm1
000000013F6C11CE  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C11D3  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C11D8  vmulss      xmm1,xmm6,dword ptr [rcx-0Ch]
000000013F6C11DD  lea         eax,[rdx+6]
000000013F6C11E0  add         edx,8
000000013F6C11E3  and         eax,3FFh
000000013F6C11E8  vmovss      dword ptr [rcx-0Ch],xmm1
000000013F6C11ED  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]
000000013F6C11F2  vmovss      dword ptr [rdi+rax*4],xmm1
000000013F6C11F7  sub         r8,1
000000013F6C11FB  jne         Test4KAliasing1+0B0h (013F6C1110h)
000000013F6C1201  sub         rbx,1
000000013F6C1205  jne         Test4KAliasing1+0A0h (013F6C1100h)
000000013F6C120B  rdtsc

同样基于彼得提到的链接问题,我尝试了 3 个数组:
a[i] += b[j] + c[j];

这似乎也没有问题。生成的代码是:
000000013F5110F6  cpuid
000000013F5110F8  rdtsc
000000013F5110FA  shl         rdx,20h
000000013F5110FE  mov         r8d,200h
000000013F511104  or          rax,rdx
000000013F511107  mov         r10,rax
000000013F51110A  nop         word ptr [rax+rax]
000000013F511110  lea         ebx,[rbp+1]
000000013F511113  mov         r9d,100h
000000013F511119  lea         rdx,[r13+8]
000000013F51111D  nop         dword ptr [rax]
000000013F511120  mov         rax,rbx
000000013F511123  lea         ecx,[rbx-1]
000000013F511126  and         eax,7FFh
000000013F51112B  lea         rdx,[rdx+20h]
000000013F51112F  and         ecx,7FFh
000000013F511135  vmovss      xmm0,dword ptr [rsi+rcx*4]
000000013F51113A  vaddss      xmm1,xmm0,dword ptr [rdi+rcx*4]
000000013F51113F  vaddss      xmm2,xmm1,dword ptr [rdx-28h]
000000013F511144  vmovss      dword ptr [rdx-28h],xmm2
000000013F511149  vmovss      xmm0,dword ptr [rsi+rax*4]
000000013F51114E  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]
000000013F511153  vaddss      xmm2,xmm1,dword ptr [rdx-24h]
000000013F511158  vmovss      dword ptr [rdx-24h],xmm2
000000013F51115D  lea         eax,[rbx+1]
000000013F511160  and         eax,7FFh
000000013F511165  vmovss      xmm0,dword ptr [rsi+rax*4]
000000013F51116A  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]
000000013F51116F  vaddss      xmm2,xmm1,dword ptr [rdx-20h]
000000013F511174  vmovss      dword ptr [rdx-20h],xmm2
000000013F511179  lea         eax,[rbx+2]
000000013F51117C  and         eax,7FFh
000000013F511181  vmovss      xmm0,dword ptr [rsi+rax*4]
000000013F511186  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]
000000013F51118B  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]
000000013F511190  vmovss      dword ptr [rdx-1Ch],xmm2
000000013F511195  lea         eax,[rbx+3]
000000013F511198  and         eax,7FFh
000000013F51119D  vmovss      xmm0,dword ptr [rsi+rax*4]
000000013F5111A2  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]
000000013F5111A7  vaddss      xmm2,xmm1,dword ptr [rdx-18h]
000000013F5111AC  vmovss      dword ptr [rdx-18h],xmm2
000000013F5111B1  lea         eax,[rbx+4]
000000013F5111B4  and         eax,7FFh
000000013F5111B9  vmovss      xmm0,dword ptr [rsi+rax*4]
000000013F5111BE  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]
000000013F5111C3  vaddss      xmm2,xmm1,dword ptr [rdx-14h]
000000013F5111C8  vmovss      dword ptr [rdx-14h],xmm2
000000013F5111CD  lea         eax,[rbx+5]
000000013F5111D0  and         eax,7FFh
000000013F5111D5  vmovss      xmm0,dword ptr [rsi+rax*4]
000000013F5111DA  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]
000000013F5111DF  vaddss      xmm2,xmm1,dword ptr [rdx-10h]
000000013F5111E4  lea         eax,[rbx+6]
000000013F5111E7  add         ebx,8
000000013F5111EA  vmovss      dword ptr [rdx-10h],xmm2
000000013F5111EF  and         eax,7FFh
000000013F5111F4  vmovss      xmm0,dword ptr [rsi+rax*4]
000000013F5111F9  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]
000000013F5111FE  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]
000000013F511203  vmovss      dword ptr [rdx-0Ch],xmm2
000000013F511208  sub         r9,1
000000013F51120C  jne         Test4KAliasing2+0C0h (013F511120h)
000000013F511212  sub         r8,1
000000013F511216  jne         Test4KAliasing2+0B0h (013F511110h)
000000013F51121C  rdtsc

继彼得对他的回答的评论/更新之后,我尝试了:
a[i] *= 1.234f;
b[i] += 4.321f;

这同样没有显示出问题。注意:在大多数这些尝试中,我都让 j 相对于 i 的偏移量(j = i + 偏移量)从零开始变化,想看看(如果我能复现问题的话)什么偏移量可以缓解问题。(由于我的 x86 已经生疏了,我仍在研读这里的反汇编以了解地址生成。)
000000013F7D1104  cpuid
000000013F7D1106  rdtsc
000000013F7D1108  shl         rdx,20h
000000013F7D110C  or          rax,rdx
000000013F7D110F  mov         edx,200h
000000013F7D1114  mov         rbx,rax
000000013F7D1117  cmp         rsi,r15
000000013F7D111A  ja          Test4KAliasing1+130h (013F7D1190h)
000000013F7D111C  cmp         rbp,r14
000000013F7D111F  jb          Test4KAliasing1+130h (013F7D1190h)
000000013F7D1121  lea         rcx,[rsi+4]
000000013F7D1125  mov         eax,100h
000000013F7D112A  nop         word ptr [rax+rax]
000000013F7D1130  vmulss      xmm1,xmm6,dword ptr [rdi+rcx-4]
000000013F7D1136  vmovss      dword ptr [rdi+rcx-4],xmm1
000000013F7D113C  vaddss      xmm1,xmm7,dword ptr [rcx-4]
000000013F7D1141  vmovss      dword ptr [rcx-4],xmm1
000000013F7D1146  vmulss      xmm1,xmm6,dword ptr [rcx+rdi]
000000013F7D114B  vmovss      dword ptr [rcx+rdi],xmm1
000000013F7D1150  vaddss      xmm0,xmm7,dword ptr [rcx]
000000013F7D1154  vmovss      dword ptr [rcx],xmm0
000000013F7D1158  vmulss      xmm0,xmm6,dword ptr [rdi+rcx+4]
000000013F7D115E  vmovss      dword ptr [rdi+rcx+4],xmm0
000000013F7D1164  vaddss      xmm0,xmm7,dword ptr [rcx+4]
000000013F7D1169  vmovss      dword ptr [rcx+4],xmm0
000000013F7D116E  vmulss      xmm0,xmm6,dword ptr [rdi+rcx+8]
000000013F7D1174  vmovss      dword ptr [rdi+rcx+8],xmm0
000000013F7D117A  vaddss      xmm0,xmm7,dword ptr [rcx+8]
000000013F7D117F  vmovss      dword ptr [rcx+8],xmm0
000000013F7D1184  lea         rcx,[rcx+10h]
000000013F7D1188  sub         rax,1
000000013F7D118C  jne         Test4KAliasing1+0D0h (013F7D1130h)
000000013F7D118E  jmp         Test4KAliasing1+1AEh (013F7D120Eh)
000000013F7D1190  vmovups     xmm2,xmmword ptr [__xmm@3f9df3b63f9df3b63f9df3b63f9df3b6 (013F7EA0E0h)]
000000013F7D1198  vmovups     xmm3,xmmword ptr [__xmm@408a45a2408a45a2408a45a2408a45a2 (013F7EA0F0h)]
000000013F7D11A0  lea         rax,[rsi+10h]
000000013F7D11A4  mov         ecx,40h
000000013F7D11A9  nop         dword ptr [rax]
000000013F7D11B0  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax-10h]
000000013F7D11B6  vmovups     xmmword ptr [rdi+rax-10h],xmm1
000000013F7D11BC  vaddps      xmm1,xmm3,xmmword ptr [rax-10h]
000000013F7D11C1  vmovups     xmmword ptr [rax-10h],xmm1
000000013F7D11C6  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax]
000000013F7D11CB  vmovups     xmmword ptr [rdi+rax],xmm1
000000013F7D11D0  vaddps      xmm1,xmm3,xmmword ptr [rax]
000000013F7D11D4  vmovups     xmmword ptr [rax],xmm1
000000013F7D11D8  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax+10h]
000000013F7D11DE  vmovups     xmmword ptr [rdi+rax+10h],xmm1
000000013F7D11E4  vaddps      xmm1,xmm3,xmmword ptr [rax+10h]
000000013F7D11E9  vmovups     xmmword ptr [rax+10h],xmm1
000000013F7D11EE  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax+20h]
000000013F7D11F4  vmovups     xmmword ptr [rdi+rax+20h],xmm1
000000013F7D11FA  vaddps      xmm1,xmm3,xmmword ptr [rax+20h]
000000013F7D11FF  vmovups     xmmword ptr [rax+20h],xmm1
000000013F7D1204  lea         rax,[rax+40h]
000000013F7D1208  sub         rcx,1
000000013F7D120C  jne         Test4KAliasing1+150h (013F7D11B0h)
000000013F7D120E  sub         rdx,1
000000013F7D1212  jne         Test4KAliasing1+0B7h (013F7D1117h)
000000013F7D1218  rdtsc

典型的计时运行:
a[i] *= 1.234f;
b[i] += 4.321f;

是:
time: 715       offset: 0
time: 647       offset: 1
time: 641       offset: 2
time: 703       offset: 3
time: 658       offset: 4
time: 657       offset: 5
time: 656       offset: 6
time: 657       offset: 7
time: 658       offset: 8
time: 657       offset: 9
time: 658       offset: 10
time: 653       offset: 11
time: 658       offset: 12
time: 652       offset: 13
time: 658       offset: 14
time: 657       offset: 15
time: 658       offset: 16
time: 656       offset: 17
time: 659       offset: 18
time: 656       offset: 19
time: 656       offset: 20
time: 656       offset: 21
time: 663       offset: 22
time: 657       offset: 23
time: 657       offset: 24
time: 704       offset: 25
time: 714       offset: 26
time: 657       offset: 27
time: 658       offset: 28
time: 658       offset: 29
time: 656       offset: 30
time: 656       offset: 31
time: 657       offset: 32
time: 658       offset: 33
time: 658       offset: 34
time: 656       offset: 35
time: 658       offset: 36
time: 658       offset: 37
time: 658       offset: 38
time: 658       offset: 39
time: 660       offset: 40
time: 660       offset: 41
time: 664       offset: 42
time: 656       offset: 43
time: 656       offset: 44
time: 658       offset: 45
time: 656       offset: 46
time: 656       offset: 47
time: 713       offset: 48
time: 658       offset: 49
time: 663       offset: 50
time: 662       offset: 51
time: 665       offset: 52
time: 663       offset: 53
time: 665       offset: 54
time: 658       offset: 55
time: 658       offset: 56
time: 658       offset: 57
time: 656       offset: 58
time: 657       offset: 59
time: 658       offset: 60
time: 658       offset: 61
time: 656       offset: 62
time: 666       offset: 63
time: 656       offset: 64
time: 658       offset: 65
time: 656       offset: 66
time: 657       offset: 67
time: 658       offset: 68
time: 658       offset: 69
time: 652       offset: 70
time: 658       offset: 71
time: 657       offset: 72
time: 658       offset: 73
time: 658       offset: 74
time: 656       offset: 75
time: 658       offset: 76
time: 665       offset: 77
time: 657       offset: 78
time: 656       offset: 79
time: 656       offset: 80
time: 666       offset: 81
time: 656       offset: 82
time: 702       offset: 83
time: 640       offset: 84
time: 640       offset: 85
time: 657       offset: 86
time: 657       offset: 87
time: 658       offset: 88
time: 658       offset: 89
time: 656       offset: 90
time: 657       offset: 91
time: 657       offset: 92
time: 657       offset: 93
time: 658       offset: 94
time: 662       offset: 95
time: 658       offset: 96
time: 656       offset: 97
time: 657       offset: 98
time: 663       offset: 99
time: 660       offset: 100
time: 663       offset: 101
time: 657       offset: 102
time: 656       offset: 103
time: 664       offset: 104
time: 659       offset: 105
time: 659       offset: 106
time: 658       offset: 107
time: 774       offset: 108
time: 707       offset: 109
time: 710       offset: 110
time: 658       offset: 111
time: 657       offset: 112
time: 661       offset: 113
time: 658       offset: 114
time: 656       offset: 115
time: 658       offset: 116
time: 657       offset: 117
time: 658       offset: 118
time: 660       offset: 119
time: 666       offset: 120
time: 657       offset: 121
time: 658       offset: 122
time: 651       offset: 123
time: 658       offset: 124
time: 657       offset: 125
time: 657       offset: 126
time: 658       offset: 127
time: 656       offset: 128
time: 658       offset: 129
time: 656       offset: 130
time: 658       offset: 131
time: 645       offset: 132
time: 640       offset: 133
time: 640       offset: 134
time: 659       offset: 135
time: 664       offset: 136
time: 658       offset: 137
time: 662       offset: 138
time: 656       offset: 139
time: 658       offset: 140
time: 656       offset: 141
time: 658       offset: 142
time: 660       offset: 143
time: 658       offset: 144
time: 658       offset: 145
time: 656       offset: 146
time: 657       offset: 147
time: 664       offset: 148
time: 656       offset: 149
time: 656       offset: 150
time: 658       offset: 151
time: 656       offset: 152
time: 668       offset: 153
time: 656       offset: 154
time: 656       offset: 155
time: 656       offset: 156
time: 658       offset: 157
time: 656       offset: 158
time: 658       offset: 159
time: 660       offset: 160
time: 658       offset: 161
time: 658       offset: 162
time: 658       offset: 163
time: 658       offset: 164
time: 656       offset: 165
time: 686       offset: 166
time: 656       offset: 167
time: 656       offset: 168
time: 658       offset: 169
time: 656       offset: 170
time: 658       offset: 171
time: 656       offset: 172
time: 656       offset: 173
time: 656       offset: 174
time: 658       offset: 175
time: 656       offset: 176
time: 658       offset: 177
time: 658       offset: 178
time: 654       offset: 179
time: 639       offset: 180
time: 639       offset: 181
time: 639       offset: 182
time: 657       offset: 183
time: 641       offset: 184
time: 640       offset: 185
time: 640       offset: 186
time: 640       offset: 187
time: 640       offset: 188
time: 640       offset: 189
time: 640       offset: 190
time: 700       offset: 191
time: 715       offset: 192
time: 657       offset: 193
time: 657       offset: 194
time: 662       offset: 195
time: 703       offset: 196
time: 640       offset: 197
time: 639       offset: 198
time: 638       offset: 199
time: 640       offset: 200
time: 640       offset: 201
time: 640       offset: 202
time: 704       offset: 203
time: 638       offset: 204
time: 640       offset: 205
time: 639       offset: 206
time: 657       offset: 207
time: 658       offset: 208
time: 657       offset: 209
time: 659       offset: 210
time: 663       offset: 211
time: 658       offset: 212
time: 658       offset: 213
time: 657       offset: 214
time: 667       offset: 215
time: 657       offset: 216
time: 657       offset: 217
time: 658       offset: 218
time: 657       offset: 219
time: 656       offset: 220
time: 661       offset: 221
time: 651       offset: 222
time: 658       offset: 223
time: 658       offset: 224
time: 656       offset: 225
time: 658       offset: 226
time: 658       offset: 227
time: 672       offset: 228
time: 658       offset: 229
time: 656       offset: 230
time: 649       offset: 231
time: 665       offset: 232
time: 657       offset: 233
time: 652       offset: 234
time: 664       offset: 235
time: 656       offset: 236
time: 662       offset: 237
time: 658       offset: 238
time: 665       offset: 239
time: 658       offset: 240
time: 657       offset: 241
time: 656       offset: 242
time: 658       offset: 243
time: 657       offset: 244
time: 658       offset: 245
time: 658       offset: 246
time: 656       offset: 247
time: 658       offset: 248
time: 656       offset: 249
time: 658       offset: 250
time: 656       offset: 251
time: 665       offset: 252
time: 658       offset: 253
time: 656       offset: 254
time: 658       offset: 255

但是:我想我之前犯了一个错误,现在用下面的代码观察到了这个问题:
a[i] *= 1.234f;
b[j] += 4.321f;

现在一次典型的计时运行结果是:
time: 2794      offset: 0
time: 2737      offset: 1
time: 2655      offset: 2
time: 2748      offset: 3
time: 2605      offset: 4
time: 2730      offset: 5
time: 2665      offset: 6
time: 2703      offset: 7
time: 2571      offset: 8
time: 2558      offset: 9
time: 2213      offset: 10
time: 2200      offset: 11
time: 2325      offset: 12
time: 2200      offset: 13
time: 2200      offset: 14
time: 2264      offset: 15
time: 2264      offset: 16
time: 2355      offset: 17
time: 2348      offset: 18
time: 2262      offset: 19
time: 2260      offset: 20
time: 2262      offset: 21
time: 2260      offset: 22
time: 2490      offset: 23
time: 2261      offset: 24
time: 2260      offset: 25
time: 2255      offset: 26
time: 2261      offset: 27
time: 2263      offset: 28
time: 2260      offset: 29
time: 2260      offset: 30
time: 2262      offset: 31
time: 2264      offset: 32
time: 2355      offset: 33
time: 2266      offset: 34
time: 2270      offset: 35
time: 2260      offset: 36
time: 2268      offset: 37
time: 2260      offset: 38
time: 2260      offset: 39
time: 2262      offset: 40
time: 2260      offset: 41
time: 2259      offset: 42
time: 2260      offset: 43
time: 2260      offset: 44
time: 2255      offset: 45
time: 2260      offset: 46
time: 2265      offset: 47
time: 2263      offset: 48
time: 2355      offset: 49
time: 2293      offset: 50
time: 2204      offset: 51
time: 2323      offset: 52
time: 2200      offset: 53
time: 2200      offset: 54
time: 2460      offset: 55
time: 2200      offset: 56

随着偏移量变大而出现的大约 20% 的差异,会不会就是这个问题(4K 别名)造成的?

最佳答案

底部 12 位是位 [11 : 0] 。第 11 位是第 12 位,因为我们从 0 开始计数。

CPU 以字节粒度检测加载/存储别名,而不仅仅是看加载是否访问了与较早的存储相同的缓存行。存储到 array[1] 不会减慢 array[2] 的加载;否则对性能的影响会非常糟糕,因为循环遍历数组并依次对每个元素执行 RMW(读-改-写)是一种非常常见的模式。(这里指没有使用软件流水线、在存储位置之前提前加载若干元素的情况。)

所以我认为你在这里没有遇到问题,因为你只是在从 4k 页面内的相同偏移量加载后存储到一个位置。 如果你做了类似这个简单循环的事情(不需要额外的跨步或偏移到另一个额外的页面,不同页面中的两个数组就可以了。)

for (i = 0 ; i < limit ; i++) {
    a[i] *= 1.234;
    b[i] += 4.321;    // load from the same offset we just wrote, but in another page
}

并且编译器生成的 asm 在加载 b 之前先存储到 a,您就会遇到问题,因为 a 和 b 相对于 4k 页面具有相同的对齐方式。

(编译器可以在存储之前执行两次加载,如果它证明了 a != b ,或者在运行执行该操作的循环版本之前发出代码进行检查。或者使用自动矢量化和/或展开,如果它检查了矢量宽度的重叠乘以展开因子。)

这不是一个完美的例子,但让对 b 的加载依赖于对 a 的存储,至少应该会让乱序执行更难隐藏这部分延迟。

创建 4k 别名的另一种简单方法是做一次 memcpy:从 src = srcpage 复制到 dst = dstpage + 16,当然 srcpage 和 dstpage 都是页面对齐的。对 dst[i] 的存储相当于对 dstpage[i+16] 的存储(以字节为单位,而不是任何 C 元素大小),因此对 dst[i] 的存储会(按程序顺序)先于从 src[i+16] 的加载发生。当循环到达该 i 值时,该加载将被 4k 别名阻塞。

参见 L1 memory bandwidth: 50% drop in efficiency using addresses which differ by 4096+64 bytes 示例,@HadiBrais 对包括 IvyBridge(如 i7-3770k)在内的 CPU 进行了性能分析。

关于 performance - 了解英特尔 CPU 上的 4K 别名,我们在 Stack Overflow 上找到一个类似的问题:https://stackoverflow.com/questions/54415140/

10-11 20:38