我是SSIM.js和jest-image-snapshot的活跃作者和维护者。当前,我正在努力优化图像处理实现,以利用WebAssembly改善性能。
现在,我注意到,无论是LLVM汇编(WebAssembly文本?)输出,还是Node.js的实际汇编输出(--print-wasm-code),生成的代码都添加了不必要的指令。尤其奇怪的是加载常量的方式。例如,在下面的三段内容中,请查看名为multiplier的数组或常量rounder。在GCC上,multiplier会存储在汇编的.rodata节中,只需一次加载(或转换为整数)即可,而rounder会用movd或movq内联。而这里似乎是在循环的每一轮中逐个插入这些值。它还用vpblendw做了一些我完全不理解的事情。
我该如何解决?
alignas(64) const static uint16_t multiplierArray[8]= {77,150,29,1,77,160,29,1};
extern "C"
int rgba2y(void* inputDataBuffer, ptrdiff_t length) {
typedef __u8x16 v8x16;
typedef __u16x8 v16x8;
v8x16* pInputPtr = (v8x16*) inputDataBuffer;
v8x16* pInputPtrEnd = (v8x16*)((uint8_t*)inputDataBuffer + length);
v8x16* pOutputPtr = (v8x16*) inputDataBuffer;
__m128i rounder = _mm_cvtsi32_si128(0x80808080);
v8x16 zero;
zero ^= zero;
__m128i multiplier = *((__m128i*)multiplierArray);
// v16x8 multiplier = wasm_i64x2_splat(0x1001d0096004d);
unsigned i = 0;
for (; (i+4)*sizeof(__m128i)<= length; i+= 4) {
v8x16 iv0 = wasm_v8x16_shuffle(pInputPtr[i/4],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv1 = wasm_v8x16_shuffle(pInputPtr[i/4+1],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv2 = wasm_v8x16_shuffle(pInputPtr[i/4+2],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv3 = wasm_v8x16_shuffle(pInputPtr[i/4+3],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
// rg ba rg ba rg ba rg ba rg ba rg ba rg ba
__m128i rg0 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv0, (__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv0,(__m128i)zero),(__m128i)multiplier));
__m128i rg1 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier));
__m128i rg2 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier));
__m128i rg3 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier));
// rgba rgba rgba rgba rgba rgba rgba rgba
__m128i rgba0 = wasm_u16x8_shr(_mm_hadd_epi16(rg0,rg1), 8);
__m128i rgba1 = wasm_u16x8_shr(_mm_hadd_epi16(rg2,rg3), 8);
pOutputPtr[i/4] = wasm_u8x16_narrow_i16x8(rgba0,rgba1);
}
// abbreviated...
return 0;
}
LLVM汇编输出如下:
.section .text.rgba2y,"",@
.hidden rgba2y # -- Begin function rgba2y
.globl rgba2y
.type rgba2y,@function
rgba2y: # @rgba2y
.Lfunc_begin0:
.loc 2 56 0 # rgb2y-sample.cpp:56:0
.functype rgba2y (i32, i32) -> (i32)
.local i32, i32, v128, v128, v128, v128, v128, v128
# %bb.0: # %entry
#DEBUG_VALUE: rgba2y:length <- %4
#DEBUG_VALUE: rgba2y:pInputPtrEnd <- undef
#DEBUG_VALUE: rgba2y:i <- 0
#DEBUG_VALUE: rgba2y:inputDataBuffer <- %3
#DEBUG_VALUE: rgba2y:pInputPtr <- %3
#DEBUG_VALUE: rgba2y:pOutputPtr <- %3
#DEBUG_VALUE: rgba2y:rounder <- undef
#DEBUG_VALUE: rgba2y:zero <- undef
#DEBUG_VALUE: rgba2y:multiplier <- undef
block
.Ltmp0:
.loc 2 68 30 prologue_end # rgb2y-sample.cpp:68:30
local.get 1
i32.const 64
i32.lt_u
.Ltmp1:
.loc 2 68 2 is_stmt 0 # rgb2y-sample.cpp:68:2
br_if 0 # 0: down to label0
.Ltmp2:
# %bb.1:
.loc 2 0 2 # rgb2y-sample.cpp:0:2
i32.const 0
local.set 2
i32.const 4
local.set 3
.LBB0_2: # %for.body
# =>This Inner Loop Header: Depth=1
loop # label1:
.Ltmp3:
#DEBUG_VALUE: rgba2y:i <- %101
#DEBUG_VALUE: rgba0 <- undef
#DEBUG_VALUE: rgba1 <- undef
.loc 2 69 15 is_stmt 1 # rgb2y-sample.cpp:69:15
local.get 0
local.get 2
i32.const 2
i32.shl
i32.add
local.tee 2
local.get 2
v128.load 0
i32.const 0
i8x16.splat
local.tee 4
i32.const -128
i8x16.replace_lane 0
i32.const -128
i8x16.replace_lane 1
i32.const -128
i8x16.replace_lane 2
i32.const -128
i8x16.replace_lane 3
local.tee 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp4:
.loc 2 74 48 # rgb2y-sample.cpp:74:48
local.tee 6
.Ltmp5:
#DEBUG_VALUE: iv0 <- undef
#DEBUG_VALUE: iv0 <- %153
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
i32.const 77
.loc 2 74 32 is_stmt 0 # rgb2y-sample.cpp:74:32
i16x8.splat
i32.const 150
i16x8.replace_lane 1
i32.const 29
i16x8.replace_lane 2
i32.const 1
i16x8.replace_lane 3
i32.const 160
i16x8.replace_lane 5
i32.const 29
i16x8.replace_lane 6
i32.const 1
i16x8.replace_lane 7
local.tee 7
i16x8.mul
.loc 2 74 133 # rgb2y-sample.cpp:74:133
local.tee 8
local.get 6
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 74 117 # rgb2y-sample.cpp:74:117
local.get 7
i16x8.mul
.loc 2 74 17 # rgb2y-sample.cpp:74:17
local.tee 6
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 8
local.get 6
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp6:
.loc 2 0 17 # rgb2y-sample.cpp:0:17
local.tee 6
.Ltmp7:
#DEBUG_VALUE: rg0 <- undef
#DEBUG_VALUE: rg0 <- %153
.loc 2 70 15 is_stmt 1 # rgb2y-sample.cpp:70:15
local.get 2
i32.const 16
i32.add
v128.load 0
local.get 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp8:
.loc 2 75 62 # rgb2y-sample.cpp:75:62
local.tee 8
.Ltmp9:
#DEBUG_VALUE: iv1 <- undef
#DEBUG_VALUE: iv1 <- %157
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc 2 75 46 is_stmt 0 # rgb2y-sample.cpp:75:46
local.get 7
i16x8.mul
.loc 2 75 146 # rgb2y-sample.cpp:75:146
local.tee 9
local.get 8
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 75 130 # rgb2y-sample.cpp:75:130
local.get 7
i16x8.mul
.loc 2 75 31 # rgb2y-sample.cpp:75:31
local.tee 8
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 9
local.get 8
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp10:
.loc 2 79 33 is_stmt 1 # rgb2y-sample.cpp:79:33
local.tee 8
.Ltmp11:
#DEBUG_VALUE: rg1 <- undef
#DEBUG_VALUE: rg1 <- %157
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 6
local.get 8
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
i32.const 8
.loc 2 79 18 is_stmt 0 # rgb2y-sample.cpp:79:18
i16x8.shr_u
.loc 2 71 15 is_stmt 1 # rgb2y-sample.cpp:71:15
local.get 2
i32.const 32
i32.add
v128.load 0
local.get 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp12:
.loc 2 76 62 # rgb2y-sample.cpp:76:62
local.tee 6
.Ltmp13:
#DEBUG_VALUE: iv2 <- undef
#DEBUG_VALUE: iv2 <- %153
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc 2 76 46 is_stmt 0 # rgb2y-sample.cpp:76:46
local.get 7
i16x8.mul
.loc 2 76 146 # rgb2y-sample.cpp:76:146
local.tee 8
local.get 6
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 76 130 # rgb2y-sample.cpp:76:130
local.get 7
i16x8.mul
.loc 2 76 31 # rgb2y-sample.cpp:76:31
local.tee 6
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 8
local.get 6
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp14:
.loc 2 0 31 # rgb2y-sample.cpp:0:31
local.tee 6
.Ltmp15:
#DEBUG_VALUE: rg2 <- undef
#DEBUG_VALUE: rg2 <- %153
.loc 2 72 15 is_stmt 1 # rgb2y-sample.cpp:72:15
local.get 2
i32.const 48
i32.add
v128.load 0
local.get 5
v8x16.shuffle 0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp16:
.loc 2 77 62 # rgb2y-sample.cpp:77:62
local.tee 5
.Ltmp17:
#DEBUG_VALUE: iv3 <- undef
#DEBUG_VALUE: iv3 <- %98
local.get 4
v8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc 2 77 46 is_stmt 0 # rgb2y-sample.cpp:77:46
local.get 7
i16x8.mul
.loc 2 77 146 # rgb2y-sample.cpp:77:146
local.tee 8
local.get 5
local.get 4
v8x16.shuffle 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc 2 77 130 # rgb2y-sample.cpp:77:130
local.get 7
i16x8.mul
.loc 2 77 31 # rgb2y-sample.cpp:77:31
local.tee 4
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 8
local.get 4
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp18:
.loc 2 80 33 is_stmt 1 # rgb2y-sample.cpp:80:33
local.tee 4
.Ltmp19:
#DEBUG_VALUE: rg3 <- undef
#DEBUG_VALUE: rg3 <- %93
v8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get 6
local.get 4
v8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
i32.const 8
.loc 2 80 18 is_stmt 0 # rgb2y-sample.cpp:80:18
i16x8.shr_u
.loc 2 81 21 is_stmt 1 # rgb2y-sample.cpp:81:21
i8x16.narrow_i16x8_u
.loc 2 81 19 is_stmt 0 # rgb2y-sample.cpp:81:19
v128.store 0
.Ltmp20:
#DEBUG_VALUE: rgba2y:i <- %170
.loc 2 0 19 # rgb2y-sample.cpp:0:19
local.get 3
local.tee 3
local.set 2
.Ltmp21:
.loc 2 68 11 is_stmt 1 # rgb2y-sample.cpp:68:11
local.get 3
i32.const 4
i32.add
local.tee 3
i32.const 4
.loc 2 68 14 is_stmt 0 # rgb2y-sample.cpp:68:14
i32.shl
.loc 2 68 30 # rgb2y-sample.cpp:68:30
local.get 1
i32.le_u
.Ltmp22:
.loc 2 68 2 # rgb2y-sample.cpp:68:2
br_if 0 # 0: up to label1
.Ltmp23:
.LBB0_3: # %for.end
end_loop
end_block # label0:
i32.const 0
.Ltmp24:
.loc 2 84 2 is_stmt 1 # rgb2y-sample.cpp:84:2
# fallthrough-return
end_function
.Ltmp25:
.Lfunc_end0:
.size rgba2y, .Lfunc_end0-rgba2y
# -- End function
Node.js(--print-wasm-code)输出的实际汇编如下:
--- WebAssembly code ---
index: 2
kind: wasm function
compiler: TurboFan
Body (size = 1088 = 1086 + 2 padding)
Instructions (size = 1064)
0xa5976359180 0 55 push rbp
0xa5976359181 1 4889e5 REX.W movq rbp,rsp
0xa5976359184 4 6a0a push 0xa
0xa5976359186 6 56 push rsi
0xa5976359187 7 4883ec58 REX.W subq rsp,0x58
0xa597635918b b 488b5e17 REX.W movq rbx,[rsi+0x17]
0xa597635918f f 83fa40 cmpl rdx,0x40
0xa5976359192 12 0f8307000000 jnc 0xa597635919f <+0x1f>
0xa5976359198 18 33c9 xorl rcx,rcx
0xa597635919a 1a e990030000 jmp 0xa597635952f <+0x3af>
0xa597635919f 1f b94d000000 movl rcx,0x4d
0xa59763591a4 24 c5f96ec1 vmovd xmm0,rcx
0xa59763591a8 28 c5fb70c000 vpshuflw xmm0,xmm0,0x0
0xa59763591ad 2d c5f970c000 vpshufd xmm0,xmm0,0x0
0xa59763591b2 32 33c9 xorl rcx,rcx
0xa59763591b4 34 c5f96ec9 vmovd xmm1,rcx
0xa59763591b8 38 c4410057ff vxorps xmm15,xmm15,xmm15
0xa59763591bd 3d c4c27100cf vpshufb xmm1,xmm1,xmm15
0xa59763591c2 42 bf96000000 movl rdi,0x96
0xa59763591c7 47 c5f9c4c701 vpinsrw xmm0,xmm0,rdi,0x1
0xa59763591cc 4c bf80ffffff movl rdi,0xffffff80
0xa59763591d1 51 c5f928d1 vmovapd xmm2,xmm1
0xa59763591d5 55 c4e36920d700 vpinsrb xmm2,xmm2,dil,0x0
0xa59763591db 5b 41b81d000000 movl r8,0x1d
0xa59763591e1 61 c4c179c4c002 vpinsrw xmm0,xmm0,r8,0x2
0xa59763591e7 67 c4e36920d701 vpinsrb xmm2,xmm2,dil,0x1
0xa59763591ed 6d 41b901000000 movl r9,0x1
0xa59763591f3 73 c4c179c4c103 vpinsrw xmm0,xmm0,r9,0x3
0xa59763591f9 79 c4e36920d702 vpinsrb xmm2,xmm2,dil,0x2
0xa59763591ff 7f 41bba0000000 movl r11,0xa0
0xa5976359205 85 c4c179c4c305 vpinsrw xmm0,xmm0,r11,0x5
0xa597635920b 8b c4e36920d703 vpinsrb xmm2,xmm2,dil,0x3
0xa5976359211 91 c4c179c4c006 vpinsrw xmm0,xmm0,r8,0x6
0xa5976359217 97 c4c179c4c107 vpinsrw xmm0,xmm0,r9,0x7
0xa597635921d 9d 488bf9 REX.W movq rdi,rcx
0xa5976359220 a0 41b804000000 movl r8,0x4
0xa5976359226 a6 e90b000000 jmp 0xa5976359236 <+0xb6>
0xa597635922b ab 0f1f440000 nop
0xa5976359230 b0 498bf8 REX.W movq rdi,r8
0xa5976359233 b3 4d8bc1 REX.W movq r8,r9
0xa5976359236 b6 4c8b4e2f REX.W movq r9,[rsi+0x2f]
0xa597635923a ba 493b21 REX.W cmpq rsp,[r9]
0xa597635923d bd 0f86f4020000 jna 0xa5976359537 <+0x3b7>
0xa5976359243 c3 458d4804 leal r9,[r8+0x4]
0xa5976359247 c7 4d8bd9 REX.W movq r11,r9
0xa597635924a ca 41c1e304 shll r11, 4
0xa597635924e ce 8d3cb8 leal rdi,[rax+rdi*4]
0xa5976359251 d1 c5fa6f1c3b vmovdqu xmm3,[rbx+rdi*1]
0xa5976359256 d6 c5fa6f641f10 vmovdqu xmm4,[rdi+rbx*1+0x10]
0xa597635925c dc c5fa6f6c1f20 vmovdqu xmm5,[rdi+rbx*1+0x20]
0xa5976359262 e2 c5fa6f741f30 vmovdqu xmm6,[rdi+rbx*1+0x30]
0xa5976359268 e8 c57810fe vmovups xmm15,xmm6
0xa597635926c ec 49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xa5976359276 f6 c441f96ec2 vmovq xmm8,r10
0xa597635927b fb 49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xa5976359285 105 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa597635928b 10b c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa5976359290 110 0f10fa movups xmm7,xmm2
0xa5976359293 113 49ba8080800080808000 REX.W movq r10,0x80808000808080
0xa597635929d 11d c441f96ec2 vmovq xmm8,r10
0xa59763592a2 122 4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xa59763592a9 129 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa59763592af 12f c4c24100f8 vpshufb xmm7,xmm7,xmm8
0xa59763592b4 134 c4c141ebff vpor xmm7,xmm7,xmm15
0xa59763592b9 139 c57810fd vmovups xmm15,xmm5
0xa59763592bd 13d 4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xa59763592c4 144 c441f96ec2 vmovq xmm8,r10
0xa59763592c9 149 4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xa59763592d0 150 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa59763592d6 156 c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa59763592db 15b 0f10f2 movups xmm6,xmm2
0xa59763592de 15e 4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xa59763592e5 165 c441f96ec2 vmovq xmm8,r10
0xa59763592ea 16a 4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xa59763592f1 171 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa59763592f7 177 c4c24900f0 vpshufb xmm6,xmm6,xmm8
0xa59763592fc 17c c4c149ebf7 vpor xmm6,xmm6,xmm15
0xa5976359301 181 c57810fc vmovups xmm15,xmm4
0xa5976359305 185 4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xa597635930c 18c c441f96ec2 vmovq xmm8,r10
0xa5976359311 191 4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xa5976359318 198 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa597635931e 19e c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa5976359323 1a3 0f10ea movups xmm5,xmm2
0xa5976359326 1a6 4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xa597635932d 1ad c441f96ec2 vmovq xmm8,r10
0xa5976359332 1b2 4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xa5976359339 1b9 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa597635933f 1bf c4c25100e8 vpshufb xmm5,xmm5,xmm8
0xa5976359344 1c4 c4c151ebef vpor xmm5,xmm5,xmm15
0xa5976359349 1c9 c57810fb vmovups xmm15,xmm3
0xa597635934d 1cd 4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xa5976359354 1d4 c441f96ec2 vmovq xmm8,r10
0xa5976359359 1d9 4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xa5976359360 1e0 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa5976359366 1e6 c4420100f8 vpshufb xmm15,xmm15,xmm8
0xa597635936b 1eb 0f10e2 movups xmm4,xmm2
0xa597635936e 1ee 4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xa5976359375 1f5 c441f96ec2 vmovq xmm8,r10
0xa597635937a 1fa 4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xa5976359381 201 c443b922c201 vpinsrq xmm8,xmm8,r10,0x1
0xa5976359387 207 c4c25900e0 vpshufb xmm4,xmm4,xmm8
0xa597635938c 20c c4c159ebe7 vpor xmm4,xmm4,xmm15
0xa5976359391 211 c5f928df vmovapd xmm3,xmm7
0xa5976359395 215 c5e168d9 vpunpckhbw xmm3,xmm3,xmm1
0xa5976359399 219 c5c160f9 vpunpcklbw xmm7,xmm7,xmm1
0xa597635939d 21d c57928c6 vmovapd xmm8,xmm6
0xa59763593a1 221 c53968c1 vpunpckhbw xmm8,xmm8,xmm1
0xa59763593a5 225 c5c960f1 vpunpcklbw xmm6,xmm6,xmm1
0xa59763593a9 229 c57928cd vmovapd xmm9,xmm5
0xa59763593ad 22d c53168c9 vpunpckhbw xmm9,xmm9,xmm1
0xa59763593b1 231 c5d160e9 vpunpcklbw xmm5,xmm5,xmm1
0xa59763593b5 235 c57928d4 vmovapd xmm10,xmm4
0xa59763593b9 239 c52968d1 vpunpckhbw xmm10,xmm10,xmm1
0xa59763593bd 23d c5d960e1 vpunpcklbw xmm4,xmm4,xmm1
0xa59763593c1 241 c5e1d5d8 vpmullw xmm3,xmm3,xmm0
0xa59763593c5 245 c5c1d5f8 vpmullw xmm7,xmm7,xmm0
0xa59763593c9 249 c539d5c0 vpmullw xmm8,xmm8,xmm0
0xa59763593cd 24d c5c9d5f0 vpmullw xmm6,xmm6,xmm0
0xa59763593d1 251 c531d5c8 vpmullw xmm9,xmm9,xmm0
0xa59763593d5 255 c5d1d5e8 vpmullw xmm5,xmm5,xmm0
0xa59763593d9 259 c529d5d0 vpmullw xmm10,xmm10,xmm0
0xa59763593dd 25d c5d9d5e0 vpmullw xmm4,xmm4,xmm0
0xa59763593e1 261 c57928df vmovapd xmm11,xmm7
0xa59763593e5 265 c44101efff vpxor xmm15,xmm15,xmm15
0xa59763593ea 26a c463010efb55 vpblendw xmm15,xmm15,xmm3,0x55
0xa59763593f0 270 c443210edfaa vpblendw xmm11,xmm11,xmm15,0xaa
0xa59763593f6 276 c442212bdf vpackusdw xmm11,xmm11,xmm15
0xa59763593fb 27b c57810fb vmovups xmm15,xmm3
0xa59763593ff 27f c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359405 285 c5c172d710 vpsrld xmm7,xmm7,16
0xa597635940a 28a c4c2412bff vpackusdw xmm7,xmm7,xmm15
0xa597635940f 28f c5f928de vmovapd xmm3,xmm6
0xa5976359413 293 c44101efff vpxor xmm15,xmm15,xmm15
0xa5976359418 298 c443010ef855 vpblendw xmm15,xmm15,xmm8,0x55
0xa597635941e 29e c4c3610edfaa vpblendw xmm3,xmm3,xmm15,0xaa
0xa5976359424 2a4 c4c2612bdf vpackusdw xmm3,xmm3,xmm15
0xa5976359429 2a9 c4417810f8 vmovups xmm15,xmm8
0xa597635942e 2ae c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359434 2b4 c5c972d610 vpsrld xmm6,xmm6,16
0xa5976359439 2b9 c4c2492bf7 vpackusdw xmm6,xmm6,xmm15
0xa597635943e 2be c57928c5 vmovapd xmm8,xmm5
0xa5976359442 2c2 c44101efff vpxor xmm15,xmm15,xmm15
0xa5976359447 2c7 c443010ef955 vpblendw xmm15,xmm15,xmm9,0x55
0xa597635944d 2cd c443390ec7aa vpblendw xmm8,xmm8,xmm15,0xaa
0xa5976359453 2d3 c442392bc7 vpackusdw xmm8,xmm8,xmm15
0xa5976359458 2d8 c4417810f9 vmovups xmm15,xmm9
0xa597635945d 2dd c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359463 2e3 c5d172d510 vpsrld xmm5,xmm5,16
0xa5976359468 2e8 c4c2512bef vpackusdw xmm5,xmm5,xmm15
0xa597635946d 2ed c57928cc vmovapd xmm9,xmm4
0xa5976359471 2f1 c44101efff vpxor xmm15,xmm15,xmm15
0xa5976359476 2f6 c443010efa55 vpblendw xmm15,xmm15,xmm10,0x55
0xa597635947c 2fc c443310ecfaa vpblendw xmm9,xmm9,xmm15,0xaa
0xa5976359482 302 c442312bcf vpackusdw xmm9,xmm9,xmm15
0xa5976359487 307 c4417810fa vmovups xmm15,xmm10
0xa597635948c 30c c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359492 312 c5d972d410 vpsrld xmm4,xmm4,16
0xa5976359497 317 c4c2592be7 vpackusdw xmm4,xmm4,xmm15
0xa597635949c 31c c4c141fdfb vpaddw xmm7,xmm7,xmm11
0xa59763594a1 321 c5c9fdf3 vpaddw xmm6,xmm6,xmm3
0xa59763594a5 325 c4c151fde8 vpaddw xmm5,xmm5,xmm8
0xa59763594aa 32a c4c159fde1 vpaddw xmm4,xmm4,xmm9
0xa59763594af 32f c5f928de vmovapd xmm3,xmm6
0xa59763594b3 333 c44101efff vpxor xmm15,xmm15,xmm15
0xa59763594b8 338 c463010eff55 vpblendw xmm15,xmm15,xmm7,0x55
0xa59763594be 33e c4c3610edfaa vpblendw xmm3,xmm3,xmm15,0xaa
0xa59763594c4 344 c4c2612bdf vpackusdw xmm3,xmm3,xmm15
0xa59763594c9 349 c57810ff vmovups xmm15,xmm7
0xa59763594cd 34d c4c10172d710 vpsrld xmm15,xmm15,16
0xa59763594d3 353 c5c972d610 vpsrld xmm6,xmm6,16
0xa59763594d8 358 c4c2492bf7 vpackusdw xmm6,xmm6,xmm15
0xa59763594dd 35d c5f928fc vmovapd xmm7,xmm4
0xa59763594e1 361 c44101efff vpxor xmm15,xmm15,xmm15
0xa59763594e6 366 c463010efd55 vpblendw xmm15,xmm15,xmm5,0x55
0xa59763594ec 36c c4c3410effaa vpblendw xmm7,xmm7,xmm15,0xaa
0xa59763594f2 372 c4c2412bff vpackusdw xmm7,xmm7,xmm15
0xa59763594f7 377 c57810fd vmovups xmm15,xmm5
0xa59763594fb 37b c4c10172d710 vpsrld xmm15,xmm15,16
0xa5976359501 381 c5d972d410 vpsrld xmm4,xmm4,16
0xa5976359506 386 c4c2592be7 vpackusdw xmm4,xmm4,xmm15
0xa597635950b 38b c5c9fdf3 vpaddw xmm6,xmm6,xmm3
0xa597635950f 38f c5d9fde7 vpaddw xmm4,xmm4,xmm7
0xa5976359513 393 c5c971d608 vpsrlw xmm6,xmm6,8
0xa5976359518 398 c5d971d408 vpsrlw xmm4,xmm4,8
0xa597635951d 39d c5d967e6 vpackuswb xmm4,xmm4,xmm6
0xa5976359521 3a1 c5fa7f243b vmovdqu [rbx+rdi*1],xmm4
0xa5976359526 3a6 443bda cmpl r11,rdx
0xa5976359529 3a9 0f8601fdffff jna 0xa5976359230 <+0xb0>
0xa597635952f 3af 488bc1 REX.W movq rax,rcx
0xa5976359532 3b2 488be5 REX.W movq rsp,rbp
0xa5976359535 3b5 5d pop rbp
0xa5976359536 3b6 c3 retl
0xa5976359537 3b7 488955e8 REX.W movq [rbp-0x18],rdx
0xa597635953b 3bb 48895de0 REX.W movq [rbp-0x20],rbx
0xa597635953f 3bf c5f81145d0 vmovups [rbp-0x30],xmm0
0xa5976359544 3c4 c5f8114dc0 vmovups [rbp-0x40],xmm1
0xa5976359549 3c9 c5f81155b0 vmovups [rbp-0x50],xmm2
0xa597635954e 3ce 488945a8 REX.W movq [rbp-0x58],rax
0xa5976359552 3d2 48897da0 REX.W movq [rbp-0x60],rdi
0xa5976359556 3d6 4c894598 REX.W movq [rbp-0x68],r8
0xa597635955a 3da e8615dffff call 0xa597634f2c0 ;; wasm stub: WasmStackGuard
0xa597635955f 3df 33c9 xorl rcx,rcx
0xa5976359561 3e1 488b55e8 REX.W movq rdx,[rbp-0x18]
0xa5976359565 3e5 488b5de0 REX.W movq rbx,[rbp-0x20]
0xa5976359569 3e9 c5f81045d0 vmovups xmm0,[rbp-0x30]
0xa597635956e 3ee c5f8104dc0 vmovups xmm1,[rbp-0x40]
0xa5976359573 3f3 c5f81055b0 vmovups xmm2,[rbp-0x50]
0xa5976359578 3f8 488b45a8 REX.W movq rax,[rbp-0x58]
0xa597635957c 3fc 488b7da0 REX.W movq rdi,[rbp-0x60]
0xa5976359580 400 4c8b4598 REX.W movq r8,[rbp-0x68]
0xa5976359584 404 488b75f0 REX.W movq rsi,[rbp-0x10]
0xa5976359588 408 e9b6fcffff jmp 0xa5976359243 <+0xc3>
0xa597635958d 40d e8fe5affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359592 412 e8f95affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359597 417 e8f45affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa597635959c 41c e8ef5affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a1 421 e8ea5affff call 0xa597634f090 ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a6 426 90 nop
0xa59763595a7 427 90 nop
Protected instructions:
pc offset land pad
3a1 40d
e2 412
dc 417
d6 41c
d1 421
Source positions:
pc offset position
d1 43
d6 239
dc 416
e2 545
3a1 722
3b7 29
40d 722
412 545
417 416
41c 239
421 43
Safepoints (size = 22)
0xa5a7635917fffffffff 000000000000000 (sp -> fp)
RelocInfo (size = 8)
0xa597635955b wasm stub call
0xa597635958e wasm stub call
0xa5976359593 wasm stub call
0xa5976359598 wasm stub call
0xa597635959d wasm stub call
0xa59763595a2 wasm stub call
--- End code ---
最佳答案
从Emscripten issue复制我的答案:
我们之所以还没有使用v128.const,是因为v128.const直到最近才在V8中实现。为避免影响Origin Trial(源试用)用户,在相关的V8修补程序进入Chrome稳定版之前,我们无法让LLVM默认生成v128.const。我会密切关注此仪表板,以确定何时适合进行这项更改。如果您使用的是较新版本的Chrome,或其他支持v128.const的执行环境,可以尝试使用-munimplemented-simd128标志编译项目,该标志会在LLVM中启用v128.const(但也可能引入其他不需要的更改)。一旦v128.const得到广泛支持,LLVM使用v128.const会比从内存中加载向量更好,因为这样引擎可以根据运行时平台自行决定构造该向量的最佳方式。
还可能需要考虑移植代码中性能敏感的部分以直接使用WebAssembly内部函数头,而不是依赖于模拟的SSE。这将减少您的代码与基础机器代码之间的阻抗不匹配层。
最后,如果您发现某处的指令选择不够理想,可以就您所看到的具体问题提交LLVM错误报告(如果问题出在代码→wasm一侧)或V8错误报告(如果问题出在wasm→本机代码一侧),这会很有帮助。这类反馈对我们来说非常有价值。