问题描述
我有这样的代码
for(int k = 0; k {
int xc0 = 512 +((idx + k * iddx)> 6);
int yc0 = 512 +((idy + k * iddy)>> 6);
int xc1 = 512 +((idx +(k + 1)* iddx)>> 6);
int yc1 = 512 +((idy +(k + 1)* iddy)>> 6);
int xc2 = 512 +((idx +(k + 2)* iddx)> 6);
int yc2 = 512 +((idy +(k + 2)* iddy)> 6);
int xc3 = 512 +((idx +(k + 3)* iddx)> 6);
int yc3 = 512 +((idy +(k + 3)* iddy)> 6);
unsigned color0 = working_buffer [yc0 * working_buffer_size_x + xc0];
unsigned color1 = working_buffer [yc1 * working_buffer_size_x + xc1];
unsigned color2 = working_buffer [yc2 * working_buffer_size_x + xc2];
unsigned color3 = working_buffer [yc3 * working_buffer_size_x + xc3];
int adr = base_adr + k;
frame_bitmap [adr] = color0;
frame_bitmap [adr + 1] = color1;
frame_bitmap [adr + 2] = colour2;
frame_bitmap [adr + 3] = color3;
}
这里用到的全部是 int/unsigned 类型,这是循环中的关键部分。我不确定整数 SSE 是否能提升这里的速度,但想知道它是否可行?有人能帮忙吗?
(我使用的是 mingw32)
我的 SSE 有点生疏了,但你应该这样做:
xmm0:[k,k + 1,k + 2,k + 3] // xc0,xc1,...
xmm1:[k,k + 1,k + 2,k + 3] // yc0,yc1,....
//在循环之前初始化
xmm2:[512,512,512,512]
xmm3:[idx,idx,idx,idx]
xmm4:[iddx,iddx,iddx,iddx]
xmm5:[idy,idy,idy,idy]
xmm6:[iddy,iddy,iddy,iddy]
xmm7:[working_buffer_size_x,working_buffer_size_x,working_buffer_size_x,working_buffer_size_x]
计算:
xmm0 * xmm4
xmm0 + xmm3
xmm0 >> 6
xmm0 + xmm2
xmm0:[xc0,xc1,xc2,xc3]
/////////////////// ////////////
xmm1 * xmm6
xmm1 + xmm5
xmm1 >> 6
xmm1 + xmm2
xmm1:[yc0,yc1,yc2,yc3]
xmm1 * xmm7
xmm1 + xmm0
现在 xmm1 是:
xmm1:[yc0 * working_buffer_size_x + xc0,yc1 * working_buffer_size_x + xc1,yc2 * working_buffer_size_x + xc2,yc3 * working_buffer_size_x + xc3]
您在每次循环中都要读写内存(working_buffer、frame_bitmap 数组),这些操作比计算本身慢得多,因此速度提升不会像您期望的那么大。
您需要 working_buffer 和 frame_bitmap 数组对齐,并且需要 SSE4.1 :
#include< emmintrin.h>
#include< smmintrin.h> //SSE4.1
int a [4] __attribute __((aligned(16)));
__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
xmm2 = _mm_set1_epi32(512);
xmm3 = _mm_set1_epi32(idx);
xmm4 = _mm_set1_epi32(iddx);
xmm5 = _mm_set1_epi32(idy);
xmm6 = _mm_set1_epi32(iddy);
xmm7 = _mm_set1_epi32(working_buffer_size_x);
for(k = 0;k≤n-4; k + = 4){
xmm0 = _mm_set_epi32(k + 3,k + 2,k + 1,k) ;
xmm1 = _mm_set_epi32(k + 3,k + 2,k + 1,k);
// xmm0 * xmm4
xmm0 = _mm_mullo_epi32(xmm0,xmm4);
// xmm0 + xmm3
xmm0 = _mm_add_epi32(xmm0,xmm3);
// xmm0>> 6
xmm0 = _mm_srai_epi32(xmm0,6);
// xmm0 + xmm2
xmm0 = _mm_add_epi32(xmm0,xmm2);
// xmm1 * xmm6
xmm1 = _mm_mullo_epi32(xmm1,xmm6);
// xmm1 + xmm5
xmm1 = _mm_add_epi32(xmm1,xmm5);
// xmm1>> 6
xmm1 = _mm_srai_epi32(xmm1,6);
// xmm1 + xmm2
xmm1 = _mm_add_epi32(xmm1,xmm2);
// xmm1 * xmm7
xmm1 = _mm_mullo_epi32(xmm1,xmm7);
// xmm1 + xmm0
xmm1 = _mm_add_epi32(xmm1,xmm0);
// a [0] = yc0 * working_buffer_size_x + xc0
// a [1] = yc1 * working_buffer_size_x + xc1
// a [2] = yc2 * working_buffer_size_x + xc2
// a [3] = yc3 * working_buffer_size_x + xc3
_mm_store_si128((__ m128i *)& a [0],xmm1);
unsigned color0 = working_buffer [a [0]];
unsigned color1 = working_buffer [a [1]];
unsigned color2 = working_buffer [a [2]];
unsigned color3 = working_buffer [a [3]];
int adr = base_adr + k;
frame_bitmap [adr] = color0;
frame_bitmap [adr + 1] = color1;
frame_bitmap [adr + 2] = color2;
frame_bitmap [adr + 3] = color3;
}
您还可以进一步优化:使用汇编直接操作内存,从而避免 _mm_store_si128((__m128i *)&a[0], xmm1); 或 int adr = base_adr + k;。
I'm new to SSE intrinsics and would appreciate some hints and assistance in using them (as this is still foggy to me).
I have this code:
/*
 * Scalar reference loop: each pass handles four consecutive output pixels.
 * Source coordinates are 512 + ((start + step * index) >> 6) for x and y;
 * the colour is read from working_buffer (row stride working_buffer_size_x)
 * and written to frame_bitmap starting at base_adr + k.
 * Only full groups of four are processed (k <= n - 4).
 */
for (int k = 0; k <= n - 4; k += 4) {
    /* Source coordinates of the four pixels in this group. */
    int xc0 = 512 + ((idx + k * iddx) >> 6);
    int yc0 = 512 + ((idy + k * iddy) >> 6);
    int xc1 = 512 + ((idx + (k + 1) * iddx) >> 6);
    int yc1 = 512 + ((idy + (k + 1) * iddy) >> 6);
    int xc2 = 512 + ((idx + (k + 2) * iddx) >> 6);
    int yc2 = 512 + ((idy + (k + 2) * iddy) >> 6);
    int xc3 = 512 + ((idx + (k + 3) * iddx) >> 6);
    int yc3 = 512 + ((idy + (k + 3) * iddy) >> 6);

    /* All four loads happen before the first store. */
    unsigned color0 = working_buffer[yc0 * working_buffer_size_x + xc0];
    unsigned color1 = working_buffer[yc1 * working_buffer_size_x + xc1];
    unsigned color2 = working_buffer[yc2 * working_buffer_size_x + xc2];
    unsigned color3 = working_buffer[yc3 * working_buffer_size_x + xc3];

    /* Four consecutive destination slots. */
    int adr = base_adr + k;
    frame_bitmap[adr]     = color0;
    frame_bitmap[adr + 1] = color1;
    frame_bitmap[adr + 2] = color2;
    frame_bitmap[adr + 3] = color3;
}
Everything here is int/unsigned, and this is the critical part of the loop. I'm not sure whether integer SSE would help with speed here, but I wonder if it would work at all. Could someone help with this?
(I'm using mingw32)
My sse is a bit rusty, but what you should do is:
xmm0: [k, k+1, k+2, k+3] //xc0, xc1,....
xmm1: [k, k+1, k+2, k+3] //yc0, yc1,....
//initialize before the loop
xmm2: [512, 512, 512, 512]
xmm3: [idx, idx, idx, idx]
xmm4: [iddx, iddx, iddx, iddx]
xmm5: [idy, idy, idy, idy]
xmm6: [iddy, iddy, iddy, iddy]
xmm7: [working_buffer_size_x, working_buffer_size_x, working_buffer_size_x, working_buffer_size_x]
Calculations:
xmm0 * xmm4
xmm0 + xmm3
xmm0 >> 6
xmm0 + xmm2
xmm0: [xc0, xc1, xc2, xc3]
///////////////////////////////
xmm1 * xmm6
xmm1 + xmm5
xmm1 >> 6
xmm1 + xmm2
xmm1: [yc0, yc1, yc2, yc3]
xmm1 * xmm7
xmm1 + xmm0
Now xmm1 is:
xmm1: [yc0*working_buffer_size_x + xc0, yc1*working_buffer_size_x + xc1, yc2*working_buffer_size_x + xc2, yc3*working_buffer_size_x + xc3]
You are reading and writing memory in each loop iteration (the working_buffer and frame_bitmap arrays), and those operations are far slower than the calculations themselves, so the speed improvement won't be as large as you might expect.
EDIT
You need working_buffer and frame_bitmap arrays to be aligned and SSE4.1:
#include <emmintrin.h> #include <smmintrin.h> //SSE4.1 int a[4] __attribute__((aligned(16))); __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; xmm2 = _mm_set1_epi32(512); xmm3 = _mm_set1_epi32(idx); xmm4 = _mm_set1_epi32(iddx); xmm5 = _mm_set1_epi32(idy); xmm6 = _mm_set1_epi32(iddy); xmm7 = _mm_set1_epi32(working_buffer_size_x); for(k = 0; k <= n - 4; k +=4){ xmm0 = _mm_set_epi32(k + 3, k + 2, k + 1, k); xmm1 = _mm_set_epi32(k + 3, k + 2, k + 1, k); //xmm0 * xmm4 xmm0 = _mm_mullo_epi32(xmm0, xmm4); //xmm0 + xmm3 xmm0 = _mm_add_epi32(xmm0, xmm3); //xmm0 >> 6 xmm0 = _mm_srai_epi32(xmm0, 6); //xmm0 + xmm2 xmm0 = _mm_add_epi32(xmm0, xmm2); //xmm1 * xmm6 xmm1 = _mm_mullo_epi32(xmm1, xmm6); //xmm1 + xmm5 xmm1 = _mm_add_epi32(xmm1, xmm5); //xmm1 >> 6 xmm1 = _mm_srai_epi32(xmm1, 6); //xmm1 + xmm2 xmm1 = _mm_add_epi32(xmm1, xmm2); //xmm1 * xmm7 xmm1 = _mm_mullo_epi32(xmm1, xmm7); //xmm1 + xmm0 xmm1 = _mm_add_epi32(xmm1, xmm0); //a[0] = yc0*working_buffer_size_x + xc0 //a[1] = yc1*working_buffer_size_x + xc1 //a[2] = yc2*working_buffer_size_x + xc2 //a[3] = yc3*working_buffer_size_x + xc3 _mm_store_si128((__m128i *)&a[0], xmm1); unsigned color0 = working_buffer[ a[0] ]; unsigned color1 = working_buffer[ a[1] ]; unsigned color2 = working_buffer[ a[2] ]; unsigned color3 = working_buffer[ a[3] ]; int adr = base_adr + k; frame_bitmap[adr] = color0; frame_bitmap[adr+1]= color1; frame_bitmap[adr+2]= color2; frame_bitmap[adr+3]= color3; }
You can optimize it even more by avoiding the _mm_store_si128((__m128i *)&a[0], xmm1); or the int adr = base_adr + k; using assembly with direct manipulation of memory.
这篇关于如何将此代码重写为sse内在函数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!