我正在学习汇编程序,并且开始在Digital-Mars C++编译器(更易于阅读的intel sytanx)中的SSE和MMX寄存器上进行实验。我已经完成了一个将var_1作为值并将其转换为var_2数字系统的程序(目前为8位。稍后将其扩展为32 64 128)。程序通过两种方式执行此操作:
__asm
内联问题:您能告诉我使用xmm0-7和mm0-7寄存器的更有效方法吗?您能告诉我如何与al,ah ... 8位寄存器交换它们的确切字节吗?
与计算机上的__asm(pentium-m centrino 2.0GHz)相比,以C++常用方式执行的常规%(modulo)运算符非常慢。
如果您能告诉我如何摆脱__asmm中的除法指令,它将更快。
当我运行程序时,它给了我:
(for the values: var_1=17,var_2=2,all loops are 200M times)
17 is 10001 in number system 2
__asm(clock)...........: 7250 <------too bad. it is 8-bit calc.
C++(clock).............: 12250 <------not very slow(var_2 is a power of 2)
(for the values: var_1=33,var_2=7,all loops are 200M times)
33 is 45 in number system 7
__asm(clock)..........: 2875 <-------not good. it is 8-bit calc.
C++(clock)............: 6328 <----------------really slow(var_2 is not a power of 2)
第二个C++代码(带有%运算符的代码):////////////////////////////////////// ///////////////////
t1=clock();//reference time
for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
while(y>g)
{
var_3[counter]=y%g;
y/=g;
counter++;
}
var_3[counter]=y%g;
}
t2=clock();//final time
_asm代码://////////////////////////////////////////////// ///////////////////////////////////////////////////// ///////////
__asm // i love assembly in some parts of C++
{
pushf //here does register backup
push eax
push ebx
push ecx
push edx
push edi
mov eax,0h //this will be outer loop counter init to zero
//init of medium-big registers to zero
movd xmm0,eax //cannot set to immediate constant: xmm0=outer loop counter
shufps xmm0,xmm0,0h //this makes all bits zero
movd xmm1,eax
movd xmm2,eax
shufps xmm1,xmm1,0h
shufps xmm2,xmm2,0h
movd xmm2,eax
shufps xmm3,xmm3,0h//could have made pxor xmm3,xmm3(single instruction)
//init complete(xmm0,xmm1,xmm2,xmm3 are zero)
movd xmm1,[var_1] //storing variable_1 to register
movd xmm2,[var_2] //storing var_2 to register
lea ebx,var_3 //calculate var_3 address
movd xmm3,ebx //storing var_3's address to register
for_loop:
mov eax,0h
//this line is index-init to zero(digit array index)
movd edx,xmm2
mov cl,dl //this is the var_1 stored in cl
movd edx,xmm1
mov al,dl //this is the var_2 stored in al
mov edx,0h
dng:
mov ah,00h //preparation for a 8-bit division
div cl //divide
movd ebx,xmm3 //get var_3 address
add ebx,edx //i couldnt find a way to multiply with 4
add ebx,edx //so i added 4 times ^^
add ebx,edx //add
add ebx,edx //last adding
//below, mov [ebx],ah is the only memory accessing instruction
mov [ebx],ah //(8 bit)this line is equivalent to var_3[i]=remainder
inc edx //i++;
cmp al,00h //is division zero?
jne dng //if no, loop again
//here edi register has the number of digits
movd eax,xmm0 //get the outer loop counter from medium-big register
add eax,01h //j++;
movd xmm0,eax //store the new counter to medium-big register
cmp eax,0BEBC200h //is j<(200,000,000) ?
jb for_loop //if yes, go loop again
mov [var_3_size],edx //now we have number of digits too!
//here does registers revert back to old values
pop edi
pop edx
pop ecx
pop ebx
pop eax
popf
}
整个代码:///////////////////////////////////////////////// ////////////////////////////////////////////
#include <iostream.h>
#include <cmath>
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
int main()
{
srand(time(0));
clock_t t1=clock();
clock_t t2=clock();
int var_1=17; //number itself
int var_2=2; //number system
int var_3[100]; //digits to be showed(maximum 100 as seen )
int var_3_size=0;//asm block will decide what will the number of digits be
for(int i=0;i<100;i++)
{
var_3[i]=0; //here we initialize digits to zeroes
}
t1=clock();//reference time to take
__asm // i love assembly in some parts of C++
{
pushf //here does register backup
push eax
push ebx
push ecx
push edx
push edi
mov eax,0h //this will be outer loop counter init to zero
//init of medium-big registers to zero
movd xmm0,eax //cannot set to immediate constant: xmm0=outer loop counter
shufps xmm0,xmm0,0h //this makes all bits zero
movd xmm1,eax
movd xmm2,eax
shufps xmm1,xmm1,0h
shufps xmm2,xmm2,0h
movd xmm2,eax
shufps xmm3,xmm3,0h
//init complete(xmm0,xmm1,xmm2,xmm3 are zero)
movd xmm1,[var_1] //storing variable_1 to register
movd xmm2,[var_2] //storing var_2 to register
lea ebx,var_3 //calculate var_3 address
movd xmm3,ebx //storing var_3's address to register
for_loop:
mov eax,0h
//this line is index-init to zero(digit array index)
movd edx,xmm2
mov cl,dl //this is the var_1 stored in cl
movd edx,xmm1
mov al,dl //this is the var_2 stored in al
mov edx,0h
dng:
mov ah,00h //preparation for a 8-bit division
div cl //divide
movd ebx,xmm3 //get var_3 address
add ebx,edx //i couldnt find a way to multiply with 4
add ebx,edx //so i added 4 times ^^
add ebx,edx //add
add ebx,edx //last adding
//below, mov [ebx],ah is the only memory accessing instruction
mov [ebx],ah //(8 bit)this line is equivalent to var_3[i]=remainder
inc edx //i++;
cmp al,00h //is division zero?
jne dng //if no, loop again
//here edi register has the number of digits
movd eax,xmm0 //get the outer loop counter from medium-big register
add eax,01h //j++;
movd xmm0,eax //store the new counter to medium-big register
cmp eax,0BEBC200h //is j<(200,000,000) ?
jb for_loop //if yes, go loop again
mov [var_3_size],edx //now we have number of digits too!
//here does registers revert back to old values
pop edi
pop edx
pop ecx
pop ebx
pop eax
popf
}
t2=clock(); //finish time
printf("\n assembly_inline(clocks): %i for the 200 million calculations",(t2-t1));
printf("\n value %i(in decimal) is: ",var_1);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",var_2);
//and: more readable form(end easier)
int counter=var_3_size;
int x=var_1;
int g=var_2;
int y=x;// backup
t1=clock();//reference time
for(int i=0;i<200000000;i++)
{
y=x;
counter=0;
while(y>g)
{
var_3[counter]=y%g;
y/=g;
counter++;
}
var_3[counter]=y%g;
}
t2=clock();//final time
printf("\n C++(clocks): %i for the 200 million calculations",(t2-t1));
printf("\n value %i(in decimal) is: ",x);
for(int i=var_3_size-1;i>=0;i--)
{
printf("%i",var_3[i]);
}
printf(" in the number system: %i \n",g);
return 0;
}
编辑:
这是32位版本
void get_digits_asm()
{
__asm
{
pushf //couldnt store this in other registers
movd xmm0,eax//storing in xmm registers instead of pushing
movd xmm1,ebx//
movd xmm2,ecx//
movd xmm3,edx//
movd xmm4,edi//end of push backups
mov eax,[variable_x]
mov ebx,[number_system]
mov ecx,0h
mov edi,0h
begin_loop:
mov edx,0h
div ebx
lea edi,digits
mov [edi+ecx*4],edx
add ecx,01h
cmp eax,ebx
ja begin_loop
mov edx,0
div ebx
lea edi,digits
mov [edi+ecx*4],edx
inc ecx
mov [digits_total],ecx
movd edi,xmm4//pop edi
movd edx,xmm3//pop edx
movd ecx,xmm2//pop ecx
movd ebx,xmm1//pop ebx
movd eax,xmm0//pop eax
popf
}
}
最佳答案
当然,代码可以简单得多:(以C++版本为模型,不包含push和pops,并且未经测试)
mov esi,200000000
_bigloop:
mov eax,[y]
mov ebx,[g]
lea edi,var_3
; eax = y
; ebx = g
; edi = var_3
xor ecx,ecx
; ecx = counter
_loop:
xor edx,edx
div ebx
mov [edi+ecx*4],edx
add ecx,1
test eax,eax
jnz _loop
sub esi,1
jnz _bigloop
但是如果它比C++版本快,我会感到惊讶,实际上,如果基数是2的幂,它几乎肯定会变慢-所有理智的编译器都知道如何通过两成移位和按位与。
这是使用8位除法的版本。类似的警告适用,但是现在除法甚至可能溢出(如果
y / g
大于255)。 mov esi,200000000
_bigloop:
mov eax,[y]
mov ebx,[g]
lea edi,var_3
; eax = y
; ebx = g
; edi = var_3
xor ecx,ecx
; ecx = counter
_loop:
div bl
mov [edi+ecx],ah
add ecx,1
and eax,0xFF
jnz _loop
sub esi,1
jnz _bigloop