今天讲讲arm汇编中除法的底层实现。汇编代码本身比较长了,如需参考请直接拉到文末。
下面我直接把arm的除法算法的汇编代码转译成C语言的代码贴出来,并进行解析。
因为篇幅有限,所以在此只解析无符号整型的除法运算,关于无符号除法和有符号除法的区别请参考上一篇推送。
代码较长如下,电脑端看效果更佳,如无耐心请直接拉下去看讲解即可:
#include<stdio.h>
unsigned int count_leading_zeros(unsigned int num)
{
unsigned int cnt = 0;
while(!(num & 0x80000000) && cnt < 32){
cnt++;
num <<= 1;
}
return cnt;
}
unsigned int div_unsigned(unsigned int dividend, unsigned int divisor)
{
unsigned int answer = 0;
int cc;
unsigned int divisor_lz = 0, dividend_lz = 0;
if (divisor == 1){
return dividend;
}else if (divisor < 1){
return -1;
}
if (divisor == dividend){
return 1;
}else if (dividend < divisor){
return 0;
}
if ((divisor & (divisor - 1)) == 0){
return dividend >> (31 - count_leading_zeros(divisor));
}
divisor_lz = count_leading_zeros(divisor);
dividend_lz = count_leading_zeros(dividend);
printf("dividend[0x%x], dividend_lz[%d], divisor[0x%x], divisor_lz[%d]\n", dividend, dividend_lz, divisor, divisor_lz);
cc = divisor_lz - dividend_lz;
while(cc >= 0){
answer <<= 1;
if (dividend >= (divisor << cc)){
answer += 1;
dividend -= (divisor << cc);
}
cc--;
}
return answer;
}
main(){
unsigned int a = 0x80000000 / 3;
unsigned int b = div_unsigned(0x80000000, 3);
printf("[0x%x][0x%x]",a, b);
}
2次幂和移位运算
在以上代码中我们终于看到了移位运算对除法运算的优化:
当除数是2的N次幂时,可以直接对被除数做右移运算来代替除法, 比如除数是2即(2的1次幂),此时只需要对被除数做一次右移即可,同理如果除数是8则对被除数做三次右移。
而判断一个数字是不是2的N次幂只需要一行代码:
if ((divisor & (divisor - 1)) == 0){
这一行代码也几乎就是leetcode的第231题2的幂的答案:
如有疑问请继续参考leetcode的题解:https://leetcode-cn.com/problems/power-of-two/solution/power-of-two-er-jin-zhi-ji-jian-by-jyd/
而计算2的N次幂中的N,也只需要这一句即可:
(31 - count_leading_zeros(divisor))
count_leading_zeros即为一个32bit的数字以二进制呈现的时候,从高位向低位数开始数有连续多少个0的数量。
比如数字2的二进制是: 0000 0000 0000 0000 0000 0000 0000 0010
在第一个bit1出现之前有30个0。
判断是否是2的N次幂,并且计算出N的大小并进行右移也只需要以下三行代码。
if ((divisor & (divisor - 1)) == 0){
return dividend >> (31 - count_leading_zeros(divisor));
}
为什么要使用count_leading_zeros这种方法呢,虽然我在上面的代码中定义了函数count_leading_zeros,但是在arm汇编中只需要一条指令clz即可,计算2的N次幂的N加上右移也只需要三条指令即可,非常高效:
clz r2, r1 //计算leading zeros的数量
rsb r2, r2, #31 //31 - count_leading_zeros(divisor)
lsr.w r0, r0, r2 // 进行右移
二进制的除法解析
那么更多情况下,除数也并不是2的N次幂。如果除数是3,那么还是要做一下正规的除法了。
我做了一张图来对比8/3的十进制和二进制的除法。
在二进制时,任何一个bit不可能大于1,所以当两个数字的leading zeros相同时,被除数不可能会整除除数超过或者等于两次。也就是说leading zeros相同时,被除数要么能整除除数一次,要么是0次。
二进制运算除法的时候,首先会对除数做左移操作,让除数和被除数进行“对齐”(即leading zeros数量相同),如果此时的被除数大于等于此时(左移后的)除数,那么在相应的答案位上置一,否则置0。然后对(左移后的)除数做右移一位操作再继续和被除数做比较,直到除数恢复成原来的初始值(这时候会作最后一次运算)。如下代码所示:
cc = divisor_lz - dividend_lz;
while(cc >= 0){
answer <<= 1;
if (dividend >= (divisor << cc)){
answer += 1;
dividend -= (divisor << cc);
}
cc--;
}
所以在二进制整型数字的除法世界中,只需要减法和移位操作就能够满足除法运算的需求。最后我才发现,二进制的除法原本就是这么简单,比十进制的除法还要简单。
本文完,以下为参考资料。
arm的指令集查文档:
http://users.ece.utexas.edu/~valvano/Volume1/QuickReferenceCard.pdf
https://iitd-plos.github.io/col718/ref/arm-instructionset.pdf
div无符号整形的除法汇编如下:
00010490 <__udivsi3>:
10490: 1e4a subs r2, r1, #1
10492: bf08 it eq
10494: 4770 bxeq lr
10496: f0c0 8124 bcc.w 106e2 <__udivsi3+0x252>
1049a: 4288 cmp r0, r1
1049c: f240 8116 bls.w 106cc <__udivsi3+0x23c>
104a0: 4211 tst r1, r2
104a2: f000 8117 beq.w 106d4 <__udivsi3+0x244>
104a6: fab0 f380 clz r3, r0
104aa: fab1 f281 clz r2, r1
104ae: eba2 0303 sub.w r3, r2, r3
104b2: f1c3 031f rsb r3, r3, #31
104b6: a204 add r2, pc, #16 ; (adr r2, 104c8 <__udivsi3+0x38>)
104b8: eb02 1303 add.w r3, r2, r3, lsl #4
104bc: f04f 0200 mov.w r2, #0
104c0: 469f mov pc, r3
104c2: bf00 nop
104c4: f3af 8000 nop.w
104c8: ebb0 7fc1 cmp.w r0, r1, lsl #31
104cc: bf00 nop
104ce: eb42 0202 adc.w r2, r2, r2
104d2: bf28 it cs
104d4: eba0 70c1 subcs.w r0, r0, r1, lsl #31
104d8: ebb0 7f81 cmp.w r0, r1, lsl #30
104dc: bf00 nop
104de: eb42 0202 adc.w r2, r2, r2
104e2: bf28 it cs
104e4: eba0 7081 subcs.w r0, r0, r1, lsl #30
104e8: ebb0 7f41 cmp.w r0, r1, lsl #29
104ec: bf00 nop
104ee: eb42 0202 adc.w r2, r2, r2
104f2: bf28 it cs
104f4: eba0 7041 subcs.w r0, r0, r1, lsl #29
104f8: ebb0 7f01 cmp.w r0, r1, lsl #28
104fc: bf00 nop
104fe: eb42 0202 adc.w r2, r2, r2
10502: bf28 it cs
10504: eba0 7001 subcs.w r0, r0, r1, lsl #28
10508: ebb0 6fc1 cmp.w r0, r1, lsl #27
1050c: bf00 nop
1050e: eb42 0202 adc.w r2, r2, r2
10512: bf28 it cs
10514: eba0 60c1 subcs.w r0, r0, r1, lsl #27
10518: ebb0 6f81 cmp.w r0, r1, lsl #26
1051c: bf00 nop
1051e: eb42 0202 adc.w r2, r2, r2
10522: bf28 it cs
10524: eba0 6081 subcs.w r0, r0, r1, lsl #26
10528: ebb0 6f41 cmp.w r0, r1, lsl #25
1052c: bf00 nop
1052e: eb42 0202 adc.w r2, r2, r2
10532: bf28 it cs
10534: eba0 6041 subcs.w r0, r0, r1, lsl #25
10538: ebb0 6f01 cmp.w r0, r1, lsl #24
1053c: bf00 nop
1053e: eb42 0202 adc.w r2, r2, r2
10542: bf28 it cs
10544: eba0 6001 subcs.w r0, r0, r1, lsl #24
10548: ebb0 5fc1 cmp.w r0, r1, lsl #23
1054c: bf00 nop
1054e: eb42 0202 adc.w r2, r2, r2
10552: bf28 it cs
10554: eba0 50c1 subcs.w r0, r0, r1, lsl #23
10558: ebb0 5f81 cmp.w r0, r1, lsl #22
1055c: bf00 nop
1055e: eb42 0202 adc.w r2, r2, r2
10562: bf28 it cs
10564: eba0 5081 subcs.w r0, r0, r1, lsl #22
10568: ebb0 5f41 cmp.w r0, r1, lsl #21
1056c: bf00 nop
1056e: eb42 0202 adc.w r2, r2, r2
10572: bf28 it cs
10574: eba0 5041 subcs.w r0, r0, r1, lsl #21
10578: ebb0 5f01 cmp.w r0, r1, lsl #20
1057c: bf00 nop
1057e: eb42 0202 adc.w r2, r2, r2
10582: bf28 it cs
10584: eba0 5001 subcs.w r0, r0, r1, lsl #20
10588: ebb0 4fc1 cmp.w r0, r1, lsl #19
1058c: bf00 nop
1058e: eb42 0202 adc.w r2, r2, r2
10592: bf28 it cs
10594: eba0 40c1 subcs.w r0, r0, r1, lsl #19
10598: ebb0 4f81 cmp.w r0, r1, lsl #18
1059c: bf00 nop
1059e: eb42 0202 adc.w r2, r2, r2
105a2: bf28 it cs
105a4: eba0 4081 subcs.w r0, r0, r1, lsl #18
105a8: ebb0 4f41 cmp.w r0, r1, lsl #17
105ac: bf00 nop
105ae: eb42 0202 adc.w r2, r2, r2
105b2: bf28 it cs
105b4: eba0 4041 subcs.w r0, r0, r1, lsl #17
105b8: ebb0 4f01 cmp.w r0, r1, lsl #16
105bc: bf00 nop
105be: eb42 0202 adc.w r2, r2, r2
105c2: bf28 it cs
105c4: eba0 4001 subcs.w r0, r0, r1, lsl #16
105c8: ebb0 3fc1 cmp.w r0, r1, lsl #15
105cc: bf00 nop
105ce: eb42 0202 adc.w r2, r2, r2
105d2: bf28 it cs
105d4: eba0 30c1 subcs.w r0, r0, r1, lsl #15
105d8: ebb0 3f81 cmp.w r0, r1, lsl #14
105dc: bf00 nop
105de: eb42 0202 adc.w r2, r2, r2
105e2: bf28 it cs
105e4: eba0 3081 subcs.w r0, r0, r1, lsl #14
105e8: ebb0 3f41 cmp.w r0, r1, lsl #13
105ec: bf00 nop
105ee: eb42 0202 adc.w r2, r2, r2
105f2: bf28 it cs
105f4: eba0 3041 subcs.w r0, r0, r1, lsl #13
105f8: ebb0 3f01 cmp.w r0, r1, lsl #12
105fc: bf00 nop
105fe: eb42 0202 adc.w r2, r2, r2
10602: bf28 it cs
10604: eba0 3001 subcs.w r0, r0, r1, lsl #12
10608: ebb0 2fc1 cmp.w r0, r1, lsl #11
1060c: bf00 nop
1060e: eb42 0202 adc.w r2, r2, r2
10612: bf28 it cs
10614: eba0 20c1 subcs.w r0, r0, r1, lsl #11
10618: ebb0 2f81 cmp.w r0, r1, lsl #10
1061c: bf00 nop
1061e: eb42 0202 adc.w r2, r2, r2
10622: bf28 it cs
10624: eba0 2081 subcs.w r0, r0, r1, lsl #10
10628: ebb0 2f41 cmp.w r0, r1, lsl #9
1062c: bf00 nop
1062e: eb42 0202 adc.w r2, r2, r2
10632: bf28 it cs
10634: eba0 2041 subcs.w r0, r0, r1, lsl #9
10638: ebb0 2f01 cmp.w r0, r1, lsl #8
1063c: bf00 nop
1063e: eb42 0202 adc.w r2, r2, r2
10642: bf28 it cs
10644: eba0 2001 subcs.w r0, r0, r1, lsl #8
10648: ebb0 1fc1 cmp.w r0, r1, lsl #7
1064c: bf00 nop
1064e: eb42 0202 adc.w r2, r2, r2
10652: bf28 it cs
10654: eba0 10c1 subcs.w r0, r0, r1, lsl #7
10658: ebb0 1f81 cmp.w r0, r1, lsl #6
1065c: bf00 nop
1065e: eb42 0202 adc.w r2, r2, r2
10662: bf28 it cs
10664: eba0 1081 subcs.w r0, r0, r1, lsl #6
10668: ebb0 1f41 cmp.w r0, r1, lsl #5
1066c: bf00 nop
1066e: eb42 0202 adc.w r2, r2, r2
10672: bf28 it cs
10674: eba0 1041 subcs.w r0, r0, r1, lsl #5
10678: ebb0 1f01 cmp.w r0, r1, lsl #4
1067c: bf00 nop
1067e: eb42 0202 adc.w r2, r2, r2
10682: bf28 it cs
10684: eba0 1001 subcs.w r0, r0, r1, lsl #4
10688: ebb0 0fc1 cmp.w r0, r1, lsl #3
1068c: bf00 nop
1068e: eb42 0202 adc.w r2, r2, r2
10692: bf28 it cs
10694: eba0 00c1 subcs.w r0, r0, r1, lsl #3
10698: ebb0 0f81 cmp.w r0, r1, lsl #2
1069c: bf00 nop
1069e: eb42 0202 adc.w r2, r2, r2
106a2: bf28 it cs
106a4: eba0 0081 subcs.w r0, r0, r1, lsl #2
106a8: ebb0 0f41 cmp.w r0, r1, lsl #1
106ac: bf00 nop
106ae: eb42 0202 adc.w r2, r2, r2
106b2: bf28 it cs
106b4: eba0 0041 subcs.w r0, r0, r1, lsl #1
106b8: ebb0 0f01 cmp.w r0, r1
106bc: bf00 nop
106be: eb42 0202 adc.w r2, r2, r2
106c2: bf28 it cs
106c4: eba0 0001 subcs.w r0, r0, r1
106c8: 4610 mov r0, r2
106ca: 4770 bx lr
106cc: bf0c ite eq
106ce: 2001 moveq r0, #1
106d0: 2000 movne r0, #0
106d2: 4770 bx lr
106d4: fab1 f281 clz r2, r1
106d8: f1c2 021f rsb r2, r2, #31
106dc: fa20 f002 lsr.w r0, r0, r2
106e0: 4770 bx lr
106e2: b108 cbz r0, 106e8 <__udivsi3+0x258>
106e4: f04f 30ff mov.w r0, #4294967295 ; 0xffffffff
106e8: f000 b966 b.w 109b8 <__aeabi_idiv0>