; ARM Memory Copy
;
; (web page residue, translated: "scan the QR code to view")
         MODULE  ARM_MEMORY

         ; Entry points exported to the linker.
         ; NOTE(review): the arm_memxor* routines further down in this file are
         ; not declared PUBLIC here - confirm whether that is intentional.
         PUBLIC  ARM_MEMCPY
         PUBLIC  ARM_MEMSET
         PUBLIC  ARM_MEMSET8
         PUBLIC  ARM_MEMSET16
         PUBLIC  ARM_MEMSET32

         ; One directive per line; the alignment operand of SECTION is a power
         ; of two, so (2) requests the 4-byte alignment ARM-mode code needs.
         SECTION .text:CODE:NOROOT(2)
         CODE32

;-------------------------------------------------------------------------------
; void ARM_MEMCPY(void* pDest, void* pSrc, U32 NumBytes)
;
; Function description
;   Copy NumBytes bytes from pSrc to pDest, walking both buffers upwards.
;     1. NumBytes <= 3           : byte copy only.
;     2. Otherwise copy 1..3 bytes until pDest is word aligned.
;     3. pSrc word aligned too   : LDM/STM blocks of 32 bytes, then the
;                                  16/8/4 byte remainders, then trailing bytes.
;     4. pSrc not word aligned   : read aligned source words and merge two
;                                  neighbouring words with shifts to build
;                                  each destination word.
;
; Notes
;   - Addresses only ever increase; overlapping buffers get no special handling.
;   - The shift/merge path loads whole aligned words, so it also reads the
;     bytes that share an aligned word with the first/last source byte.
;   - The shift directions used (LSR on the older word, LSL on the newer one)
;     correspond to little-endian byte order.
;
; Register usage:
;
;   R0    pDest    (advanced while copying)
;   R1    pSrc     (advanced while copying)
;   R2    NumBytes (remaining count; only its low bits stay meaningful
;                   once a "subs" has driven it negative)
;
;   R3    Used for data transfers
;   R4    Used for data transfers (bulk path only, saved/restored on stack)
;   R12   Used for data transfers
;   R14   Used for data transfers (bulk path only, saved/restored on stack)
;
;   R13   SP
;   R14   LR (contains return address)
;   R15   PC
;
;-------------------------------------------------------------------------------
ARM_MEMCPY:
;-------------------------------------------------------------------------------
        cmp         R2, #+3                           ; R2 = NumBytes
        bls         ARM_MEMCPY_HandleTrailingBytes    ; Less than one complete word -> single byte transfer

        ands        R12, R0, #+3                      ; R12 = destination mis-alignment (0..3)
        beq         ARM_MEMCPY_DestIsDWordAligned     ; Destination already word aligned ?

;-------------------------------------------------------------------------------
; Handle as much bytes as necessary to align destination address
;
; Bytes to copy = 4 - R12. After "cmp R12, #2":
;   R12 = 1 : LS and CC true  -> 3 bytes
;   R12 = 2 : LS true only    -> 2 bytes
;   R12 = 3 : neither         -> 1 byte
;
        ldrb        R3, [R1], #+1                     ; At least one byte is always needed
        cmp         R12, #+2                          ; Set condition codes according to the mis-alignment
        add         R2, R2, R12                       ; NumBytes -= (4 - R12), first half
        ldrbls      R12, [R1], #+1                    ; LS -> second byte
        strb        R3, [R0], #+1
        ldrbcc      R3, [R1], #+1                     ; CC -> third byte
        strbls      R12, [R0], #+1
        sub         R2, R2, #+4                       ; NumBytes -= (4 - R12), second half
        strbcc      R3, [R0], #+1                     ; Destination address is word aligned now

;-------------------------------------------------------------------------------
; Choose best way to transfer data
;
ARM_MEMCPY_DestIsDWordAligned:
        ands        R3, R1, #+3                       ; R3 = source mis-alignment (0..3)
        beq         ARM_MEMCPY_HandleBulkWordData     ; Source and destination aligned -> bulk word transfer

        subs        R2, R2, #+4
        bcc         ARM_MEMCPY_HandleTrailingBytes    ; Less than one complete word left -> single byte transfer

        ldr         R12, [R1, -R3]!                   ; Read first mis-aligned data word and word align source address
        cmp         R3, #+2
        beq         ARM_MEMCPY_Loop16BitShift         ; Source offset 2
        bhi         ARM_MEMCPY_Loop24BitShift         ; Source offset 3 (offset 1 falls through)

;-------------------------------------------------------------------------------
; Handle data in units of word
;
; This is done by reading mis-aligned words from source address and
; shift them into the right alignment. After this the next data word
; will be read to complete the missing data part.
; R2 already had 4 subtracted before entering a loop, so "bcs" after the
; "subs" means at least one more complete word is still available.
;
ARM_MEMCPY_Loop8BitShift:
        mov         R3, R12, LSR #+8                  ; Shift data word into right position
        ldr         R12, [R1, #+4]!                   ; Load next mis-aligned data word
        subs        R2, R2, #+4                       ; Decrement NumBytes
        orr         R3, R3, R12, LSL #+24             ; Combine missing part of data to build full data word
        str         R3, [R0], #+4                     ; Store complete word
        bcs         ARM_MEMCPY_Loop8BitShift

        add         R1, R1, #+1                       ; Adjust source address back to the true byte position
        b           ARM_MEMCPY_HandleTrailingBytes    ; Handle trailing bytes

ARM_MEMCPY_Loop16BitShift:
        mov         R3, R12, LSR #+16                 ; Shift data word into right position
        ldr         R12, [R1, #+4]!                   ; Load next mis-aligned data word
        subs        R2, R2, #+4                       ; Decrement NumBytes
        orr         R3, R3, R12, LSL #+16             ; Combine missing part of data to build full data word
        str         R3, [R0], #+4                     ; Store complete word
        bcs         ARM_MEMCPY_Loop16BitShift

        add         R1, R1, #+2                       ; Adjust source address back to the true byte position
        b           ARM_MEMCPY_HandleTrailingBytes    ; Handle trailing bytes

ARM_MEMCPY_Loop24BitShift:
        mov         R3, R12, LSR #+24                 ; Shift data word into right position
        ldr         R12, [R1, #+4]!                   ; Load next mis-aligned data word
        subs        R2, R2, #+4                       ; Decrement NumBytes
        orr         R3, R3, R12, LSL #+8              ; Combine missing part of data to build full data word
        str         R3, [R0], #+4                     ; Store complete word
        bcs         ARM_MEMCPY_Loop24BitShift

        add         R1, R1, #+3                       ; Adjust source address back to the true byte position
        b           ARM_MEMCPY_HandleTrailingBytes    ; Handle trailing bytes

;-------------------------------------------------------------------------------
; Handle large bulk data in blocks of 8 words (32 bytes)
;
ARM_MEMCPY_HandleBulkWordData:
        subs        R2, R2, #+0x20
        stmdb       SP!, {R4, LR}                     ; R4 and LR are used as transfer registers below
        bcc         ARM_MEMCPY_HandleTrailingWords

ARM_MEMCPY_LoopHandleBulkWord:
        ldm         R1!, {R3, R4, R12, LR}            ; Transfer 16 bytes at once
        stm         R0!, {R3, R4, R12, LR}
        ldm         R1!, {R3, R4, R12, LR}            ; Transfer 16 bytes at once
        stm         R0!, {R3, R4, R12, LR}
        subs        R2, R2, #+0x20
        bcs         ARM_MEMCPY_LoopHandleBulkWord

;-------------------------------------------------------------------------------
; Handle trailing 7 words
;
; R2 is (remaining - 32) here; subtracting 32 leaves bits 4..0 unchanged,
; so those bits still describe the bytes that are left.
;
ARM_MEMCPY_HandleTrailingWords:
        movs        R12, R2, LSL #28                  ; C = bit 4, N = bit 3 of NumBytes
        ldmcs       R1!, {R3, R4, R12, LR}            ; C flag contain bit 4 of NumBytes (transfer 16 bytes if it is set)
        stmcs       R0!, {R3, R4, R12, LR}
        ldmmi       R1!, {R3, R4}                     ; N flag contain bit 3 of NumBytes (transfer 8 bytes if it is set)
        stmmi       R0!, {R3, R4}

        movs        R12, R2, LSL #+30                 ; C = bit 2, Z = (bits 1..0 are both clear)
        ldmia       SP!, {R4, LR}                     ; Restore saved registers (flags are not affected)
        ldrcs       R3, [R1], #+4                     ; C flag contain bit 2 of NumBytes (transfer 4 bytes if it is set)
        strcs       R3, [R0], #+4
        bxeq        LR                                ; No trailing bytes left -> done

;-------------------------------------------------------------------------------
; Handle trailing 3 bytes
;
; N Z C V Q ***** I F T M4 3 2 1 0
; N = bit[31]
; C = last shift bit : shift
; C = 1 ADD/CMN has carry bit
; C = 0 SUB/CMP no borrow bit
; xxxxxxxxxxxxxxxxxxxx10 << 31 : N=0, C=1
; xxxxxxxxxxxxxxxxxxxx01 << 31 : N=1, C=0
; BMI : N=1
; BCS : C=1
ARM_MEMCPY_HandleTrailingBytes:
        movs        R2, R2, LSL #+31                  ; N = bit 0, C = bit 1 of NumBytes; R2 is free afterwards
        ldrbmi      R2, [R1], #+1                     ; N flag contain bit 0 of NumBytes (transfer 1 byte if it is set)
        ldrbcs      R3, [R1], #+1                     ; C flag contain bit 1 of NumBytes (transfer 2 bytes if it is set)
        ldrbcs      R12, [R1], #+1
        strbmi      R2, [R0], #+1
        strbcs      R3, [R0], #+1
        strbcs      R12, [R0], #+1
        bx          LR

;-------------------------------------------------------------------------------
; void ARM_MEMSET(void* pDest, U32 c, U32 NumBytes)
;
; Function description
;   Fill NumBytes bytes starting at pDest with the byte value c.
;   c is replicated into all four bytes of R1 by OR-ing shifted copies, so
;   the fill pattern is only a clean byte pattern when c is in 0..255.
;     1. NumBytes <= 3 : byte stores only.
;     2. Otherwise store 1..3 bytes until pDest is word aligned.
;     3. STM blocks of 32 bytes, then 16/8/4 byte remainders, then
;        trailing bytes.
;
; Register usage:
;
;   R0    pDest    (advanced while filling)
;   R1    c        (expanded to a 32-bit fill pattern)
;   R2    NumBytes (remaining count)
;
;   R3    Alignment scratch, then copy of the fill pattern
;   R4    Copy of the fill pattern (saved/restored on stack)
;   R5    Copy of the fill pattern (saved/restored on stack)
;   R6    Scratch target for the flag-setting shifts (saved/restored on stack)
;
;   R13   SP
;   R14   LR (contains return address)
;   R15   PC
;
;-------------------------------------------------------------------------------
ARM_MEMSET:
;-------------------------------------------------------------------------------
        orr         R1, R1, R1, LSL #+8               ; 0x000000cc -> 0x0000cccc
        orr         R1, R1, R1, LSL #+16              ; 0x0000cccc -> 0xcccccccc

        cmp         R2, #+3                           ; R2 = NumBytes
        bls         ARM_MEMSET_HandleTrailingBytes    ; Less than one complete word -> single byte transfer

        ands        R3, R0, #+3                       ; R3 = destination mis-alignment (0..3)
        beq         ARM_MEMSET_DestIsAligned          ; Destination already word aligned ?

;
; Handle as much bytes as necessary to align destination address
; (4 - R3 bytes: LS and CC -> 3, LS only -> 2, neither -> 1)
;
        strb        R1, [R0], #+1                     ; At least one byte is always needed
        cmp         R3, #+2                           ; Set condition codes according to the mis-alignment
        add         R2, R2, R3                        ; NumBytes -= (4 - R3), first half
        strbls      R1, [R0], #+1                     ; LS -> second byte
        sub         R2, R2, #+4                       ; NumBytes -= (4 - R3), second half
        strbcc      R1, [R0], #+1                     ; CC -> third byte

;
; Destination is aligned, use bulk word transfer
;
ARM_MEMSET_DestIsAligned:

;
; Handle large bulk data in blocks of 8 words (32 bytes)
;
ARM_MEMSET_HandleBulkWordData:
        stmdb       SP!, {R4, R5, R6}

        mov         R3, R1                            ; Four registers with the pattern -> 16 bytes per STM
        mov         R4, R1
        mov         R5, R1

        subs        R2, R2, #+0x20                    ; 32 Bytes = 8 DWords
        bcc         ARM_MEMSET_HandleTrailingWords

ARM_MEMSET_LoopHandleBulkWord:
        stm         R0!, {R1, R3, R4, R5}
        stm         R0!, {R1, R3, R4, R5}
        subs        R2, R2, #+0x20
        bcs         ARM_MEMSET_LoopHandleBulkWord

;
; Handle trailing 7 words
; (R2 is remaining - 32 here; bits 4..0 are unchanged by that subtraction)
;
ARM_MEMSET_HandleTrailingWords:
        movs        R6, R2, LSL #28                   ; C = bit 4, N = bit 3 of NumBytes
        stmcs       R0!, {R1, R3, R4, R5}             ; C flag contain bit 4 of NumBytes (transfer 16 bytes if it is set)
        stmmi       R0!, {R1, R3}                     ; N flag contain bit 3 of NumBytes (transfer 8 bytes if it is set)

        movs        R6, R2, LSL #+30                  ; C = bit 2, Z = (bits 1..0 are both clear)
        strcs       R1, [R0], #+4                     ; C flag contain bit 2 of NumBytes (transfer 4 bytes if it is set)

        ldmia       SP!, {R4, R5, R6}                 ; Restore saved registers (flags are not affected)
        bxeq        LR                                ; Z flag set -> no trailing bytes

;
; Handle trailing 3 bytes
;
ARM_MEMSET_HandleTrailingBytes:
        movs        R2, R2, LSL #+31                  ; N = bit 0, C = bit 1 of NumBytes
        strbmi      R1, [R0], #+1                     ; N flag contain bit 0 of NumBytes (transfer 1 byte if it is set)
        strbcs      R1, [R0], #+1                     ; C flag contain bit 1 of NumBytes (transfer 2 bytes if it is set)
        strbcs      R1, [R0], #+1
        bx          LR

; int ARM_MEMSET8(void* pDest, U32 c, U32 NumBytes);
;-------------------------------------------------------------------------------
ARM_MEMSET8:
;-------------------------------------------------------------------------------
; Fill NumBytes bytes at pDest with the low byte of c.
;
;   R0 = pDest (advanced while filling), R1 = c, R2 = NumBytes
;   R3, R4, R5 = copies of the fill pattern (R4/R5 saved on the stack)
;
; The destination is aligned step by step (byte, halfword, word, double word)
; while c is widened to 16 and then 32 bits, then 16 bytes are stored per
; loop iteration and the remainder is written as 8 / 4 / 1 / 1 / 1 bytes.
; c is widened by OR-ing shifted copies, so it is expected to be in 0..255.
; The count is compared with signed conditions (blt / bge).
;
        stmdb       SP!, {R4, R5}
        cmp         R2, #4
        blt         ARM_MEMSET8_loop3                 ; Fewer than 4 bytes -> byte stores only

        ; Alignment is unknown
        tst         R0, #1
        strneb      R1, [R0], #1
        subne       R2, R2, #1

        ; Now we are 16-bit aligned (need to upgrade 'c' to 16-bit)
        orr         R1, R1, R1, LSL #8
        tst         R0, #2
        strneh      R1, [R0], #2
        subne       R2, R2, #2

        ; Now we are 32-bit aligned (need to upgrade 'c' to 32-bit)
        orr         R1, R1, R1, LSL #16
        mov         R3, R1
        cmp         R2, #16
        blt         ARM_MEMSET8_loop2                 ; Fewer than 16 bytes left -> skip the block loop

        tst         R0, #4
        strne       R1, [R0], #4
        subne       R2, R2, #4

        ; Now we are 64-bit aligned
        tst         R0, #8
        stmneia     R0!, {R1, R3}
        subne       R2, R2, #8

        ; Now we are 128-bit aligned
        mov         R4, R1
        mov         R5, R1
ARM_MEMSET8_loop1:
        ; Copy 4 32-bit values per loop iteration
        subs        R2, R2, #16
        stmgeia     R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET8_loop1
        add         R2, R2, #16                       ; Undo the last (failed) subtraction

ARM_MEMSET8_loop2:
        ; Copy up to 3 remaining 32-bit values
        tst         R2, #8
        stmneia     R0!, {R1, R3}
        tst         R2, #4
        strne       R1, [R0], #4
        and         R2, R2, #3

ARM_MEMSET8_loop3:
        ; Copy up to 3 remaining bytes
        subs        R2, R2, #1
        strgeb      R1, [R0], #1
        subs        R2, R2, #1
        strgeb      R1, [R0], #1
        subs        R2, R2, #1
        strgeb      R1, [R0], #1
        ldmia       SP!, {R4, R5}
        bx          LR

; int ARM_MEMSET16(void* pDest, U32 c, U32 NumHalfWords);
;-------------------------------------------------------------------------------
ARM_MEMSET16:
;-------------------------------------------------------------------------------
; Fill NumHalfWords 16-bit units at pDest with the low halfword of c.
;
;   R0 = pDest (advanced while filling; the code relies on it being at
;        least 16-bit aligned), R1 = c, R2 = NumHalfWords
;   R3, R4, R5 = copies of the fill pattern (R4/R5 saved on the stack)
;
; c is widened to 32 bits by OR-ing a shifted copy, so it is expected to be
; in 0..0xFFFF. The count is compared with signed conditions (blt / bge).
;
        stmdb       SP!, {R4, R5}

        cmp         R2, #2
        blt         ARM_MEMSET16_HandleTrailingHalfWord    ; 1 or 0

        ; Alignment is known to be at least 16-bit
        tst         R0, #2
        strneh      R1, [R0], #2                      ; xxxx-xx10 --->
        subne       R2, R2, #1                        ; xxxx-xx00

        ; Now we are 32-bit aligned (need to upgrade 'c' to 32-bit )
        orr         R1, R1, R1, LSL #16
        mov         R4, R1

        cmp         R2, #8
        blt         ARM_MEMSET16_HandleTrailingWords       ; 7, 6, ... 0

        tst         R0, #4
        strne       R1, [R0], #4                      ; xxxx-x100 --->
        subne       R2, R2, #2                        ; xxxx-x000 --->

        ; Now we are 64-bit aligned
        tst         R0, #8
        stmneia     R0!, {R1, R4}                     ; xxxx-1000 --->
        subne       R2, R2, #4                        ; xxxx-0000 --->

ARM_MEMSET16_HandleBulkWordData:
        ; Now we are 128-bit aligned
        mov         R5, R1
        mov         R3, R1

ARM_MEMSET16_LoopHandleBulkWord:
        ; Copy 4 32-bit values (8 halfwords) per loop iteration
        subs        R2, R2, #8
        stmgeia     R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET16_LoopHandleBulkWord
        add         R2, R2, #8                        ; Undo the last (failed) subtraction

ARM_MEMSET16_HandleTrailingWords:
        ; Copy up to 3 remaining 32-bit values
        tst         R2, #4                            ; 4 halfwords = 2 words
        stmneia     R0!, {R1, R4}

        tst         R2, #2                            ; 2 halfwords = 1 word
        strne       R1, [R0], #4

        and         R2, R2, #1

ARM_MEMSET16_HandleTrailingHalfWord:
        ; Copy up to 1 remaining 16-bit value
        subs        R2, R2, #1
        strgeh      R1, [R0], #2

        ldmia       SP!, {R4, R5}
        bx          LR

; int ARM_MEMSET32(void* pDest, U32 c, U32 NumWords);
;-------------------------------------------------------------------------------
ARM_MEMSET32:
;-------------------------------------------------------------------------------
; Fill NumWords 32-bit words at pDest with c.
;
;   R0 = pDest (advanced while filling; the code relies on it being at
;        least 32-bit aligned), R1 = c, R2 = NumWords
;   R3, R4, R5 = copies of c (R4/R5 saved on the stack)
;
; The count is compared with signed conditions (blt / bge).
;
        stmdb       SP!, {R4, R5}

        cmp         R2, #4
        blt         ARM_MEMSET32_loop2                ; Fewer than 4 words -> single word stores

        ; Alignment is known to be at least 32-bit
        mov         R3, R1

        tst         R0, #4
        strne       R1, [R0], #4
        subne       R2, R2, #1

        ; Now we are 64-bit aligned
        tst         R0, #8
        stmneia     R0!, {R1, R3}
        subne       R2, R2, #2

        ; Now we are 128-bit aligned
        mov         R4, R1
        mov         R5, R1
ARM_MEMSET32_loop1:
        ; Copy 4 32-bit values per loop iteration
        subs        R2, R2, #4
        stmgeia     R0!, {R1, R3, R4, R5}
        bge         ARM_MEMSET32_loop1
        add         R2, R2, #4                        ; Undo the last (failed) subtraction

ARM_MEMSET32_loop2:
        ; Copy up to 3 remaining 32-bit values
        subs        R2, R2, #1
        strge       R1, [R0], #4
        subs        R2, R2, #1
        strge       R1, [R0], #4
        subs        R2, R2, #1
        strge       R1, [R0], #4

        ldmia       SP!, {R4, R5}
        bx          LR

;-__arm void ARM_memxor(void* pDest, U32 c, U32 NumBytes);
;   r0 = pDest, r1 = c, r2 = NumBytes
;
; XOR every one of the NumBytes bytes at pDest with the byte value c,
; in place. c is replicated into all four bytes of R1 by OR-ing shifted
; copies, so it is expected to be in 0..255.
;
;   R0          pDest (advanced while processing)
;   R1          32-bit XOR pattern
;   R2          remaining count, scratch in the trailing-byte code
;   R3, R12     scratch
;   R4-R7       scratch in the bulk path (saved/restored on stack)
;
; NOTE(review): the prototype comment spells the name "ARM_memxor" while the
; label below is "arm_memxor" - confirm which spelling callers use.
;-------------------------------------------------------------------------------
arm_memxor:
;-------------------------------------------------------------------------------
        orr         R1, R1, R1, LSL #+8               ; 0x000000cc -> 0x0000cccc
        orr         R1, R1, R1, LSL #+16              ; 0x0000cccc -> 0xcccccccc

        cmp         R2, #+3                           ; R2 = NumBytes
        bls         arm_memxor_HandleTrailingBytes    ; Less than one complete word -> single byte transfer

        ands        R3, R0, #+3                       ; R3 = destination mis-alignment (0..3)
        beq         arm_memxor_DestIsAligned          ; Destination already word aligned ?

;-
; Handle as much bytes as necessary to align destination address
; (4 - R3 bytes: LS and CC -> 3, LS only -> 2, neither -> 1).
; Loads use no writeback so that load and store hit the same byte; only the
; store advances R0.
;-
        ldrb        R12, [R0]                         ; At least one byte is always needed
        eor         R12, R12, R1
        strb        R12, [R0], #+1

        cmp         R3, #+2                           ; Set condition codes according to the mis-alignment
        add         R2, R2, R3                        ; NumBytes -= (4 - R3), first half (R3 is free afterwards)

        ldrbls      R3, [R0]                          ; LS -> second byte
        eorls       R3, R3, R1
        strbls      R3, [R0], #+1

        sub         R2, R2, #+4                       ; NumBytes -= (4 - R3), second half

        ldrbcc      R3, [R0]                          ; CC -> third byte
        eorcc       R3, R3, R1
        strbcc      R3, [R0], #+1

;-
; Destination is aligned, use bulk word transfer
;-
arm_memxor_DestIsAligned:
;-
; Handle large bulk data in blocks of 8 words (32 bytes)
;-
arm_memxor_HandleBulkWordData:
        stmdb       SP!, {R4, R5, R6, R7}

        subs        R2, R2, #+0x20                    ; 32 Bytes = 8 DWords
        bcc         arm_memxor_HandleTrailingWords

arm_memxor_LoopHandleBulkWord:
        ldm         R0, {R3, R4, R5, R6}              ; First 16 bytes of the block
        eor         R3, R3, R1
        eor         R4, R4, R1
        eor         R5, R5, R1
        eor         R6, R6, R1
        stm         R0!, {R3, R4, R5, R6}

        ldm         R0, {R3, R4, R5, R6}              ; Second 16 bytes of the block
        eor         R3, R3, R1
        eor         R4, R4, R1
        eor         R5, R5, R1
        eor         R6, R6, R1
        stm         R0!, {R3, R4, R5, R6}

        subs        R2, R2, #+0x20
        bcs         arm_memxor_LoopHandleBulkWord

;-
; Handle trailing 7 words
; (R2 is remaining - 32 here; bits 4..0 are unchanged by that subtraction)
;-
arm_memxor_HandleTrailingWords:
        movs        R7, R2, LSL #28                   ; C = bit 4, N = bit 3 of NumBytes

        ldmcs       R0, {R3, R4, R5, R6}              ; C flag contain bit 4 of NumBytes (16 bytes if it is set)
        eorcs       R3, R3, R1
        eorcs       R4, R4, R1
        eorcs       R5, R5, R1
        eorcs       R6, R6, R1
        stmcs       R0!, {R3, R4, R5, R6}

        ldmmi       R0, {R3, R4}                      ; N flag contain bit 3 of NumBytes (8 bytes if it is set)
        eormi       R3, R3, R1
        eormi       R4, R4, R1
        stmmi       R0!, {R3, R4}

        movs        R7, R2, LSL #+30                  ; C = bit 2, Z = (bits 1..0 are both clear)

        ldrcs       R3, [R0]                          ; C flag contain bit 2 of NumBytes (4 bytes if it is set)
        eorcs       R3, R3, R1
        strcs       R3, [R0], #+4

        ldmia       SP!, {R4, R5, R6, R7}             ; Restore saved registers (flags are not affected)
        bxeq        LR                                ; Z flag set -> no trailing bytes

;-
; Handle trailing 3 bytes
; Byte loads are used here: R0 is a byte address at this point, so a word
; load would read from a possibly unaligned address and past the buffer end.
;-
arm_memxor_HandleTrailingBytes:
        movs        R2, R2, LSL #+31                  ; N = bit 0, C = bit 1 of NumBytes; R2 is free afterwards

        ldrbmi      R2, [R0]                          ; N flag contain bit 0 of NumBytes (1 byte if it is set)
        eormi       R2, R2, R1
        strbmi      R2, [R0], #+1

        ldrbcs      R2, [R0]                          ; C flag contain bit 1 of NumBytes (2 bytes if it is set)
        eorcs       R2, R2, R1
        strbcs      R2, [R0], #+1

        ldrbcs      R2, [R0]
        eorcs       R2, R2, R1
        strbcs      R2, [R0], #+1

        bx          LR

;-__arm int arm_memxor8(void* pDest, U32 c, U32 NumBytes);
;   r0 = pDest, r1 = c, r2 = NumBytes
;
; XOR every one of the NumBytes bytes at pDest with the byte value c,
; in place. c is replicated into all four bytes of R1 up front (expected to
; be in 0..255); byte and halfword stores then use only the low part.
;
;   R0          pDest (advanced while processing)
;   R1          32-bit XOR pattern
;   R2          remaining count (compared with signed conditions)
;   R3          scratch
;   R4, R5, R6  scratch (saved/restored on stack)
;-------------------------------------------------------------------------------
arm_memxor8:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        orr         R1, R1, R1, LSL #+8               ; 0x000000cc -> 0x0000cccc
        orr         R1, R1, R1, LSL #+16              ; 0x0000cccc -> 0xcccccccc

        cmp         R2, #4
        blt         arm_memxor8_loop3                 ; Fewer than 4 bytes -> byte accesses only

        ; Alignment is unknown
        tst         R0, #1
        ldrneb      R6, [R0]
        eorne       R6, R6, R1
        strneb      R6, [R0], #1
        subne       R2, R2, #1

        ; Now we are 16-bit aligned
        tst         R0, #2
        ldrneh      R6, [R0]
        eorne       R6, R6, R1
        strneh      R6, [R0], #2
        subne       R2, R2, #2

        ; Now we are 32-bit aligned
        cmp         R2, #16
        blt         arm_memxor8_loop2                 ; Fewer than 16 bytes left -> skip the block loop

        tst         R0, #4
        ldrne       R6, [R0]
        eorne       R6, R6, R1
        strne       R6, [R0], #4
        subne       R2, R2, #4

        ; Now we are 64-bit aligned
        tst         R0, #8
        ldmneia     R0, {R3, R6}
        eorne       R3, R3, R1
        eorne       R6, R6, R1
        stmneia     R0!, {R3, R6}
        subne       R2, R2, #8

        ; Now we are 128-bit aligned
arm_memxor8_loop1:
        ; XOR 4 32-bit values per loop iteration
        subs        R2, R2, #16
        ldmgeia     R0, {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmgeia     R0!, {R3, R4, R5, R6}
        bge         arm_memxor8_loop1
        add         R2, R2, #16                       ; Undo the last (failed) subtraction

arm_memxor8_loop2:
        ; XOR up to 3 remaining 32-bit values
        tst         R2, #8
        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}

        tst         R2, #4
        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4

        and         R2, R2, #3

arm_memxor8_loop3:
        ; XOR up to 3 remaining bytes. Each store writes R3 (the XOR result),
        ; not R1 (the pattern).
        subs        R2, R2, #1
        ldrgeb      R3, [R0]
        eorge       R3, R3, R1
        strgeb      R3, [R0], #1

        subs        R2, R2, #1
        ldrgeb      R3, [R0]
        eorge       R3, R3, R1
        strgeb      R3, [R0], #1

        subs        R2, R2, #1
        ldrgeb      R3, [R0]
        eorge       R3, R3, R1
        strgeb      R3, [R0], #1

        ldmia       SP!, {R4, R5, R6}
        bx          LR

;-__arm int arm_memxor16(void* pDest, U32 c, U32 NumHalfWords);
;   r0 = pDest, r1 = c, r2 = NumHalfWords
;
; XOR every one of the NumHalfWords 16-bit units at pDest with the halfword
; value c, in place. c is replicated into both halves of R1 up front
; (expected to be in 0..0xFFFF).
;
;   R0          pDest (advanced while processing; the code relies on it
;               being at least 16-bit aligned)
;   R1          32-bit XOR pattern
;   R2          remaining count (compared with signed conditions)
;   R3          scratch
;   R4, R5, R6  scratch (saved/restored on stack)
;-------------------------------------------------------------------------------
arm_memxor16:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}
        orr         R1, R1, R1, LSL #+16              ; 0x0000cccc -> 0xcccccccc

        cmp         R2, #2
        blt         arm_memxor16_HandleTrailingHalfWord    ; 1 or 0

        ; Alignment is known to be at least 16-bit
        tst         R0, #2
        ldrneh      R6, [R0]
        eorne       R6, R6, R1
        strneh      R6, [R0], #2                      ; xxxx-xx10 --->
        subne       R2, R2, #1                        ; xxxx-xx00

        ; Now we are 32-bit aligned
        cmp         R2, #8
        blt         arm_memxor16_HandleTrailingWords       ; 7, 6, ... 0

        tst         R0, #4
        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4                      ; xxxx-x100 --->
        subne       R2, R2, #2                        ; xxxx-x000 --->

        ; Now we are 64-bit aligned
        tst         R0, #8
        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}                     ; xxxx-1000 --->
        subne       R2, R2, #4                        ; xxxx-0000 --->

arm_memxor16_HandleBulkWordData:
        ; Now we are 128-bit aligned

arm_memxor16_LoopHandleBulkWord:
        ; XOR 4 32-bit values (8 halfwords) per loop iteration
        subs        R2, R2, #8
        ldmgeia     R0, {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmgeia     R0!, {R3, R4, R5, R6}
        bge         arm_memxor16_LoopHandleBulkWord
        add         R2, R2, #8                        ; Undo the last (failed) subtraction

arm_memxor16_HandleTrailingWords:
        ; XOR up to 3 remaining 32-bit values
        tst         R2, #4                            ; 4 halfwords = 2 words
        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}

        tst         R2, #2                            ; 2 halfwords = 1 word
        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4

        and         R2, R2, #1

arm_memxor16_HandleTrailingHalfWord:
        ; XOR up to 1 remaining 16-bit value
        subs        R2, R2, #1
        ldrgeh      R3, [R0]
        eorge       R3, R3, R1
        strgeh      R3, [R0], #2

        ldmia       SP!, {R4, R5, R6}
        bx          LR

;-__arm int arm_memxor32(void* pDest, U32 c, U32 NumWords);
;   r0 = pDest, r1 = c, r2 = NumWords
;
; XOR every one of the NumWords 32-bit words at pDest with c, in place.
;
;   R0          pDest (advanced while processing; the code relies on it
;               being at least 32-bit aligned)
;   R1          XOR pattern
;   R2          remaining count (compared with signed conditions)
;   R3          scratch
;   R4, R5, R6  scratch (saved/restored on stack)
;-------------------------------------------------------------------------------
arm_memxor32:
;-------------------------------------------------------------------------------
        stmdb       SP!, {R4, R5, R6}

        cmp         R2, #4
        blt         arm_memxor32_loop2                ; Fewer than 4 words -> single word accesses

        ; Alignment is known to be at least 32-bit, is it 64-bit aligned ?
        tst         R0, #4
        ; No, it is 32-bit aligned
        ldrne       R3, [R0]
        eorne       R3, R3, R1
        strne       R3, [R0], #4
        subne       R2, R2, #1

        ; Now we are 64-bit aligned, is it 128-bit aligned ?
        tst         R0, #8
        ; No, it is 64-bit aligned
        ldmneia     R0, {R3, R4}
        eorne       R3, R3, R1
        eorne       R4, R4, R1
        stmneia     R0!, {R3, R4}                     ; xxxx-1000 --->
        subne       R2, R2, #2

        ; Now we are 128-bit aligned
arm_memxor32_loop1:
        ; XOR 4 32-bit values per loop iteration
        subs        R2, R2, #4
        ldmgeia     R0, {R3, R4, R5, R6}
        eorge       R3, R3, R1
        eorge       R4, R4, R1
        eorge       R5, R5, R1
        eorge       R6, R6, R1
        stmgeia     R0!, {R3, R4, R5, R6}
        bge         arm_memxor32_loop1
        add         R2, R2, #4                        ; Undo the last (failed) subtraction

arm_memxor32_loop2:
        ; XOR up to 3 remaining 32-bit values
        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        subs        R2, R2, #1
        ldrge       R3, [R0]
        eorge       R3, R3, R1
        strge       R3, [R0], #4

        ldmia       SP!, {R4, R5, R6}
        bx          LR

        END
; 05-11 15:02
; (web page residue, translated: "view more")