问题描述
为了学习有关 ARM 汇编的知识,我编写了一个简单的测试项目来使用内联汇编和 NEON 指令执行图像缩小.你可以在这里看到:
In an attempt to learn something about ARM assembly, I have written a simple test project that performs image downscaling using inline assembly and NEON instructions. You can see it here:
https://github.com/rmaz/NEON-Image-Downscaling
经过一些努力,我设法让它工作,快乐的日子.除了它仅适用于小于 -O2 的优化级别.我已经查看了生成的 ASM,但我看不出任何明显的原因为什么会发生这种情况.任何人都可以提供任何见解吗?这是负责内联汇编部分的函数:
After some effort I managed to get it working, happy days. Except that it only works at optimization levels below -O2. I have taken a look at the generated ASM, but I cannot see any obvious reason why this should occur. Can anyone offer any insight? Here is the function responsible for the inline assembly part:
static void inline resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow)
{
const uint32_t * rowB = src + pixelsPerRow;
// force the number of pixels per row to a mutliple of 8
pixelsPerRow = 8 * (pixelsPerRow / 8);
__asm__ volatile("Lresizeloop: \n" // start loop
"vld1.32 {d0-d3}, [%1]! \n" // load 8 pixels from the top row
"vld1.32 {d4-d7}, [%2]! \n" // load 8 pixels from the bottom row
"vhadd.u8 q0, q0, q2 \n" // average the pixels vertically
"vhadd.u8 q1, q1, q3 \n"
"vtrn.32 q0, q2 \n" // transpose to put the horizontally adjacent pixels in different registers
"vtrn.32 q1, q3 \n"
"vhadd.u8 q0, q0, q2 \n" // average the pixels horizontally
"vhadd.u8 q1, q1, q3 \n"
"vtrn.32 d0, d1 \n" // fill the registers with pixels
"vtrn.32 d2, d3 \n"
"vswp d1, d2 \n"
"vst1.64 {d0-d1}, [%0]! \n" // store the result
"subs %3, %3, #8 \n" // subtract 8 from the pixel count
"bne Lresizeloop \n" // repeat until the row is complete
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3"
);
}
在 -O1 下为外层函数和循环生成的,能够正常工作的输出如下:
the functioning generated output at O1 for the surrounding function and loop is as follows:
@ Compiler-generated listing (presented in the text as the -O1 build) for the
@ Objective-C method -[BDPViewController downscaleImageNeon:], emitted as Thumb
@ code (.code 16 / .thumb_func). Variable-to-register mappings mentioned in the
@ comments below are taken from the @DEBUG_VALUE notes in the listing itself.
.align 2
.code 16 @ @"\01-[BDPViewController downscaleImageNeon:]"
.thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
.cfi_startproc
Lfunc_begin4:
.loc 1 86 0 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
@ BB#0:
.loc 1 86 1 prologue_end @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
@ Prologue: save r4-r7 and lr, set r7 = sp + 12 (the slot holding the saved
@ r7), save r8/r10/r11, then reserve 20 bytes for spill slots.
push {r4, r5, r6, r7, lr}
add r7, sp, #12
push.w {r8, r10, r11}
sub sp, #20
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
.loc 1 88 20 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
mov r6, r2
Ltmp43:
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
add r0, pc
@ r11 = selector loaded once and reused for every objc_msgSend on `image` below
ldr.w r11, [r0]
mov r0, r6
blx _objc_retain
mov r4, r0
mov r0, r6
mov r1, r11
Ltmp44:
blx _objc_msgSend
blx _CGImageGetWidth
mov r5, r0
Ltmp45:
@DEBUG_VALUE: width <- R5+0
.loc 1 89 21 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
mov r0, r6
mov r1, r11
str r5, [sp, #16] @ 4-byte Spill
blx _objc_msgSend
blx _CGImageGetHeight
mov r10, r0
Ltmp46:
@DEBUG_VALUE: height <- R10+0
.loc 1 90 26 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetBytesPerRow
str r0, [sp, #12] @ 4-byte Spill
Ltmp47:
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
.loc 1 91 35 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetAlphaInfo
str r0, [sp, #4] @ 4-byte Spill
Ltmp48:
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
.loc 1 94 45 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
mov r0, r6
mov r1, r11
blx _objc_msgSend
mov r6, r0
Ltmp49:
mov r0, r4
blx _objc_release
mov r0, r6
.loc 1 98 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
@ r8 = height * width (r10 * r5); shifted right by 2 below to size the output buffer
mul r8, r10, r5
Ltmp50:
@DEBUG_VALUE: width <- [sp+#16]+#0
.loc 1 94 45 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
blx _CGImageGetDataProvider
blx _CGDataProviderCopyData
Ltmp51:
@DEBUG_VALUE: data <- R0+0
str r0, [sp, #8] @ 4-byte Spill
Ltmp52:
@DEBUG_VALUE: data <- [sp+#8]+#0
.loc 1 95 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
blx _CFDataGetBytePtr
mov r4, r0
Ltmp53:
@DEBUG_VALUE: buffer <- R4+0
.loc 1 98 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
@ calloc((width * height) >> 2, 4): one 4-byte pixel per 2x2 source block
lsr.w r0, r8, #2
movs r1, #4
blx _calloc
mov r5, r0
Ltmp54:
@DEBUG_VALUE: outputBuffer <- R5+0
mov r0, r10
Ltmp55:
@DEBUG_VALUE: height <- R0+0
.loc 1 101 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
@ Skip the row loop entirely when height == 0
cmp r0, #0
Ltmp56:
@DEBUG_VALUE: rowIndex <- 0+0
beq LBB4_3
@ BB#1: @ %.lr.ph
Ltmp57:
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: width <- [sp+#16]+#0
@DEBUG_VALUE: height <- R0+0
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
@DEBUG_VALUE: data <- [sp+#8]+#0
@DEBUG_VALUE: buffer <- R4+0
@DEBUG_VALUE: outputBuffer <- R5+0
@DEBUG_VALUE: rowIndex <- 0+0
@ Loop setup: r8 = rowIndex = 0, r11 = bytesPerRow * 2 (source step for two
@ rows), r10 = bytesPerRow / 2 (byte stride of one output row).
ldr r1, [sp, #12] @ 4-byte Reload
Ltmp58:
@DEBUG_VALUE: bytesPerRow <- R1+0
mov.w r8, #0
lsl.w r11, r1, #1
.loc 1 104 74 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
lsr.w r10, r1, #1
Ltmp60:
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2: @ =>This Inner Loop Header: Depth=1
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: width <- [sp+#16]+#0
@DEBUG_VALUE: height <- R0+0
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
@DEBUG_VALUE: data <- [sp+#8]+#0
@DEBUG_VALUE: outputBuffer <- R5+0
@DEBUG_VALUE: rowIndex <- 0+0
@ Build the resizeRow arguments for this pair of source rows:
@   r0 = outputBuffer + (rowIndex / 2) * (bytesPerRow / 2)   (dst)
@   r1 = r4, the current position in the source buffer       (src)
@   r2 = width                                               (pixelsPerRow)
@ height is parked in r6 because r0 is needed for the first argument.
lsr.w r1, r8, #1
Ltmp61:
mov r6, r0
Ltmp62:
@DEBUG_VALUE: height <- R6+0
mla r0, r1, r10, r5
Ltmp63:
@DEBUG_VALUE: destRow <- R1+0
.loc 1 105 9 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
ldr r2, [sp, #16] @ 4-byte Reload
mov r1, r4
Ltmp64:
@ resizeRow is called out of line in this listing (it has not been inlined)
bl _resizeRow
mov r0, r6
Ltmp65:
@DEBUG_VALUE: height <- R0+0
.loc 1 101 50 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
@ rowIndex += 2 and advance the source pointer by two rows
add.w r8, r8, #2
Ltmp66:
@DEBUG_VALUE: rowIndex <- R8+0
.loc 1 101 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r4, r11
@ Loop while rowIndex < height (unsigned). The compare and the branch are
@ adjacent and both come after the call, so nothing changes the flags in between.
cmp r8, r0
blo LBB4_2
Ltmp67:
LBB4_3: @ %._crit_edge
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: width <- [sp+#16]+#0
@DEBUG_VALUE: height <- R0+0
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
@DEBUG_VALUE: data <- [sp+#8]+#0
@DEBUG_VALUE: outputBuffer <- R5+0
.loc 1 109 28 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
@ Arguments for createBitmapContext: r0 = outputBuffer, r1 = width / 2,
@ r2 = height / 2, r3 = bytesPerRow / 2, and imageAlpha passed at [sp].
ldr r1, [sp, #4] @ 4-byte Reload
Ltmp68:
lsrs r2, r0, #1
str r1, [sp]
mov r6, r5
Ltmp69:
@DEBUG_VALUE: outputBuffer <- R6+0
ldr r1, [sp, #16] @ 4-byte Reload
ldr r0, [sp, #12] @ 4-byte Reload
Ltmp70:
lsrs r1, r1, #1
lsrs r3, r0, #1
mov r0, r5
bl _createBitmapContext
mov r4, r0
Ltmp71:
@DEBUG_VALUE: context <- R4+0
.loc 1 110 30 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
blx _CGBitmapContextCreateImage
.loc 1 111 66 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
.loc 1 110 30 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
mov r5, r0
Ltmp72:
@DEBUG_VALUE: scaledImage <- R5+0
.loc 1 111 66 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
add r1, pc
LPC4_2:
add r0, pc
mov r2, r5
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
Ltmp73:
@DEBUG_VALUE: returnImage <- R0+0
@ InlineAsm Start
mov r7, r7 @ marker for objc_retainAutoreleaseReturnValue
@ InlineAsm End
blx _objc_retainAutoreleasedReturnValue
Ltmp74:
@ r8 = returnImage, kept across the cleanup calls below
mov r8, r0
.loc 1 112 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
mov r0, r5
blx _CGImageRelease
.loc 1 113 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
mov r0, r4
blx _CGContextRelease
.loc 1 114 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
ldr r0, [sp, #8] @ 4-byte Reload
blx _CFRelease
.loc 1 115 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
mov r0, r6
blx _free
Ltmp75:
.loc 1 118 1 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
@ Epilogue: restore the saved registers, then tail-call
@ objc_autoreleaseReturnValue with returnImage in r0.
mov r0, r8
add sp, #20
pop.w {r8, r10, r11}
pop.w {r4, r5, r6, r7, lr}
Ltmp76:
b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
.cfi_endproc
@ Out-of-line copy of the C function resizeRow as emitted by the compiler
@ (Thumb code). Per the @DEBUG_VALUE notes, on entry:
@   r0 = dst, r1 = src, r2 = pixelsPerRow
@ The NEON loop between "InlineAsm Start" and "InlineAsm End" is the body of
@ the source's __asm__ statement with r0/r1/r3/r2 substituted for %0/%1/%2/%3.
.align 2
.code 16 @ @resizeRow
.thumb_func _resizeRow
_resizeRow:
.cfi_startproc
Lfunc_begin5:
.loc 1 26 0 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
@ BB#0:
@DEBUG_VALUE: resizeRow:dst <- R0+0
@DEBUG_VALUE: resizeRow:src <- R1+0
@DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
.loc 1 27 47 prologue_end @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
@ r3 = src + pixelsPerRow * 4 bytes, i.e. rowB = src + pixelsPerRow
add.w r3, r1, r2, lsl #2
Ltmp78:
@DEBUG_VALUE: rowB <- R3+0
.loc 1 30 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
@ Clear the low three bits: round the pixel count down to a multiple of 8
bic r2, r2, #7
Ltmp79:
.loc 1 32 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
@ InlineAsm Start
Lresizeloop:
@ Load 8 pixels from the top row (r1) and 8 from the bottom row (r3);
@ both pointers are post-incremented by the "!" writeback.
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r3]!
@ Average the two rows vertically
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
@ Transpose so horizontally adjacent pixels land in different registers,
@ then average them horizontally
vtrn.32 q0, q2
vtrn.32 q1, q3
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
@ Gather the averaged pixels into d0-d1 and store them to dst (r0)
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
vst1.64 {d0-d1}, [r0]!
@ Count down 8 pixels per iteration. subs updates the condition flags, and
@ the loop exits only when the count reaches exactly zero.
subs r2, r2, #8
bne Lresizeloop
@ InlineAsm End
Ltmp80:
.loc 1 51 1 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
bx lr
Ltmp81:
Lfunc_end5:
.cfi_endproc
在 -O2 下生成的,不能正常工作的输出如下:
and the non functioning output at O2 is as follows:
@ Compiler-generated listing (presented in the text as the failing -O2 build)
@ for the Objective-C method -[BDPViewController downscaleImageNeon:], emitted
@ as Thumb code (.code 16 / .thumb_func). Variable-to-register mappings in the
@ comments below are taken from the @DEBUG_VALUE notes in the listing itself.
@ NOTE(review): this listing still reaches resizeRow through `bl _resizeRow`,
@ exactly like the listing labelled -O1, so it does not show the inlined loop
@ that the answer identifies as the failing case - confirm which build it came from.
.align 2
.code 16 @ @"\01-[BDPViewController downscaleImageNeon:]"
.thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
.cfi_startproc
Lfunc_begin4:
.loc 1 86 0 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
@ BB#0:
.loc 1 86 1 prologue_end @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
@ Prologue: save r4-r7 and lr, set r7 = sp + 12 (the slot holding the saved
@ r7), save r8/r10/r11, then reserve 20 bytes for spill slots.
push {r4, r5, r6, r7, lr}
add r7, sp, #12
push.w {r8, r10, r11}
sub sp, #20
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
.loc 1 88 20 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
mov r6, r2
Ltmp43:
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
add r0, pc
@ r11 = selector loaded once and reused for every objc_msgSend on `image` below
ldr.w r11, [r0]
mov r0, r6
blx _objc_retain
mov r4, r0
mov r0, r6
mov r1, r11
Ltmp44:
blx _objc_msgSend
blx _CGImageGetWidth
mov r5, r0
Ltmp45:
@DEBUG_VALUE: width <- R5+0
.loc 1 89 21 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
mov r0, r6
mov r1, r11
str r5, [sp, #16] @ 4-byte Spill
blx _objc_msgSend
blx _CGImageGetHeight
mov r10, r0
Ltmp46:
@DEBUG_VALUE: height <- R10+0
.loc 1 90 26 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetBytesPerRow
str r0, [sp, #12] @ 4-byte Spill
Ltmp47:
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
.loc 1 91 35 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetAlphaInfo
str r0, [sp, #4] @ 4-byte Spill
Ltmp48:
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
.loc 1 94 45 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
mov r0, r6
mov r1, r11
blx _objc_msgSend
mov r6, r0
Ltmp49:
mov r0, r4
blx _objc_release
mov r0, r6
.loc 1 98 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
@ r8 = height * width (r10 * r5); shifted right by 2 below to size the output buffer
mul r8, r10, r5
Ltmp50:
@DEBUG_VALUE: width <- [sp+#16]+#0
.loc 1 94 45 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
blx _CGImageGetDataProvider
blx _CGDataProviderCopyData
Ltmp51:
@DEBUG_VALUE: data <- R0+0
str r0, [sp, #8] @ 4-byte Spill
Ltmp52:
@DEBUG_VALUE: data <- [sp+#8]+#0
.loc 1 95 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
blx _CFDataGetBytePtr
mov r4, r0
Ltmp53:
@DEBUG_VALUE: buffer <- R4+0
.loc 1 98 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
@ calloc((width * height) >> 2, 4): one 4-byte pixel per 2x2 source block
lsr.w r0, r8, #2
movs r1, #4
blx _calloc
mov r5, r0
Ltmp54:
@DEBUG_VALUE: outputBuffer <- R5+0
mov r0, r10
Ltmp55:
@DEBUG_VALUE: height <- R0+0
.loc 1 101 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
@ Skip the row loop entirely when height == 0
cmp r0, #0
Ltmp56:
@DEBUG_VALUE: rowIndex <- 0+0
beq LBB4_3
@ BB#1: @ %.lr.ph
Ltmp57:
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: width <- [sp+#16]+#0
@DEBUG_VALUE: height <- R0+0
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
@DEBUG_VALUE: data <- [sp+#8]+#0
@DEBUG_VALUE: buffer <- R4+0
@DEBUG_VALUE: outputBuffer <- R5+0
@DEBUG_VALUE: rowIndex <- 0+0
@ Loop setup: r8 = rowIndex = 0, r11 = bytesPerRow * 2 (source step for two
@ rows), r10 = bytesPerRow / 2 (byte stride of one output row).
ldr r1, [sp, #12] @ 4-byte Reload
Ltmp58:
@DEBUG_VALUE: bytesPerRow <- R1+0
mov.w r8, #0
lsl.w r11, r1, #1
.loc 1 104 74 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
lsr.w r10, r1, #1
Ltmp60:
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2: @ =>This Inner Loop Header: Depth=1
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: width <- [sp+#16]+#0
@DEBUG_VALUE: height <- R0+0
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
@DEBUG_VALUE: data <- [sp+#8]+#0
@DEBUG_VALUE: outputBuffer <- R5+0
@DEBUG_VALUE: rowIndex <- 0+0
@ Build the resizeRow arguments for this pair of source rows:
@   r0 = outputBuffer + (rowIndex / 2) * (bytesPerRow / 2)   (dst)
@   r1 = r4, the current position in the source buffer       (src)
@   r2 = width                                               (pixelsPerRow)
@ height is parked in r6 because r0 is needed for the first argument.
lsr.w r1, r8, #1
Ltmp61:
mov r6, r0
Ltmp62:
@DEBUG_VALUE: height <- R6+0
mla r0, r1, r10, r5
Ltmp63:
@DEBUG_VALUE: destRow <- R1+0
.loc 1 105 9 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
ldr r2, [sp, #16] @ 4-byte Reload
mov r1, r4
Ltmp64:
@ resizeRow is called out of line in this listing (it has not been inlined)
bl _resizeRow
mov r0, r6
Ltmp65:
@DEBUG_VALUE: height <- R0+0
.loc 1 101 50 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
@ rowIndex += 2 and advance the source pointer by two rows
add.w r8, r8, #2
Ltmp66:
@DEBUG_VALUE: rowIndex <- R8+0
.loc 1 101 29 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r4, r11
@ Loop while rowIndex < height (unsigned). The compare and the branch are
@ adjacent and both come after the call, so nothing changes the flags in between.
cmp r8, r0
blo LBB4_2
Ltmp67:
LBB4_3: @ %._crit_edge
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
@DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
@DEBUG_VALUE: width <- [sp+#16]+#0
@DEBUG_VALUE: height <- R0+0
@DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
@DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
@DEBUG_VALUE: data <- [sp+#8]+#0
@DEBUG_VALUE: outputBuffer <- R5+0
.loc 1 109 28 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
@ Arguments for createBitmapContext: r0 = outputBuffer, r1 = width / 2,
@ r2 = height / 2, r3 = bytesPerRow / 2, and imageAlpha passed at [sp].
ldr r1, [sp, #4] @ 4-byte Reload
Ltmp68:
lsrs r2, r0, #1
str r1, [sp]
mov r6, r5
Ltmp69:
@DEBUG_VALUE: outputBuffer <- R6+0
ldr r1, [sp, #16] @ 4-byte Reload
ldr r0, [sp, #12] @ 4-byte Reload
Ltmp70:
lsrs r1, r1, #1
lsrs r3, r0, #1
mov r0, r5
bl _createBitmapContext
mov r4, r0
Ltmp71:
@DEBUG_VALUE: context <- R4+0
.loc 1 110 30 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
blx _CGBitmapContextCreateImage
.loc 1 111 66 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
.loc 1 110 30 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
mov r5, r0
Ltmp72:
@DEBUG_VALUE: scaledImage <- R5+0
.loc 1 111 66 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
add r1, pc
LPC4_2:
add r0, pc
mov r2, r5
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
Ltmp73:
@DEBUG_VALUE: returnImage <- R0+0
@ InlineAsm Start
mov r7, r7 @ marker for objc_retainAutoreleaseReturnValue
@ InlineAsm End
blx _objc_retainAutoreleasedReturnValue
Ltmp74:
@ r8 = returnImage, kept across the cleanup calls below
mov r8, r0
.loc 1 112 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
mov r0, r5
blx _CGImageRelease
.loc 1 113 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
mov r0, r4
blx _CGContextRelease
.loc 1 114 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
ldr r0, [sp, #8] @ 4-byte Reload
blx _CFRelease
.loc 1 115 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
mov r0, r6
blx _free
Ltmp75:
.loc 1 118 1 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
@ Epilogue: restore the saved registers, then tail-call
@ objc_autoreleaseReturnValue with returnImage in r0.
mov r0, r8
add sp, #20
pop.w {r8, r10, r11}
pop.w {r4, r5, r6, r7, lr}
Ltmp76:
b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
.cfi_endproc
@ Out-of-line copy of the C function resizeRow as emitted by the compiler
@ (Thumb code), from the listing presented as the -O2 build. Per the
@ @DEBUG_VALUE notes, on entry:
@   r0 = dst, r1 = src, r2 = pixelsPerRow
@ The NEON loop between "InlineAsm Start" and "InlineAsm End" is the body of
@ the source's __asm__ statement with r0/r1/r3/r2 substituted for %0/%1/%2/%3.
.align 2
.code 16 @ @resizeRow
.thumb_func _resizeRow
_resizeRow:
.cfi_startproc
Lfunc_begin5:
.loc 1 26 0 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
@ BB#0:
@DEBUG_VALUE: resizeRow:dst <- R0+0
@DEBUG_VALUE: resizeRow:src <- R1+0
@DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
.loc 1 27 47 prologue_end @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
@ r3 = src + pixelsPerRow * 4 bytes, i.e. rowB = src + pixelsPerRow
add.w r3, r1, r2, lsl #2
Ltmp78:
@DEBUG_VALUE: rowB <- R3+0
.loc 1 30 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
@ Clear the low three bits: round the pixel count down to a multiple of 8
bic r2, r2, #7
Ltmp79:
.loc 1 32 5 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
@ InlineAsm Start
Lresizeloop:
@ Load 8 pixels from the top row (r1) and 8 from the bottom row (r3);
@ both pointers are post-incremented by the "!" writeback.
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r3]!
@ Average the two rows vertically
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
@ Transpose so horizontally adjacent pixels land in different registers,
@ then average them horizontally
vtrn.32 q0, q2
vtrn.32 q1, q3
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
@ Gather the averaged pixels into d0-d1 and store them to dst (r0)
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
vst1.64 {d0-d1}, [r0]!
@ Count down 8 pixels per iteration. subs updates the condition flags, and
@ the loop exits only when the count reaches exactly zero.
subs r2, r2, #8
bne Lresizeloop
@ InlineAsm End
Ltmp80:
.loc 1 51 1 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
bx lr
Ltmp81:
Lfunc_end5:
.cfi_endproc
推荐答案
这是我使用 -O2 构建您的 Xcode 项目时得到的汇编代码片段.(使用 -O1 构建时编译器不会内联该函数,所以它能正常工作我并不意外.)
Here's a snippet of the assembly code I get from your Xcode project with -O2. (Building with -O1 doesn't bother to inline the function, so I'm not surprised it works fine.)
@ Excerpt from the answerer's -O2 build: the resizeRow inline asm now sits
@ inside the caller's row loop, between the loop's compare and its branch.
Ltmp55:
@DEBUG_VALUE: rowIndex <- R3+0
.loc 1 101 29 @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r8, r12
@ Outer-loop test: compares rowIndex (r3) with r11 (presumably height). The
@ flags set here are meant for the blo at the end of this excerpt.
cmp r3, r11
.loc 1 32 5 @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
Ltmp56:
@ InlineAsm Start
Lresizeloop:
vld1.32 {d0-d3}, [r4]!
vld1.32 {d4-d7}, [r5]!
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 q0, q2
vtrn.32 q1, q3
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
vst1.64 {d0-d1}, [r6]!
@ subs overwrites the condition flags produced by the cmp above. The asm
@ statement did not list "cc" as clobbered, so the compiler assumed they survive.
subs r2, r2, #8
bne Lresizeloop
@ InlineAsm End
Ltmp57:
@ This branch now tests the flags left by the final subs (count reached zero),
@ not the result of cmp r3, r11.
blo LBB2_2
看到最后一行的 blo (branch-if-lower) 指令了吗?它使用的是汇编块上方的 cmp r3, r11 设置的条件码.但是到那时,您的内联汇编代码(其中的 subs 指令)早已把条件码寄存器完全破坏了.那么这是编译器错误吗?... 不!您只是忘记告诉编译器您的内联汇编代码会破坏条件码.替换
See that blo (branch-if-lower) instruction on the final line? It uses the condition codes set by the cmp r3, r11 at the top of the assembly block. But of course your inline assembly code (its subs instruction) has totally trashed the condition code register by then. So is this a compiler bug?... Nope! You just forgot to tell the compiler that your inline assembly code trashes the condition codes. Replace
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3"
);
与
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3", "cc"
);
并且程序集输出会自行修复.我还没有运行该应用程序,但我打赌你会发现它现在更好了.:)
and the assembly output fixes itself. I haven't run the app, but I bet you'll find it's all better now. :)
这篇关于为什么 clang 优化破坏了我的内联汇编代码?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!