问题描述
我想知道如何在GPU CUDA中实现一个漂亮的软阈值函数内核?
软阈值函数如下:
其中lambda是阈值,x是输入向量或矩阵。假设它们都是真实的。
我更喜欢一个可靠运行的现有代码示例。我不愿意从头开始。任何人都这样做过吗?或者知道一些我在哪里可以找到一个很好的示例代码?感谢。
上面注释中提出的两个解决方案是为元素处理设置的,如下:
__ global__ void myKernel1(float * __restrict__ x,float lambda,const int N)
{
int tid = threadIdx .x + blockIdx.x * blockDim.x;
if(tid< N){
float xa = fabs(x [tid]);
x [tid] =(xa> lambda)? x [tid] *((xa-lambda)/ xa):0;
}
}
和
__ global__ void myKernel2(float * __restrict__ x,float lambda,const int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
float xa = fabs(x [tid]);
x [tid] = signbit(lambda-xa)* copysign(xa-lambda,x [tid]);
}
}
两个解决方案的反汇编代码报告如下。正如@njuffa所注意到的,由于缺少 x / | x |
部分,第二个似乎原则上比第一个更负担。然而,也由@njuffa注意到,这种情况将可能是内存绑定,而不是计算绑定。然而,也许这个分析表明,当实现为非元素计算的 __ device __
函数时,第二个解决方案是优选的。
第一个解决方案的已解析代码
sm_21
的代码功能:_Z9myKernel1Pffi
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / MOV R1,c [0x1] [0x100] / * 0x2800440400005de4 * /
/ * 0008 * / S2R R0,SR_CTAID.X; / * 0x2c00000094001c04 * /
/ * 0010 * / S2R R3,SR_TID.X; / * 0x2c0000008400dc04 * /
/ * 0018 * / IMAD R0,R0,c [0x0] [0x8],R3; / * 0x2006400020001ca3 * /
/ * 0020 * / ISETP.GE.AND P0,PT,R0,c [0x0] [0x2c] / * 0x1b0e4000b001dc23 * /
/ * 0028 * / @ P0退出; / * 0x80000000000001e7 * /
/ * 0030 * / MOV32I R3,0x4; / * 0x180000001000dde2 * /
/ * 0038 * / SSY 0x90; / * 0x6000000140000007 * /
/ * 0040 * / IMAD R16.CC,R0,R3,c [0x0] [0x20] / * 0x2007800080041ca3 * /
/ * 0048 * / IMAD.HI.X R17,R0,R3,c [0x0] [0x24] / * 0x2086800090045ce3 * /
/ * 0050 * / LD.E R2,[R16]; / * 0x8400000001009c85 * /
/ * 0058 * / FSETP.GT.AND P0,PT,| R2 |,c [0x0] [0x28] / * 0x220e4000a021dc80 * /
/ * 0060 * / F2F.F32.F32 R5,| R2 |; / * 0x1000000009215c44 * /
/ * 0068 * / @ P0 BRA 0x78; / * 0x40000000200001e7 * /
/ * 0070 * / MOV.S R0,RZ; / * 0x28000000fc001df4 * /
/ * 0078 * / FADD R4,| R2 |,-c [0x0] [0x28] / * 0x50004000a0211d80 * /
/ * 0080 * / JCAL 0x0; / * 0x1000000000010007 * /
/ * 0088 * / FMUL.S R0,R2,R4; / * 0x5800000010201c10 * /
/ * 0090 * / ST.E [R16],R0; / * 0x9400000001001c85 * /
/ * 0098 * / EXIT; / * 0x8000000000001de7 * /
.................................
功能:__cuda_sm20_div_rn_noftz_f32_slowpath
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / SHL R0,R4,0x1; / * 0x6000c00004401c03 * /
/ * 0008 * / MOV32I R6,0x1; / * 0x1800000004019de2 * /
/ * 0010 * / SHL R3,R5,0x1; / * 0x6000c0000450dc03 * /
/ * 0018 * / IMAD.U32.U32.HI R0,R0,0x100,-R6; / * 0x200cc00400001d43 * /
/ * 0020 * / ISETP.GT.U32.AND P0,PT,R0,0xfd,PT; / * 0x1a0ec003f401dc03 * /
/ * 0028 * / IMAD.U32.U32.HI R3,R3,0x100,-R6; / * 0x200cc0040030dd43 * /
/ * 0030 * / ISETP.GT.U32.OR P0,PT,R3,0xfd,P0; / * 0x1a20c003f431dc03 * /
/ * 0038 * / @!P0 BRA 0x178; / * 0x40000004e00021e7 * /
/ * 0040 * / FSETP.LE.AND P0,PT,| R4 |,+ INF,PT; / * 0x218edfe00041dc80 * /
/ * 0048 * / @!P0 BRA 0x60; / * 0x40000000400021e7 * /
/ * 0050 * / FSETP.LE.AND P0,PT,| R5 |,+ INF,PT; / * 0x218edfe00051dc80 * /
/ * 0058 * / @ P0 BRA 0x70; / * 0x40000000400001e7 * /
/ * 0060 * / FADD R4,R4,R5; / * 0x5000000014411c00 * /
/ * 0068 * / BRA 0x370; / * 0x4000000c00001de7 * /
/ * 0070 * / SHL R7,R5,0x1; / * 0x6000c0000451dc03 * /
/ * 0078 * / SHL R6,R4,0x1; / * 0x6000c00004419c03 * /
/ * 0080 * / ISETP.EQ.U32.AND P2,PT,R7,RZ,PT; / * 0x190e0000fc75dc03 * /
/ * 0088 * / ISETP.EQ.U32.AND P1,PT,R6,RZ,PT; / * 0x190e0000fc63dc03 * /
/ * 0090 * / PSETP.AND.AND P0,PT,P1,P2,PT; / * 0x0c0e00000811dc04 * /
/ * 0098 * / @ P0 BRA 0xc0; / * 0x40000000800001e7 * /
/ * 00a0 * / FSETP.EQ.AND P3,PT,| R4 |,+ INF,PT; / * 0x210edfe00047dc80 * /
/ * 00a8 * / FSETP.EQ.AND P0,PT,| R5 |,+ INF,PT; / * 0x210edfe00051dc80 * /
/ * 00b0 * / @!P3 BRA 0xd8; / * 0x4000000080002de7 * /
/ * 00b8 * / @!P0 BRA 0xd8; / * 0x40000000600021e7 * /
/ * 00c0 * / MOV32I R0,0xffc00000; / * 0x1bff000000001de2 * /
/ * 00c8 * / MUFU.RSQ R4,R0; / * 0xc800000014011c00 * /
/ * 00d0 * / BRA 0x370; / * 0x4000000a60001de7 * /
/ * 00d8 * / PSETP.OR.AND P0,PT,P0,P1,PT; / * 0x0c0e00004401dc04 * /
/ * 00e0 * / @!P0 BRA 0x100; / * 0x40000000600021e7 * /
/ * 00e8 * / LOP.XOR R0,R5,R4; / * 0x6800000010501c83 * /
/ * 00f0 * / LOP32I.AND R4,R0,0x80000000; / * 0x3a00000000011c02 * /
/ * 00f8 * / BRA 0x370; / * 0x40000009c0001de7 * /
/ * 0100 * / PSETP.OR.AND P0,PT,P3,P2,PT; / * 0x0c0e00004831dc04 * /
/ * 0108 * / @!P0 BRA 0x130; / * 0x40000000800021e7 * /
/ * 0110 * / LOP.XOR R0,R5,R4; / * 0x6800000010501c83 * /
/ * 0118 * / LOP32I.AND R0,R0,0x80000000; / * 0x3a00000000001c02 * /
/ * 0120 * / LOP32I.OR R4,R0,0x7f800000; / * 0x39fe000000011c42 * /
/ * 0128 * / BRA 0x370; / * 0x4000000900001de7 * /
/ * 0130 * / ISETP.GE.AND P1,PT,R0,RZ,PT; / * 0x1b0e0000fc03dc23 * /
/ * 0138 * / ISETP.GE.AND P0,PT,R3,RZ,PT; / * 0x1b0e0000fc31dc23 * /
/ * 0140 * / @!P1 MOV32I R6,0xffffffc0; / * 0x1bffffff0001a5e2 * /
/ * 0148 * / @!P1 FFMA R4,R4,1.84467440737095520000e + 019,RZ; / * 0x307ed7e000412400 * /
/ * 0150 * / @ P1 MOV R6,RZ; / * 0x28000000fc0185e4 * /
/ * 0158 * / @ P0 BRA 0x180; / * 0x40000000800001e7 * /
/ * 0160 * / FFMA R5,R5,1.84467440737095520000e + 019,RZ; / * 0x307ed7e000515c00 * /
/ * 0168 * / IADD R6,R6,0x40; / * 0x4800c00100619c03 * /
/ * 0170 * / BRA 0x180; / * 0x4000000020001de7 * /
/ * 0178 * / MOV R6,RZ; / * 0x28000000fc019de4 * /
/ * 0180 * / IADD R7,R3,-0x7e; / * 0x4800fffe0831dc03 * /
/ * 0188 * / MOV32I R9,0x3f800000; / * 0x18fe000000025de2 * /
/ * 0190 * / ISCADD R7,-R7,R5,0x17; / * 0x410000001471dee3 * /
/ * 0198 * / ISUB R3,R0,R3; / * 0x480000000c00dd03 * /
/ * 01a0 * / MUFU.RCP R8,R7; / * 0xc800000010721c00 * /
/ * 01a8 * / IADD R5,R0,-0x7e; / * 0x4800fffe08015c03 * /
/ * 01b0 * / FFMA R9,-R7,R8,R9; / * 0x3012000020725e00 * /
/ * 01b8 * / ISCADD R4,-R5,R4,0x17; / * 0x4100000010511ee3 * /
/ * 01c0 * / FFMA R5,R8,R9,R8; / * 0x3010000024815c00 * /
/ * 01c8 * / FFMA R8,R4,R5,RZ; / * 0x307e000014421c00 * /
/ * 01d0 * / FFMA R9,-R7,R8,R4; / * 0x3008000020725e00 * /
/ * 01d8 * / FFMA R8,R9,R5,R8; / * 0x3010000014921c00 * /
/ * 01e0 * / FFMA R7,-R7,R8,R4; / * 0x300800002071de00 * /
/ * 01e8 * / FFMA R4,R7,R5,R8; / * 0x3010000014711c00 * /
/ * 01f0 * / SHL R9,R4,0x1; / * 0x6000c00004425c03 * /
/ * 01f8 * / SHR.U32 R9,R9,0x18; / * 0x5800c00060925c03 * /
/ * 0200 * / IADD R0,R3,R9; / * 0x4800000024301c03 * /
/ * 0208 * / IADD R6,R6,R0; / * 0x4800000000619c03 * /
/ * 0210 * / IADD R0,R6,-0x1; / * 0x4800fffffc601c03 * /
/ * 0218 * / ISETP.GT.U32.AND P0,PT,R0,0xfd,PT; / * 0x1a0ec003f401dc03 * /
/ * 0220 * / @ P0 BRA 0x240; / * 0x40000000600001e7 * /
/ * 0228 * / ISUB R0,R6,R9; / * 0x4800000024601d03 * /
/ * 0230 * / ISCADD R4,R0,R4,0x17; / * 0x4000000010011ee3 * /
/ * 0238 * / BRA 0x370; / * 0x40000004c0001de7 * /
/ * 0240 * / ISETP.LE.AND P0,PT,R6,0xfe,PT; / * 0x198ec003f861dc23 * /
/ * 0248 * / @ P0 BRA 0x268; / * 0x40000000600001e7 * /
/ * 0250 * / LOP32I.AND R0,R4,0x80000000; / * 0x3a00000000401c02 * /
/ * 0258 * / LOP32I.OR R4,R0,0x7f800000; / * 0x39fe000000011c42 * /
/ * 0260 * / BRA 0x370; / * 0x4000000420001de7 * /
/ * 0268 * / ISETP.GT.AND P0,PT,R6,RZ,PT; / * 0x1a0e0000fc61dc23 * /
/ * 0270 * / @ P0 BRA 0x370; / * 0x40000003e00001e7 * /
/ * 0278 * / ISETP.GE.AND P0,PT,R6,-0x18,PT; / * 0x1b0effffa061dc23 * /
/ * 0280 * / @ P0 BRA 0x298; / * 0x40000000400001e7 * /
/ * 0288 * / LOP32I.AND R4,R4,0x80000000; / * 0x3a00000000411c02 * /
/ * 0290 * / BRA 0x370; / * 0x4000000360001de7 * /
/ * 0298 * / FFMA.RP R3,R7,R5,R8; / * 0x311000001470dc00 * /
/ * 02a0 * / FFMA.RM R0,R7,R5,R8; / * 0x3090000014701c00 * /
/ * 02a8 * / FFMA.RZ R5,R7,R5,R8; / * 0x3190000014715c00 * /
/ * 02b0 * / FSET.NEU.AND R3,R0,R3,PT; / * 0x168e00000c00dc00 * /
/ * 02b8 * / I2I.S32.S32 R7,-R6; / * 0x1c0000001921df84 * /
/ * 02c0 * / LOP32I.AND R5,R5,0x7fffff; / * 0x3801fffffc515c02 * /
/ * 02c8 * / ISETP.EQ.AND P0,PT,R7,RZ,PT; / * 0x190e0000fc71dc23 * /
/ * 02d0 * / LOP32I.AND R0,R4,0x80000000; / * 0x3a00000000401c02 * /
/ * 02d8 * / I2I.S32.S32 R3,-R3; / * 0x1c0000000d20df84 * /
/ * 02e0 * / I2I.S32.S32 R4,-R6; / * 0x1c00000019211f84 * /
/ * 02e8 * / LOP32I.OR R7,R5,0x800000; / * 0x380200000051dc42 * /
/ * 02f0 * / @ P0 BRA.U 0x328; / * 0x40000000c00081e7 * /
/ * 02f8 * / @!P0 IADD R5,R6,0x20; / * 0x4800c00080616003 * /
/ * 0300 * / @!P0 SHL R5,R7,R5; / * 0x6000000014716003 * /
/ * 0308 * / @!P0 ICMP.EQ.U32 R5,RZ,0x1,R5; / * 0x310ac00007f16003 * /
/ * 0310 * / @!P0 SHR.U32 R7,R7,R4; / * 0x580000001071e003 * /
/ * 0318 * / @!P0 LOP.OR R3,R3,R5; / * 0x680000001430e043 * /
/ * 0320 * / NOP; / * 0x4000000000001de4 * /
/ * 0328 * / SHL R4,R7,0x1e; / * 0x6000c00078711c03 * /
/ * 0330 * / SHR.U32 R5,R4,0x1f; / * 0x5800c0007c415c03 * /
/ * 0338 * / LOP.AND R4,R7,0x1; / * 0x6800c00004711c03 * /
/ * 0340 * / LOP.OR R3,R3,R5; / * 0x680000001430dc43 * /
/ * 0348 * / LOP.AND R3,R4,R3; / * 0x680000000c40dc03 * /
/ * 0350 * / SHR.U32 R4,R7,0x1; / * 0x5800c00004711c03 * /
/ * 0358 * / ISETP.NE.U32.AND P0,PT,R3,RZ,PT; / * 0x1a8e0000fc31dc03 * /
/ * 0360 * / @ P0 IADD R4,R4,0x1; / * 0x4800c00004410003 * /
/ * 0368 * / LOP.OR R4,R0,R4; / * 0x6800000010011c43 * /
/ * 0370 * / RET; / * 0x9000000000001de7 * /
......................................... .............
功能:__cuda_sm20_div_rn_f32
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / MUFU.RCP R3,R5; / * 0xc80000001050dc00 * /
/ * 0008 * / MOV32I R6,0x3f800000; / * 0x18fe000000019de2 * /
/ * 0010 * / LOP32I.AND R0,R4,0x7fffff; / * 0x3801fffffc401c02 * /
/ * 0018 * / FFMA.FTZ R6,-R5,R3,R6; / * 0x300c00000c519e40 * /
/ * 0020 * / LOP32I.OR R0,R0,0x3f800000; / * 0x38fe000000001c42 * /
/ * 0028 * / FFMA.FTZ R3,R3,R6,R3; / * 0x300600001830dc40 * /
/ * 0030 * / FFMA.FTZ R6,R0,R3,RZ; / * 0x307e00000c019c40 * /
/ * 0038 * / FFMA.FTZ R7,-R5,R6,R0; / * 0x300000001851de40 * /
/ * 0040 * / FFMA.FTZ R6,R7,R3,R6; / * 0x300c00000c719c40 * /
/ * 0048 * / FFMA.FTZ R0,-R5,R6,R0; / * 0x3000000018501e40 * /
/ * 0050 * / LOP32I.AND R7,R4,0xff800000; / * 0x3bfe00000041dc02 * /
/ * 0058 * / FFMA.FTZ R6,R0,R3,R6; / * 0x300c00000c019c40 * /
/ * 0060 * / FFMA.FTZ R0,R6,R7,RZ; / * 0x307e00001c601c40 * /
/ * 0068 * / LOP32I.AND R3,R0,0x7fffffff; / * 0x39fffffffc00dc02 * /
/ * 0070 * / MOV32I R6,0x7effffef; / * 0x19fbffffbc019de2 * /
/ * 0078 * / IADD32I R3,R3,-0x800010; / * 0x0bfdffffc030dc02 * /
/ * 0080 * / ISETP.GT.U32.AND P0,PT,R3,R6,PT; / * 0x1a0e00001831dc03 * /
/ * 0088 * / @!P0 BRA 0xa8; / * 0x40000000600021e7 * /
/ * 0090 * / JCAL 0x0; / * 0x1000000000010007 * /
/ * 0098 * / MOV R0,R4; / * 0x2800000010001de4 * /
/ * 00a0 * / NOP; / * 0x4000000000001de4 * /
/ * 00a8 * / MOV R4,R0; / * 0x2800000000011de4 * /
/ * 00b0 * / RET; / * 0x9000000000001de7 * /
.......................................
第二个解决方案的已解析代码
代码sm_21
功能:_Z9myKernel2Pffi
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / MOV R1, c [0x1] [0x100]; / * 0x2800440400005de4 * /
/ * 0008 * / S2R R0,SR_CTAID.X; / * 0x2c00000094001c04 * /
/ * 0010 * / S2R R2,SR_TID.X; / * 0x2c00000084009c04 * /
/ * 0018 * / IMAD R0,R0,c [0x0] [0x8],R2; / * 0x2004400020001ca3 * /
/ * 0020 * / ISETP.GE.AND P0,PT,R0,c [0x0] [0x2c] / * 0x1b0e4000b001dc23 * /
/ * 0028 * / @ P0 BRA.U 0x98; / * 0x40000001a00081e7 * /
/ * 0030 * / @!P0 MOV32I R3,0x4; / * 0x180000001000e1e2 * /
/ * 0038 * / @!P0 IMAD R2.CC,R0,R3,c [0x0] [0x20] / * 0x200780008000a0a3 * /
/ * 0040 * / @!P0 IMAD.HI.X R3,R0,R3,c [0x0] [0x24] / * 0x208680009000e0e3 * /
/ * 0048 * / @!P0 LD.E R0,[R2]; / * 0x8400000000202085 * /
/ * 0050 * / @!P0 FADD R5,| R0 |,-c [0x0] [0x28] / * 0x50004000a0016180 * /
/ * 0058 * / @!P0 FADD R4, - | R0 |,c [0x0] [0x28] / * 0x50004000a0012280 * /
/ * 0060 * / @!P0 LOP32I.AND R0,R0,0x80000000; / * 0x3a00000000002002 * /
/ * 0068 * / @!P0 LOP32I.AND R5,R5,0x7fffffff; / * 0x39fffffffc516002 * /
/ * 0070 * / @!P0 SHR.U32 R4,R4,0x1f; / * 0x5800c0007c412003 * /
/ * 0078 * / @!P0 LOP.OR R5,R0,R5; / * 0x6800000014016043 * /
/ * 0080 * / @!P0 I2F.F32.S32 R0,R4; / * 0x1800000011202204 * /
/ * 0088 * / @!P0 FMUL R0,R0,R5; / * 0x5800000014002000 * /
/ * 0090 * / @!P0 ST.E [R2],R0; / * 0x9400000000202085 * /
/ * 0098 * / EXIT; / * 0x8000000000001de7 * /
.................................
EDIT
帖子已出现在中。
I am wondering how should I implement a nice soft thresholding function kernel in GPU CUDA?The soft thresholding function is like following:
where lambda is the threshold, and x is the input vector or matrix. Suppose they are both real.
I prefer aN existing code sample that runs reliably. I am reluctant to start from scratch. Anyone has done this before? Or know some where I can find a good sample code? Thanks.
The two solutions proposed in the comments above, set up for an elementwise processing, are the following:
__global__ void myKernel1(float* __restrict__ x, float lambda, const int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float xa = fabs(x[tid]);
x[tid] = (xa > lambda) ? x[tid] * ((xa - lambda) / xa) : 0;
}
}
and
__global__ void myKernel2(float* __restrict__ x, float lambda, const int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float xa = fabs(x[tid]);
x[tid] = signbit(lambda-xa)*copysign(xa-lambda,x[tid]);
}
}
The disassembled codes for the two solutions are reported below. As noticed also by @njuffa, the second one seems to be in principle less burdened than the first one due to the lacking x/|x|
division. However, as also noticed by @njuffa, this scenario will be likely memory bound rather than compute bound. However, perhaps this analysis gives indication that the second solution is preferrable when implemented as __device__
functions for non-elementwise computationS.
DISASSEMBLED CODE FOR THE FIRST SOLUTION
code for sm_21
Function : _Z9myKernel1Pffi
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R3, SR_TID.X; /* 0x2c0000008400dc04 */
/*0018*/ IMAD R0, R0, c[0x0][0x8], R3; /* 0x2006400020001ca3 */
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc23 */
/*0028*/ @P0 EXIT ; /* 0x80000000000001e7 */
/*0030*/ MOV32I R3, 0x4; /* 0x180000001000dde2 */
/*0038*/ SSY 0x90; /* 0x6000000140000007 */
/*0040*/ IMAD R16.CC, R0, R3, c[0x0][0x20]; /* 0x2007800080041ca3 */
/*0048*/ IMAD.HI.X R17, R0, R3, c[0x0][0x24]; /* 0x2086800090045ce3 */
/*0050*/ LD.E R2, [R16]; /* 0x8400000001009c85 */
/*0058*/ FSETP.GT.AND P0, PT, |R2|, c[0x0][0x28], PT; /* 0x220e4000a021dc80 */
/*0060*/ F2F.F32.F32 R5, |R2|; /* 0x1000000009215c44 */
/*0068*/ @P0 BRA 0x78; /* 0x40000000200001e7 */
/*0070*/ MOV.S R0, RZ; /* 0x28000000fc001df4 */
/*0078*/ FADD R4, |R2|, -c[0x0][0x28]; /* 0x50004000a0211d80 */
/*0080*/ JCAL 0x0; /* 0x1000000000010007 */
/*0088*/ FMUL.S R0, R2, R4; /* 0x5800000010201c10 */
/*0090*/ ST.E [R16], R0; /* 0x9400000001001c85 */
/*0098*/ EXIT ; /* 0x8000000000001de7 */
.................................
Function : __cuda_sm20_div_rn_noftz_f32_slowpath
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ SHL R0, R4, 0x1; /* 0x6000c00004401c03 */
/*0008*/ MOV32I R6, 0x1; /* 0x1800000004019de2 */
/*0010*/ SHL R3, R5, 0x1; /* 0x6000c0000450dc03 */
/*0018*/ IMAD.U32.U32.HI R0, R0, 0x100, -R6; /* 0x200cc00400001d43 */
/*0020*/ ISETP.GT.U32.AND P0, PT, R0, 0xfd, PT; /* 0x1a0ec003f401dc03 */
/*0028*/ IMAD.U32.U32.HI R3, R3, 0x100, -R6; /* 0x200cc0040030dd43 */
/*0030*/ ISETP.GT.U32.OR P0, PT, R3, 0xfd, P0; /* 0x1a20c003f431dc03 */
/*0038*/ @!P0 BRA 0x178; /* 0x40000004e00021e7 */
/*0040*/ FSETP.LE.AND P0, PT, |R4|, +INF , PT; /* 0x218edfe00041dc80 */
/*0048*/ @!P0 BRA 0x60; /* 0x40000000400021e7 */
/*0050*/ FSETP.LE.AND P0, PT, |R5|, +INF , PT; /* 0x218edfe00051dc80 */
/*0058*/ @P0 BRA 0x70; /* 0x40000000400001e7 */
/*0060*/ FADD R4, R4, R5; /* 0x5000000014411c00 */
/*0068*/ BRA 0x370; /* 0x4000000c00001de7 */
/*0070*/ SHL R7, R5, 0x1; /* 0x6000c0000451dc03 */
/*0078*/ SHL R6, R4, 0x1; /* 0x6000c00004419c03 */
/*0080*/ ISETP.EQ.U32.AND P2, PT, R7, RZ, PT; /* 0x190e0000fc75dc03 */
/*0088*/ ISETP.EQ.U32.AND P1, PT, R6, RZ, PT; /* 0x190e0000fc63dc03 */
/*0090*/ PSETP.AND.AND P0, PT, P1, P2, PT; /* 0x0c0e00000811dc04 */
/*0098*/ @P0 BRA 0xc0; /* 0x40000000800001e7 */
/*00a0*/ FSETP.EQ.AND P3, PT, |R4|, +INF , PT; /* 0x210edfe00047dc80 */
/*00a8*/ FSETP.EQ.AND P0, PT, |R5|, +INF , PT; /* 0x210edfe00051dc80 */
/*00b0*/ @!P3 BRA 0xd8; /* 0x4000000080002de7 */
/*00b8*/ @!P0 BRA 0xd8; /* 0x40000000600021e7 */
/*00c0*/ MOV32I R0, 0xffc00000; /* 0x1bff000000001de2 */
/*00c8*/ MUFU.RSQ R4, R0; /* 0xc800000014011c00 */
/*00d0*/ BRA 0x370; /* 0x4000000a60001de7 */
/*00d8*/ PSETP.OR.AND P0, PT, P0, P1, PT; /* 0x0c0e00004401dc04 */
/*00e0*/ @!P0 BRA 0x100; /* 0x40000000600021e7 */
/*00e8*/ LOP.XOR R0, R5, R4; /* 0x6800000010501c83 */
/*00f0*/ LOP32I.AND R4, R0, 0x80000000; /* 0x3a00000000011c02 */
/*00f8*/ BRA 0x370; /* 0x40000009c0001de7 */
/*0100*/ PSETP.OR.AND P0, PT, P3, P2, PT; /* 0x0c0e00004831dc04 */
/*0108*/ @!P0 BRA 0x130; /* 0x40000000800021e7 */
/*0110*/ LOP.XOR R0, R5, R4; /* 0x6800000010501c83 */
/*0118*/ LOP32I.AND R0, R0, 0x80000000; /* 0x3a00000000001c02 */
/*0120*/ LOP32I.OR R4, R0, 0x7f800000; /* 0x39fe000000011c42 */
/*0128*/ BRA 0x370; /* 0x4000000900001de7 */
/*0130*/ ISETP.GE.AND P1, PT, R0, RZ, PT; /* 0x1b0e0000fc03dc23 */
/*0138*/ ISETP.GE.AND P0, PT, R3, RZ, PT; /* 0x1b0e0000fc31dc23 */
/*0140*/ @!P1 MOV32I R6, 0xffffffc0; /* 0x1bffffff0001a5e2 */
/*0148*/ @!P1 FFMA R4, R4, 1.84467440737095520000e+019, RZ; /* 0x307ed7e000412400 */
/*0150*/ @P1 MOV R6, RZ; /* 0x28000000fc0185e4 */
/*0158*/ @P0 BRA 0x180; /* 0x40000000800001e7 */
/*0160*/ FFMA R5, R5, 1.84467440737095520000e+019, RZ; /* 0x307ed7e000515c00 */
/*0168*/ IADD R6, R6, 0x40; /* 0x4800c00100619c03 */
/*0170*/ BRA 0x180; /* 0x4000000020001de7 */
/*0178*/ MOV R6, RZ; /* 0x28000000fc019de4 */
/*0180*/ IADD R7, R3, -0x7e; /* 0x4800fffe0831dc03 */
/*0188*/ MOV32I R9, 0x3f800000; /* 0x18fe000000025de2 */
/*0190*/ ISCADD R7, -R7, R5, 0x17; /* 0x410000001471dee3 */
/*0198*/ ISUB R3, R0, R3; /* 0x480000000c00dd03 */
/*01a0*/ MUFU.RCP R8, R7; /* 0xc800000010721c00 */
/*01a8*/ IADD R5, R0, -0x7e; /* 0x4800fffe08015c03 */
/*01b0*/ FFMA R9, -R7, R8, R9; /* 0x3012000020725e00 */
/*01b8*/ ISCADD R4, -R5, R4, 0x17; /* 0x4100000010511ee3 */
/*01c0*/ FFMA R5, R8, R9, R8; /* 0x3010000024815c00 */
/*01c8*/ FFMA R8, R4, R5, RZ; /* 0x307e000014421c00 */
/*01d0*/ FFMA R9, -R7, R8, R4; /* 0x3008000020725e00 */
/*01d8*/ FFMA R8, R9, R5, R8; /* 0x3010000014921c00 */
/*01e0*/ FFMA R7, -R7, R8, R4; /* 0x300800002071de00 */
/*01e8*/ FFMA R4, R7, R5, R8; /* 0x3010000014711c00 */
/*01f0*/ SHL R9, R4, 0x1; /* 0x6000c00004425c03 */
/*01f8*/ SHR.U32 R9, R9, 0x18; /* 0x5800c00060925c03 */
/*0200*/ IADD R0, R3, R9; /* 0x4800000024301c03 */
/*0208*/ IADD R6, R6, R0; /* 0x4800000000619c03 */
/*0210*/ IADD R0, R6, -0x1; /* 0x4800fffffc601c03 */
/*0218*/ ISETP.GT.U32.AND P0, PT, R0, 0xfd, PT; /* 0x1a0ec003f401dc03 */
/*0220*/ @P0 BRA 0x240; /* 0x40000000600001e7 */
/*0228*/ ISUB R0, R6, R9; /* 0x4800000024601d03 */
/*0230*/ ISCADD R4, R0, R4, 0x17; /* 0x4000000010011ee3 */
/*0238*/ BRA 0x370; /* 0x40000004c0001de7 */
/*0240*/ ISETP.LE.AND P0, PT, R6, 0xfe, PT; /* 0x198ec003f861dc23 */
/*0248*/ @P0 BRA 0x268; /* 0x40000000600001e7 */
/*0250*/ LOP32I.AND R0, R4, 0x80000000; /* 0x3a00000000401c02 */
/*0258*/ LOP32I.OR R4, R0, 0x7f800000; /* 0x39fe000000011c42 */
/*0260*/ BRA 0x370; /* 0x4000000420001de7 */
/*0268*/ ISETP.GT.AND P0, PT, R6, RZ, PT; /* 0x1a0e0000fc61dc23 */
/*0270*/ @P0 BRA 0x370; /* 0x40000003e00001e7 */
/*0278*/ ISETP.GE.AND P0, PT, R6, -0x18, PT; /* 0x1b0effffa061dc23 */
/*0280*/ @P0 BRA 0x298; /* 0x40000000400001e7 */
/*0288*/ LOP32I.AND R4, R4, 0x80000000; /* 0x3a00000000411c02 */
/*0290*/ BRA 0x370; /* 0x4000000360001de7 */
/*0298*/ FFMA.RP R3, R7, R5, R8; /* 0x311000001470dc00 */
/*02a0*/ FFMA.RM R0, R7, R5, R8; /* 0x3090000014701c00 */
/*02a8*/ FFMA.RZ R5, R7, R5, R8; /* 0x3190000014715c00 */
/*02b0*/ FSET.NEU.AND R3, R0, R3, PT; /* 0x168e00000c00dc00 */
/*02b8*/ I2I.S32.S32 R7, -R6; /* 0x1c0000001921df84 */
/*02c0*/ LOP32I.AND R5, R5, 0x7fffff; /* 0x3801fffffc515c02 */
/*02c8*/ ISETP.EQ.AND P0, PT, R7, RZ, PT; /* 0x190e0000fc71dc23 */
/*02d0*/ LOP32I.AND R0, R4, 0x80000000; /* 0x3a00000000401c02 */
/*02d8*/ I2I.S32.S32 R3, -R3; /* 0x1c0000000d20df84 */
/*02e0*/ I2I.S32.S32 R4, -R6; /* 0x1c00000019211f84 */
/*02e8*/ LOP32I.OR R7, R5, 0x800000; /* 0x380200000051dc42 */
/*02f0*/ @P0 BRA.U 0x328; /* 0x40000000c00081e7 */
/*02f8*/ @!P0 IADD R5, R6, 0x20; /* 0x4800c00080616003 */
/*0300*/ @!P0 SHL R5, R7, R5; /* 0x6000000014716003 */
/*0308*/ @!P0 ICMP.EQ.U32 R5, RZ, 0x1, R5; /* 0x310ac00007f16003 */
/*0310*/ @!P0 SHR.U32 R7, R7, R4; /* 0x580000001071e003 */
/*0318*/ @!P0 LOP.OR R3, R3, R5; /* 0x680000001430e043 */
/*0320*/ NOP; /* 0x4000000000001de4 */
/*0328*/ SHL R4, R7, 0x1e; /* 0x6000c00078711c03 */
/*0330*/ SHR.U32 R5, R4, 0x1f; /* 0x5800c0007c415c03 */
/*0338*/ LOP.AND R4, R7, 0x1; /* 0x6800c00004711c03 */
/*0340*/ LOP.OR R3, R3, R5; /* 0x680000001430dc43 */
/*0348*/ LOP.AND R3, R4, R3; /* 0x680000000c40dc03 */
/*0350*/ SHR.U32 R4, R7, 0x1; /* 0x5800c00004711c03 */
/*0358*/ ISETP.NE.U32.AND P0, PT, R3, RZ, PT; /* 0x1a8e0000fc31dc03 */
/*0360*/ @P0 IADD R4, R4, 0x1; /* 0x4800c00004410003 */
/*0368*/ LOP.OR R4, R0, R4; /* 0x6800000010011c43 */
/*0370*/ RET ; /* 0x9000000000001de7 */
......................................................
Function : __cuda_sm20_div_rn_f32
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MUFU.RCP R3, R5; /* 0xc80000001050dc00 */
/*0008*/ MOV32I R6, 0x3f800000; /* 0x18fe000000019de2 */
/*0010*/ LOP32I.AND R0, R4, 0x7fffff; /* 0x3801fffffc401c02 */
/*0018*/ FFMA.FTZ R6, -R5, R3, R6; /* 0x300c00000c519e40 */
/*0020*/ LOP32I.OR R0, R0, 0x3f800000; /* 0x38fe000000001c42 */
/*0028*/ FFMA.FTZ R3, R3, R6, R3; /* 0x300600001830dc40 */
/*0030*/ FFMA.FTZ R6, R0, R3, RZ; /* 0x307e00000c019c40 */
/*0038*/ FFMA.FTZ R7, -R5, R6, R0; /* 0x300000001851de40 */
/*0040*/ FFMA.FTZ R6, R7, R3, R6; /* 0x300c00000c719c40 */
/*0048*/ FFMA.FTZ R0, -R5, R6, R0; /* 0x3000000018501e40 */
/*0050*/ LOP32I.AND R7, R4, 0xff800000; /* 0x3bfe00000041dc02 */
/*0058*/ FFMA.FTZ R6, R0, R3, R6; /* 0x300c00000c019c40 */
/*0060*/ FFMA.FTZ R0, R6, R7, RZ; /* 0x307e00001c601c40 */
/*0068*/ LOP32I.AND R3, R0, 0x7fffffff; /* 0x39fffffffc00dc02 */
/*0070*/ MOV32I R6, 0x7effffef; /* 0x19fbffffbc019de2 */
/*0078*/ IADD32I R3, R3, -0x800010; /* 0x0bfdffffc030dc02 */
/*0080*/ ISETP.GT.U32.AND P0, PT, R3, R6, PT; /* 0x1a0e00001831dc03 */
/*0088*/ @!P0 BRA 0xa8; /* 0x40000000600021e7 */
/*0090*/ JCAL 0x0; /* 0x1000000000010007 */
/*0098*/ MOV R0, R4; /* 0x2800000010001de4 */
/*00a0*/ NOP; /* 0x4000000000001de4 */
/*00a8*/ MOV R4, R0; /* 0x2800000000011de4 */
/*00b0*/ RET ; /* 0x9000000000001de7 */
.......................................
DISASSEMBLED CODE FOR THE SECOND SOLUTION
code for sm_21
Function : _Z9myKernel2Pffi
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc23 */
/*0028*/ @P0 BRA.U 0x98; /* 0x40000001a00081e7 */
/*0030*/ @!P0 MOV32I R3, 0x4; /* 0x180000001000e1e2 */
/*0038*/ @!P0 IMAD R2.CC, R0, R3, c[0x0][0x20]; /* 0x200780008000a0a3 */
/*0040*/ @!P0 IMAD.HI.X R3, R0, R3, c[0x0][0x24]; /* 0x208680009000e0e3 */
/*0048*/ @!P0 LD.E R0, [R2]; /* 0x8400000000202085 */
/*0050*/ @!P0 FADD R5, |R0|, -c[0x0][0x28]; /* 0x50004000a0016180 */
/*0058*/ @!P0 FADD R4, -|R0|, c[0x0][0x28]; /* 0x50004000a0012280 */
/*0060*/ @!P0 LOP32I.AND R0, R0, 0x80000000; /* 0x3a00000000002002 */
/*0068*/ @!P0 LOP32I.AND R5, R5, 0x7fffffff; /* 0x39fffffffc516002 */
/*0070*/ @!P0 SHR.U32 R4, R4, 0x1f; /* 0x5800c0007c412003 */
/*0078*/ @!P0 LOP.OR R5, R0, R5; /* 0x6800000014016043 */
/*0080*/ @!P0 I2F.F32.S32 R0, R4; /* 0x1800000011202204 */
/*0088*/ @!P0 FMUL R0, R0, R5; /* 0x5800000014002000 */
/*0090*/ @!P0 ST.E [R2], R0; /* 0x9400000000202085 */
/*0098*/ EXIT ; /* 0x8000000000001de7 */
.................................
EDIT
A follow-up of this post has appeared in Soft thresholding in CUDA.
这篇关于软阈值CUDA实现的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!