本文介绍了软阈值CUDA实现的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想知道如何在GPU CUDA中实现一个漂亮的软阈值函数内核?
软阈值函数如下:





其中lambda是阈值,x是输入向量或矩阵。假设它们都是真实的。



我更喜欢一个可靠运行的现有代码示例。我不愿意从头开始。任何人都这样做过吗?或者知道一些我在哪里可以找到一个很好的示例代码?感谢。

解决方案

上面注释中提出的两个解决方案是为元素处理设置的,如下:

  __ global__ void myKernel1(float * __restrict__ x,float lambda,const int N)
{
int tid = threadIdx .x + blockIdx.x * blockDim.x;

if(tid< N){
float xa = fabs(x [tid]);
x [tid] =(xa> lambda)? x [tid] *((xa-lambda)/ xa):0;
}

}

  __ global__ void myKernel2(float * __restrict__ x,float lambda,const int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;

if(tid< N){
float xa = fabs(x [tid]);
x [tid] = signbit(lambda-xa)* copysign(xa-lambda,x [tid]);
}

}

两个解决方案的反汇编代码报告如下。正如@njuffa所注意到的,由于缺少 x / | x | 部分,第二个似乎原则上比第一个更负担。然而,也由@njuffa注意到,这种情况将可能是内存绑定,而不是计算绑定。然而,也许这个分析表明,当实现为非元素计算的 __ device __ 函数时,第二个解决方案是优选的。



第一个解决方案的已解析代码

  sm_21 
的代码功能:_Z9myKernel1Pffi
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / MOV R1,c [0x1] [0x100] / * 0x2800440400005de4 * /
/ * 0008 * / S2R R0,SR_CTAID.X; / * 0x2c00000094001c04 * /
/ * 0010 * / S2R R3,SR_TID.X; / * 0x2c0000008400dc04 * /
/ * 0018 * / IMAD R0,R0,c [0x0] [0x8],R3; / * 0x2006400020001ca3 * /
/ * 0020 * / ISETP.GE.AND P0,PT,R0,c [0x0] [0x2c] / * 0x1b0e4000b001dc23 * /
/ * 0028 * / @ P0退出; / * 0x80000000000001e7 * /
/ * 0030 * / MOV32I R3,0x4; / * 0x180000001000dde2 * /
/ * 0038 * / SSY 0x90; / * 0x6000000140000007 * /
/ * 0040 * / IMAD R16.CC,R0,R3,c [0x0] [0x20] / * 0x2007800080041ca3 * /
/ * 0048 * / IMAD.HI.X R17,R0,R3,c [0x0] [0x24] / * 0x2086800090045ce3 * /
/ * 0050 * / LD.E R2,[R16]; / * 0x8400000001009c85 * /
/ * 0058 * / FSETP.GT.AND P0,PT,| R2 |,c [0x0] [0x28] / * 0x220e4000a021dc80 * /
/ * 0060 * / F2F.F32.F32 R5,| R2 |; / * 0x1000000009215c44 * /
/ * 0068 * / @ P0 BRA 0x78; / * 0x40000000200001e7 * /
/ * 0070 * / MOV.S R0,RZ; / * 0x28000000fc001df4 * /
/ * 0078 * / FADD R4,| R2 |,-c [0x0] [0x28] / * 0x50004000a0211d80 * /
/ * 0080 * / JCAL 0x0; / * 0x1000000000010007 * /
/ * 0088 * / FMUL.S R0,R2,R4; / * 0x5800000010201c10 * /
/ * 0090 * / ST.E [R16],R0; / * 0x9400000001001c85 * /
/ * 0098 * / EXIT; / * 0x8000000000001de7 * /
.................................


功能:__cuda_sm20_div_rn_noftz_f32_slowpath
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / SHL R0,R4,0x1; / * 0x6000c00004401c03 * /
/ * 0008 * / MOV32I R6,0x1; / * 0x1800000004019de2 * /
/ * 0010 * / SHL R3,R5,0x1; / * 0x6000c0000450dc03 * /
/ * 0018 * / IMAD.U32.U32.HI R0,R0,0x100,-R6; / * 0x200cc00400001d43 * /
/ * 0020 * / ISETP.GT.U32.AND P0,PT,R0,0xfd,PT; / * 0x1a0ec003f401dc03 * /
/ * 0028 * / IMAD.U32.U32.HI R3,R3,0x100,-R6; / * 0x200cc0040030dd43 * /
/ * 0030 * / ISETP.GT.U32.OR P0,PT,R3,0xfd,P0; / * 0x1a20c003f431dc03 * /
/ * 0038 * / @!P0 BRA 0x178; / * 0x40000004e00021e7 * /
/ * 0040 * / FSETP.LE.AND P0,PT,| R4 |,+ INF,PT; / * 0x218edfe00041dc80 * /
/ * 0048 * / @!P0 BRA 0x60; / * 0x40000000400021e7 * /
/ * 0050 * / FSETP.LE.AND P0,PT,| R5 |,+ INF,PT; / * 0x218edfe00051dc80 * /
/ * 0058 * / @ P0 BRA 0x70; / * 0x40000000400001e7 * /
/ * 0060 * / FADD R4,R4,R5; / * 0x5000000014411c00 * /
/ * 0068 * / BRA 0x370; / * 0x4000000c00001de7 * /
/ * 0070 * / SHL R7,R5,0x1; / * 0x6000c0000451dc03 * /
/ * 0078 * / SHL R6,R4,0x1; / * 0x6000c00004419c03 * /
/ * 0080 * / ISETP.EQ.U32.AND P2,PT,R7,RZ,PT; / * 0x190e0000fc75dc03 * /
/ * 0088 * / ISETP.EQ.U32.AND P1,PT,R6,RZ,PT; / * 0x190e0000fc63dc03 * /
/ * 0090 * / PSETP.AND.AND P0,PT,P1,P2,PT; / * 0x0c0e00000811dc04 * /
/ * 0098 * / @ P0 BRA 0xc0; / * 0x40000000800001e7 * /
/ * 00a0 * / FSETP.EQ.AND P3,PT,| R4 |,+ INF,PT; / * 0x210edfe00047dc80 * /
/ * 00a8 * / FSETP.EQ.AND P0,PT,| R5 |,+ INF,PT; / * 0x210edfe00051dc80 * /
/ * 00b0 * / @!P3 BRA 0xd8; / * 0x4000000080002de7 * /
/ * 00b8 * / @!P0 BRA 0xd8; / * 0x40000000600021e7 * /
/ * 00c0 * / MOV32I R0,0xffc00000; / * 0x1bff000000001de2 * /
/ * 00c8 * / MUFU.RSQ R4,R0; / * 0xc800000014011c00 * /
/ * 00d0 * / BRA 0x370; / * 0x4000000a60001de7 * /
/ * 00d8 * / PSETP.OR.AND P0,PT,P0,P1,PT; / * 0x0c0e00004401dc04 * /
/ * 00e0 * / @!P0 BRA 0x100; / * 0x40000000600021e7 * /
/ * 00e8 * / LOP.XOR R0,R5,R4; / * 0x6800000010501c83 * /
/ * 00f0 * / LOP32I.AND R4,R0,0x80000000; / * 0x3a00000000011c02 * /
/ * 00f8 * / BRA 0x370; / * 0x40000009c0001de7 * /
/ * 0100 * / PSETP.OR.AND P0,PT,P3,P2,PT; / * 0x0c0e00004831dc04 * /
/ * 0108 * / @!P0 BRA 0x130; / * 0x40000000800021e7 * /
/ * 0110 * / LOP.XOR R0,R5,R4; / * 0x6800000010501c83 * /
/ * 0118 * / LOP32I.AND R0,R0,0x80000000; / * 0x3a00000000001c02 * /
/ * 0120 * / LOP32I.OR R4,R0,0x7f800000; / * 0x39fe000000011c42 * /
/ * 0128 * / BRA 0x370; / * 0x4000000900001de7 * /
/ * 0130 * / ISETP.GE.AND P1,PT,R0,RZ,PT; / * 0x1b0e0000fc03dc23 * /
/ * 0138 * / ISETP.GE.AND P0,PT,R3,RZ,PT; / * 0x1b0e0000fc31dc23 * /
/ * 0140 * / @!P1 MOV32I R6,0xffffffc0; / * 0x1bffffff0001a5e2 * /
/ * 0148 * / @!P1 FFMA R4,R4,1.84467440737095520000e + 019,RZ; / * 0x307ed7e000412400 * /
/ * 0150 * / @ P1 MOV R6,RZ; / * 0x28000000fc0185e4 * /
/ * 0158 * / @ P0 BRA 0x180; / * 0x40000000800001e7 * /
/ * 0160 * / FFMA R5,R5,1.84467440737095520000e + 019,RZ; / * 0x307ed7e000515c00 * /
/ * 0168 * / IADD R6,R6,0x40; / * 0x4800c00100619c03 * /
/ * 0170 * / BRA 0x180; / * 0x4000000020001de7 * /
/ * 0178 * / MOV R6,RZ; / * 0x28000000fc019de4 * /
/ * 0180 * / IADD R7,R3,-0x7e; / * 0x4800fffe0831dc03 * /
/ * 0188 * / MOV32I R9,0x3f800000; / * 0x18fe000000025de2 * /
/ * 0190 * / ISCADD R7,-R7,R5,0x17; / * 0x410000001471dee3 * /
/ * 0198 * / ISUB R3,R0,R3; / * 0x480000000c00dd03 * /
/ * 01a0 * / MUFU.RCP R8,R7; / * 0xc800000010721c00 * /
/ * 01a8 * / IADD R5,R0,-0x7e; / * 0x4800fffe08015c03 * /
/ * 01b0 * / FFMA R9,-R7,R8,R9; / * 0x3012000020725e00 * /
/ * 01b8 * / ISCADD R4,-R5,R4,0x17; / * 0x4100000010511ee3 * /
/ * 01c0 * / FFMA R5,R8,R9,R8; / * 0x3010000024815c00 * /
/ * 01c8 * / FFMA R8,R4,R5,RZ; / * 0x307e000014421c00 * /
/ * 01d0 * / FFMA R9,-R7,R8,R4; / * 0x3008000020725e00 * /
/ * 01d8 * / FFMA R8,R9,R5,R8; / * 0x3010000014921c00 * /
/ * 01e0 * / FFMA R7,-R7,R8,R4; / * 0x300800002071de00 * /
/ * 01e8 * / FFMA R4,R7,R5,R8; / * 0x3010000014711c00 * /
/ * 01f0 * / SHL R9,R4,0x1; / * 0x6000c00004425c03 * /
/ * 01f8 * / SHR.U32 R9,R9,0x18; / * 0x5800c00060925c03 * /
/ * 0200 * / IADD R0,R3,R9; / * 0x4800000024301c03 * /
/ * 0208 * / IADD R6,R6,R0; / * 0x4800000000619c03 * /
/ * 0210 * / IADD R0,R6,-0x1; / * 0x4800fffffc601c03 * /
/ * 0218 * / ISETP.GT.U32.AND P0,PT,R0,0xfd,PT; / * 0x1a0ec003f401dc03 * /
/ * 0220 * / @ P0 BRA 0x240; / * 0x40000000600001e7 * /
/ * 0228 * / ISUB R0,R6,R9; / * 0x4800000024601d03 * /
/ * 0230 * / ISCADD R4,R0,R4,0x17; / * 0x4000000010011ee3 * /
/ * 0238 * / BRA 0x370; / * 0x40000004c0001de7 * /
/ * 0240 * / ISETP.LE.AND P0,PT,R6,0xfe,PT; / * 0x198ec003f861dc23 * /
/ * 0248 * / @ P0 BRA 0x268; / * 0x40000000600001e7 * /
/ * 0250 * / LOP32I.AND R0,R4,0x80000000; / * 0x3a00000000401c02 * /
/ * 0258 * / LOP32I.OR R4,R0,0x7f800000; / * 0x39fe000000011c42 * /
/ * 0260 * / BRA 0x370; / * 0x4000000420001de7 * /
/ * 0268 * / ISETP.GT.AND P0,PT,R6,RZ,PT; / * 0x1a0e0000fc61dc23 * /
/ * 0270 * / @ P0 BRA 0x370; / * 0x40000003e00001e7 * /
/ * 0278 * / ISETP.GE.AND P0,PT,R6,-0x18,PT; / * 0x1b0effffa061dc23 * /
/ * 0280 * / @ P0 BRA 0x298; / * 0x40000000400001e7 * /
/ * 0288 * / LOP32I.AND R4,R4,0x80000000; / * 0x3a00000000411c02 * /
/ * 0290 * / BRA 0x370; / * 0x4000000360001de7 * /
/ * 0298 * / FFMA.RP R3,R7,R5,R8; / * 0x311000001470dc00 * /
/ * 02a0 * / FFMA.RM R0,R7,R5,R8; / * 0x3090000014701c00 * /
/ * 02a8 * / FFMA.RZ R5,R7,R5,R8; / * 0x3190000014715c00 * /
/ * 02b0 * / FSET.NEU.AND R3,R0,R3,PT; / * 0x168e00000c00dc00 * /
/ * 02b8 * / I2I.S32.S32 R7,-R6; / * 0x1c0000001921df84 * /
/ * 02c0 * / LOP32I.AND R5,R5,0x7fffff; / * 0x3801fffffc515c02 * /
/ * 02c8 * / ISETP.EQ.AND P0,PT,R7,RZ,PT; / * 0x190e0000fc71dc23 * /
/ * 02d0 * / LOP32I.AND R0,R4,0x80000000; / * 0x3a00000000401c02 * /
/ * 02d8 * / I2I.S32.S32 R3,-R3; / * 0x1c0000000d20df84 * /
/ * 02e0 * / I2I.S32.S32 R4,-R6; / * 0x1c00000019211f84 * /
/ * 02e8 * / LOP32I.OR R7,R5,0x800000; / * 0x380200000051dc42 * /
/ * 02f0 * / @ P0 BRA.U 0x328; / * 0x40000000c00081e7 * /
/ * 02f8 * / @!P0 IADD R5,R6,0x20; / * 0x4800c00080616003 * /
/ * 0300 * / @!P0 SHL R5,R7,R5; / * 0x6000000014716003 * /
/ * 0308 * / @!P0 ICMP.EQ.U32 R5,RZ,0x1,R5; / * 0x310ac00007f16003 * /
/ * 0310 * / @!P0 SHR.U32 R7,R7,R4; / * 0x580000001071e003 * /
/ * 0318 * / @!P0 LOP.OR R3,R3,R5; / * 0x680000001430e043 * /
/ * 0320 * / NOP; / * 0x4000000000001de4 * /
/ * 0328 * / SHL R4,R7,0x1e; / * 0x6000c00078711c03 * /
/ * 0330 * / SHR.U32 R5,R4,0x1f; / * 0x5800c0007c415c03 * /
/ * 0338 * / LOP.AND R4,R7,0x1; / * 0x6800c00004711c03 * /
/ * 0340 * / LOP.OR R3,R3,R5; / * 0x680000001430dc43 * /
/ * 0348 * / LOP.AND R3,R4,R3; / * 0x680000000c40dc03 * /
/ * 0350 * / SHR.U32 R4,R7,0x1; / * 0x5800c00004711c03 * /
/ * 0358 * / ISETP.NE.U32.AND P0,PT,R3,RZ,PT; / * 0x1a8e0000fc31dc03 * /
/ * 0360 * / @ P0 IADD R4,R4,0x1; / * 0x4800c00004410003 * /
/ * 0368 * / LOP.OR R4,R0,R4; / * 0x6800000010011c43 * /
/ * 0370 * / RET; / * 0x9000000000001de7 * /
......................................... .............


功能:__cuda_sm20_div_rn_f32
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / MUFU.RCP R3,R5; / * 0xc80000001050dc00 * /
/ * 0008 * / MOV32I R6,0x3f800000; / * 0x18fe000000019de2 * /
/ * 0010 * / LOP32I.AND R0,R4,0x7fffff; / * 0x3801fffffc401c02 * /
/ * 0018 * / FFMA.FTZ R6,-R5,R3,R6; / * 0x300c00000c519e40 * /
/ * 0020 * / LOP32I.OR R0,R0,0x3f800000; / * 0x38fe000000001c42 * /
/ * 0028 * / FFMA.FTZ R3,R3,R6,R3; / * 0x300600001830dc40 * /
/ * 0030 * / FFMA.FTZ R6,R0,R3,RZ; / * 0x307e00000c019c40 * /
/ * 0038 * / FFMA.FTZ R7,-R5,R6,R0; / * 0x300000001851de40 * /
/ * 0040 * / FFMA.FTZ R6,R7,R3,R6; / * 0x300c00000c719c40 * /
/ * 0048 * / FFMA.FTZ R0,-R5,R6,R0; / * 0x3000000018501e40 * /
/ * 0050 * / LOP32I.AND R7,R4,0xff800000; / * 0x3bfe00000041dc02 * /
/ * 0058 * / FFMA.FTZ R6,R0,R3,R6; / * 0x300c00000c019c40 * /
/ * 0060 * / FFMA.FTZ R0,R6,R7,RZ; / * 0x307e00001c601c40 * /
/ * 0068 * / LOP32I.AND R3,R0,0x7fffffff; / * 0x39fffffffc00dc02 * /
/ * 0070 * / MOV32I R6,0x7effffef; / * 0x19fbffffbc019de2 * /
/ * 0078 * / IADD32I R3,R3,-0x800010; / * 0x0bfdffffc030dc02 * /
/ * 0080 * / ISETP.GT.U32.AND P0,PT,R3,R6,PT; / * 0x1a0e00001831dc03 * /
/ * 0088 * / @!P0 BRA 0xa8; / * 0x40000000600021e7 * /
/ * 0090 * / JCAL 0x0; / * 0x1000000000010007 * /
/ * 0098 * / MOV R0,R4; / * 0x2800000010001de4 * /
/ * 00a0 * / NOP; / * 0x4000000000001de4 * /
/ * 00a8 * / MOV R4,R0; / * 0x2800000000011de4 * /
/ * 00b0 * / RET; / * 0x9000000000001de7 * /
.......................................

第二个解决方案的已解析代码



代码sm_21
功能:_Z9myKernel2Pffi
.headerflags @EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)
/ * 0000 * / MOV R1, c [0x1] [0x100]; / * 0x2800440400005de4 * /
/ * 0008 * / S2R R0,SR_CTAID.X; / * 0x2c00000094001c04 * /
/ * 0010 * / S2R R2,SR_TID.X; / * 0x2c00000084009c04 * /
/ * 0018 * / IMAD R0,R0,c [0x0] [0x8],R2; / * 0x2004400020001ca3 * /
/ * 0020 * / ISETP.GE.AND P0,PT,R0,c [0x0] [0x2c] / * 0x1b0e4000b001dc23 * /
/ * 0028 * / @ P0 BRA.U 0x98; / * 0x40000001a00081e7 * /
/ * 0030 * / @!P0 MOV32I R3,0x4; / * 0x180000001000e1e2 * /
/ * 0038 * / @!P0 IMAD R2.CC,R0,R3,c [0x0] [0x20] / * 0x200780008000a0a3 * /
/ * 0040 * / @!P0 IMAD.HI.X R3,R0,R3,c [0x0] [0x24] / * 0x208680009000e0e3 * /
/ * 0048 * / @!P0 LD.E R0,[R2]; / * 0x8400000000202085 * /
/ * 0050 * / @!P0 FADD R5,| R0 |,-c [0x0] [0x28] / * 0x50004000a0016180 * /
/ * 0058 * / @!P0 FADD R4, - | R0 |,c [0x0] [0x28] / * 0x50004000a0012280 * /
/ * 0060 * / @!P0 LOP32I.AND R0,R0,0x80000000; / * 0x3a00000000002002 * /
/ * 0068 * / @!P0 LOP32I.AND R5,R5,0x7fffffff; / * 0x39fffffffc516002 * /
/ * 0070 * / @!P0 SHR.U32 R4,R4,0x1f; / * 0x5800c0007c412003 * /
/ * 0078 * / @!P0 LOP.OR R5,R0,R5; / * 0x6800000014016043 * /
/ * 0080 * / @!P0 I2F.F32.S32 R0,R4; / * 0x1800000011202204 * /
/ * 0088 * / @!P0 FMUL R0,R0,R5; / * 0x5800000014002000 * /
/ * 0090 * / @!P0 ST.E [R2],R0; / * 0x9400000000202085 * /
/ * 0098 * / EXIT; / * 0x8000000000001de7 * /
.................................

EDIT



帖子已出现在中。


I am wondering how should I implement a nice soft thresholding function kernel in GPU CUDA?The soft thresholding function is like following:

where lambda is the threshold, and x is the input vector or matrix. Suppose they are both real.

I prefer aN existing code sample that runs reliably. I am reluctant to start from scratch. Anyone has done this before? Or know some where I can find a good sample code? Thanks.

解决方案

The two solutions proposed in the comments above, set up for an elementwise processing, are the following:

__global__ void myKernel1(float* __restrict__ x, float lambda, const int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {
        float xa = fabs(x[tid]); 
        x[tid] = (xa > lambda) ? x[tid] * ((xa - lambda) / xa) : 0;
    }

}

and

__global__ void myKernel2(float* __restrict__ x, float lambda, const int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {
        float xa = fabs(x[tid]); 
        x[tid] = signbit(lambda-xa)*copysign(xa-lambda,x[tid]);
    }

}

The disassembled codes for the two solutions are reported below. As noticed also by @njuffa, the second one seems to be in principle less burdened than the first one due to the lacking x/|x| division. However, as also noticed by @njuffa, this scenario will be likely memory bound rather than compute bound. However, perhaps this analysis gives indication that the second solution is preferrable when implemented as __device__ functions for non-elementwise computationS.

DISASSEMBLED CODE FOR THE FIRST SOLUTION

code for sm_21
    Function : _Z9myKernel1Pffi
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/        MOV R1, c[0x1][0x100];                       /* 0x2800440400005de4 */
    /*0008*/        S2R R0, SR_CTAID.X;                          /* 0x2c00000094001c04 */
    /*0010*/        S2R R3, SR_TID.X;                            /* 0x2c0000008400dc04 */
    /*0018*/        IMAD R0, R0, c[0x0][0x8], R3;                /* 0x2006400020001ca3 */
    /*0020*/        ISETP.GE.AND P0, PT, R0, c[0x0][0x2c], PT;   /* 0x1b0e4000b001dc23 */
    /*0028*/    @P0 EXIT ;                                       /* 0x80000000000001e7 */
    /*0030*/        MOV32I R3, 0x4;                              /* 0x180000001000dde2 */
    /*0038*/        SSY 0x90;                                    /* 0x6000000140000007 */
    /*0040*/        IMAD R16.CC, R0, R3, c[0x0][0x20];           /* 0x2007800080041ca3 */
    /*0048*/        IMAD.HI.X R17, R0, R3, c[0x0][0x24];         /* 0x2086800090045ce3 */
    /*0050*/        LD.E R2, [R16];                              /* 0x8400000001009c85 */
    /*0058*/        FSETP.GT.AND P0, PT, |R2|, c[0x0][0x28], PT; /* 0x220e4000a021dc80 */
    /*0060*/        F2F.F32.F32 R5, |R2|;                        /* 0x1000000009215c44 */
    /*0068*/    @P0 BRA 0x78;                                    /* 0x40000000200001e7 */
    /*0070*/        MOV.S R0, RZ;                                /* 0x28000000fc001df4 */
    /*0078*/        FADD R4, |R2|, -c[0x0][0x28];                /* 0x50004000a0211d80 */
    /*0080*/        JCAL 0x0;                                    /* 0x1000000000010007 */
    /*0088*/        FMUL.S R0, R2, R4;                           /* 0x5800000010201c10 */
    /*0090*/        ST.E [R16], R0;                              /* 0x9400000001001c85 */
    /*0098*/        EXIT ;                                       /* 0x8000000000001de7 */
    .................................


    Function : __cuda_sm20_div_rn_noftz_f32_slowpath
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/        SHL R0, R4, 0x1;                                   /* 0x6000c00004401c03 */
    /*0008*/        MOV32I R6, 0x1;                                    /* 0x1800000004019de2 */
    /*0010*/        SHL R3, R5, 0x1;                                   /* 0x6000c0000450dc03 */
    /*0018*/        IMAD.U32.U32.HI R0, R0, 0x100, -R6;                /* 0x200cc00400001d43 */
    /*0020*/        ISETP.GT.U32.AND P0, PT, R0, 0xfd, PT;             /* 0x1a0ec003f401dc03 */
    /*0028*/        IMAD.U32.U32.HI R3, R3, 0x100, -R6;                /* 0x200cc0040030dd43 */
    /*0030*/        ISETP.GT.U32.OR P0, PT, R3, 0xfd, P0;              /* 0x1a20c003f431dc03 */
    /*0038*/   @!P0 BRA 0x178;                                         /* 0x40000004e00021e7 */
    /*0040*/        FSETP.LE.AND P0, PT, |R4|, +INF , PT;              /* 0x218edfe00041dc80 */
    /*0048*/   @!P0 BRA 0x60;                                          /* 0x40000000400021e7 */
    /*0050*/        FSETP.LE.AND P0, PT, |R5|, +INF , PT;              /* 0x218edfe00051dc80 */
    /*0058*/    @P0 BRA 0x70;                                          /* 0x40000000400001e7 */
    /*0060*/        FADD R4, R4, R5;                                   /* 0x5000000014411c00 */
    /*0068*/        BRA 0x370;                                         /* 0x4000000c00001de7 */
    /*0070*/        SHL R7, R5, 0x1;                                   /* 0x6000c0000451dc03 */
    /*0078*/        SHL R6, R4, 0x1;                                   /* 0x6000c00004419c03 */
    /*0080*/        ISETP.EQ.U32.AND P2, PT, R7, RZ, PT;               /* 0x190e0000fc75dc03 */
    /*0088*/        ISETP.EQ.U32.AND P1, PT, R6, RZ, PT;               /* 0x190e0000fc63dc03 */
    /*0090*/        PSETP.AND.AND P0, PT, P1, P2, PT;                  /* 0x0c0e00000811dc04 */
    /*0098*/    @P0 BRA 0xc0;                                          /* 0x40000000800001e7 */
    /*00a0*/        FSETP.EQ.AND P3, PT, |R4|, +INF , PT;              /* 0x210edfe00047dc80 */
    /*00a8*/        FSETP.EQ.AND P0, PT, |R5|, +INF , PT;              /* 0x210edfe00051dc80 */
    /*00b0*/   @!P3 BRA 0xd8;                                          /* 0x4000000080002de7 */
    /*00b8*/   @!P0 BRA 0xd8;                                          /* 0x40000000600021e7 */
    /*00c0*/        MOV32I R0, 0xffc00000;                             /* 0x1bff000000001de2 */
    /*00c8*/        MUFU.RSQ R4, R0;                                   /* 0xc800000014011c00 */
    /*00d0*/        BRA 0x370;                                         /* 0x4000000a60001de7 */
    /*00d8*/        PSETP.OR.AND P0, PT, P0, P1, PT;                   /* 0x0c0e00004401dc04 */
    /*00e0*/   @!P0 BRA 0x100;                                         /* 0x40000000600021e7 */
    /*00e8*/        LOP.XOR R0, R5, R4;                                /* 0x6800000010501c83 */
    /*00f0*/        LOP32I.AND R4, R0, 0x80000000;                     /* 0x3a00000000011c02 */
    /*00f8*/        BRA 0x370;                                         /* 0x40000009c0001de7 */
    /*0100*/        PSETP.OR.AND P0, PT, P3, P2, PT;                   /* 0x0c0e00004831dc04 */
    /*0108*/   @!P0 BRA 0x130;                                         /* 0x40000000800021e7 */
    /*0110*/        LOP.XOR R0, R5, R4;                                /* 0x6800000010501c83 */
    /*0118*/        LOP32I.AND R0, R0, 0x80000000;                     /* 0x3a00000000001c02 */
    /*0120*/        LOP32I.OR R4, R0, 0x7f800000;                      /* 0x39fe000000011c42 */
    /*0128*/        BRA 0x370;                                         /* 0x4000000900001de7 */
    /*0130*/        ISETP.GE.AND P1, PT, R0, RZ, PT;                   /* 0x1b0e0000fc03dc23 */
    /*0138*/        ISETP.GE.AND P0, PT, R3, RZ, PT;                   /* 0x1b0e0000fc31dc23 */
    /*0140*/   @!P1 MOV32I R6, 0xffffffc0;                             /* 0x1bffffff0001a5e2 */
    /*0148*/   @!P1 FFMA R4, R4, 1.84467440737095520000e+019, RZ;      /* 0x307ed7e000412400 */
    /*0150*/    @P1 MOV R6, RZ;                                        /* 0x28000000fc0185e4 */
    /*0158*/    @P0 BRA 0x180;                                         /* 0x40000000800001e7 */
    /*0160*/        FFMA R5, R5, 1.84467440737095520000e+019, RZ;      /* 0x307ed7e000515c00 */
    /*0168*/        IADD R6, R6, 0x40;                                 /* 0x4800c00100619c03 */
    /*0170*/        BRA 0x180;                                         /* 0x4000000020001de7 */
    /*0178*/        MOV R6, RZ;                                        /* 0x28000000fc019de4 */
    /*0180*/        IADD R7, R3, -0x7e;                                /* 0x4800fffe0831dc03 */
    /*0188*/        MOV32I R9, 0x3f800000;                             /* 0x18fe000000025de2 */
    /*0190*/        ISCADD R7, -R7, R5, 0x17;                          /* 0x410000001471dee3 */
    /*0198*/        ISUB R3, R0, R3;                                   /* 0x480000000c00dd03 */
    /*01a0*/        MUFU.RCP R8, R7;                                   /* 0xc800000010721c00 */
    /*01a8*/        IADD R5, R0, -0x7e;                                /* 0x4800fffe08015c03 */
    /*01b0*/        FFMA R9, -R7, R8, R9;                              /* 0x3012000020725e00 */
    /*01b8*/        ISCADD R4, -R5, R4, 0x17;                          /* 0x4100000010511ee3 */
    /*01c0*/        FFMA R5, R8, R9, R8;                               /* 0x3010000024815c00 */
    /*01c8*/        FFMA R8, R4, R5, RZ;                               /* 0x307e000014421c00 */
    /*01d0*/        FFMA R9, -R7, R8, R4;                              /* 0x3008000020725e00 */
    /*01d8*/        FFMA R8, R9, R5, R8;                               /* 0x3010000014921c00 */
    /*01e0*/        FFMA R7, -R7, R8, R4;                              /* 0x300800002071de00 */
    /*01e8*/        FFMA R4, R7, R5, R8;                               /* 0x3010000014711c00 */
    /*01f0*/        SHL R9, R4, 0x1;                                   /* 0x6000c00004425c03 */
    /*01f8*/        SHR.U32 R9, R9, 0x18;                              /* 0x5800c00060925c03 */
    /*0200*/        IADD R0, R3, R9;                                   /* 0x4800000024301c03 */
    /*0208*/        IADD R6, R6, R0;                                   /* 0x4800000000619c03 */
    /*0210*/        IADD R0, R6, -0x1;                                 /* 0x4800fffffc601c03 */
    /*0218*/        ISETP.GT.U32.AND P0, PT, R0, 0xfd, PT;             /* 0x1a0ec003f401dc03 */
    /*0220*/    @P0 BRA 0x240;                                         /* 0x40000000600001e7 */
    /*0228*/        ISUB R0, R6, R9;                                   /* 0x4800000024601d03 */
    /*0230*/        ISCADD R4, R0, R4, 0x17;                           /* 0x4000000010011ee3 */
    /*0238*/        BRA 0x370;                                         /* 0x40000004c0001de7 */
    /*0240*/        ISETP.LE.AND P0, PT, R6, 0xfe, PT;                 /* 0x198ec003f861dc23 */
    /*0248*/    @P0 BRA 0x268;                                         /* 0x40000000600001e7 */
    /*0250*/        LOP32I.AND R0, R4, 0x80000000;                     /* 0x3a00000000401c02 */
    /*0258*/        LOP32I.OR R4, R0, 0x7f800000;                      /* 0x39fe000000011c42 */
    /*0260*/        BRA 0x370;                                         /* 0x4000000420001de7 */
    /*0268*/        ISETP.GT.AND P0, PT, R6, RZ, PT;                   /* 0x1a0e0000fc61dc23 */
    /*0270*/    @P0 BRA 0x370;                                         /* 0x40000003e00001e7 */
    /*0278*/        ISETP.GE.AND P0, PT, R6, -0x18, PT;                /* 0x1b0effffa061dc23 */
    /*0280*/    @P0 BRA 0x298;                                         /* 0x40000000400001e7 */
    /*0288*/        LOP32I.AND R4, R4, 0x80000000;                     /* 0x3a00000000411c02 */
    /*0290*/        BRA 0x370;                                         /* 0x4000000360001de7 */
    /*0298*/        FFMA.RP R3, R7, R5, R8;                            /* 0x311000001470dc00 */
    /*02a0*/        FFMA.RM R0, R7, R5, R8;                            /* 0x3090000014701c00 */
    /*02a8*/        FFMA.RZ R5, R7, R5, R8;                            /* 0x3190000014715c00 */
    /*02b0*/        FSET.NEU.AND R3, R0, R3, PT;                       /* 0x168e00000c00dc00 */
    /*02b8*/        I2I.S32.S32 R7, -R6;                               /* 0x1c0000001921df84 */
    /*02c0*/        LOP32I.AND R5, R5, 0x7fffff;                       /* 0x3801fffffc515c02 */
    /*02c8*/        ISETP.EQ.AND P0, PT, R7, RZ, PT;                   /* 0x190e0000fc71dc23 */
    /*02d0*/        LOP32I.AND R0, R4, 0x80000000;                     /* 0x3a00000000401c02 */
    /*02d8*/        I2I.S32.S32 R3, -R3;                               /* 0x1c0000000d20df84 */
    /*02e0*/        I2I.S32.S32 R4, -R6;                               /* 0x1c00000019211f84 */
    /*02e8*/        LOP32I.OR R7, R5, 0x800000;                        /* 0x380200000051dc42 */
    /*02f0*/    @P0 BRA.U 0x328;                                       /* 0x40000000c00081e7 */
    /*02f8*/   @!P0 IADD R5, R6, 0x20;                                 /* 0x4800c00080616003 */
    /*0300*/   @!P0 SHL R5, R7, R5;                                    /* 0x6000000014716003 */
    /*0308*/   @!P0 ICMP.EQ.U32 R5, RZ, 0x1, R5;                       /* 0x310ac00007f16003 */
    /*0310*/   @!P0 SHR.U32 R7, R7, R4;                                /* 0x580000001071e003 */
    /*0318*/   @!P0 LOP.OR R3, R3, R5;                                 /* 0x680000001430e043 */
    /*0320*/        NOP;                                               /* 0x4000000000001de4 */
    /*0328*/        SHL R4, R7, 0x1e;                                  /* 0x6000c00078711c03 */
    /*0330*/        SHR.U32 R5, R4, 0x1f;                              /* 0x5800c0007c415c03 */
    /*0338*/        LOP.AND R4, R7, 0x1;                               /* 0x6800c00004711c03 */
    /*0340*/        LOP.OR R3, R3, R5;                                 /* 0x680000001430dc43 */
    /*0348*/        LOP.AND R3, R4, R3;                                /* 0x680000000c40dc03 */
    /*0350*/        SHR.U32 R4, R7, 0x1;                               /* 0x5800c00004711c03 */
    /*0358*/        ISETP.NE.U32.AND P0, PT, R3, RZ, PT;               /* 0x1a8e0000fc31dc03 */
    /*0360*/    @P0 IADD R4, R4, 0x1;                                  /* 0x4800c00004410003 */
    /*0368*/        LOP.OR R4, R0, R4;                                 /* 0x6800000010011c43 */
    /*0370*/        RET ;                                              /* 0x9000000000001de7 */
    ......................................................


    Function : __cuda_sm20_div_rn_f32
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/        MUFU.RCP R3, R5;                     /* 0xc80000001050dc00 */
    /*0008*/        MOV32I R6, 0x3f800000;               /* 0x18fe000000019de2 */
    /*0010*/        LOP32I.AND R0, R4, 0x7fffff;         /* 0x3801fffffc401c02 */
    /*0018*/        FFMA.FTZ R6, -R5, R3, R6;            /* 0x300c00000c519e40 */
    /*0020*/        LOP32I.OR R0, R0, 0x3f800000;        /* 0x38fe000000001c42 */
    /*0028*/        FFMA.FTZ R3, R3, R6, R3;             /* 0x300600001830dc40 */
    /*0030*/        FFMA.FTZ R6, R0, R3, RZ;             /* 0x307e00000c019c40 */
    /*0038*/        FFMA.FTZ R7, -R5, R6, R0;            /* 0x300000001851de40 */
    /*0040*/        FFMA.FTZ R6, R7, R3, R6;             /* 0x300c00000c719c40 */
    /*0048*/        FFMA.FTZ R0, -R5, R6, R0;            /* 0x3000000018501e40 */
    /*0050*/        LOP32I.AND R7, R4, 0xff800000;       /* 0x3bfe00000041dc02 */
    /*0058*/        FFMA.FTZ R6, R0, R3, R6;             /* 0x300c00000c019c40 */
    /*0060*/        FFMA.FTZ R0, R6, R7, RZ;             /* 0x307e00001c601c40 */
    /*0068*/        LOP32I.AND R3, R0, 0x7fffffff;       /* 0x39fffffffc00dc02 */
    /*0070*/        MOV32I R6, 0x7effffef;               /* 0x19fbffffbc019de2 */
    /*0078*/        IADD32I R3, R3, -0x800010;           /* 0x0bfdffffc030dc02 */
    /*0080*/        ISETP.GT.U32.AND P0, PT, R3, R6, PT; /* 0x1a0e00001831dc03 */
    /*0088*/   @!P0 BRA 0xa8;                            /* 0x40000000600021e7 */
    /*0090*/        JCAL 0x0;                            /* 0x1000000000010007 */
    /*0098*/        MOV R0, R4;                          /* 0x2800000010001de4 */
    /*00a0*/        NOP;                                 /* 0x4000000000001de4 */
    /*00a8*/        MOV R4, R0;                          /* 0x2800000000011de4 */
    /*00b0*/        RET ;                                /* 0x9000000000001de7 */
    .......................................

DISASSEMBLED CODE FOR THE SECOND SOLUTION

code for sm_21
    Function : _Z9myKernel2Pffi
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/        MOV R1, c[0x1][0x100];                     /* 0x2800440400005de4 */
    /*0008*/        S2R R0, SR_CTAID.X;                        /* 0x2c00000094001c04 */
    /*0010*/        S2R R2, SR_TID.X;                          /* 0x2c00000084009c04 */
    /*0018*/        IMAD R0, R0, c[0x0][0x8], R2;              /* 0x2004400020001ca3 */
    /*0020*/        ISETP.GE.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc23 */
    /*0028*/    @P0 BRA.U 0x98;                                /* 0x40000001a00081e7 */
    /*0030*/   @!P0 MOV32I R3, 0x4;                            /* 0x180000001000e1e2 */
    /*0038*/   @!P0 IMAD R2.CC, R0, R3, c[0x0][0x20];          /* 0x200780008000a0a3 */
    /*0040*/   @!P0 IMAD.HI.X R3, R0, R3, c[0x0][0x24];        /* 0x208680009000e0e3 */
    /*0048*/   @!P0 LD.E R0, [R2];                             /* 0x8400000000202085 */
    /*0050*/   @!P0 FADD R5, |R0|, -c[0x0][0x28];              /* 0x50004000a0016180 */
    /*0058*/   @!P0 FADD R4, -|R0|, c[0x0][0x28];              /* 0x50004000a0012280 */
    /*0060*/   @!P0 LOP32I.AND R0, R0, 0x80000000;             /* 0x3a00000000002002 */
    /*0068*/   @!P0 LOP32I.AND R5, R5, 0x7fffffff;             /* 0x39fffffffc516002 */
    /*0070*/   @!P0 SHR.U32 R4, R4, 0x1f;                      /* 0x5800c0007c412003 */
    /*0078*/   @!P0 LOP.OR R5, R0, R5;                         /* 0x6800000014016043 */
    /*0080*/   @!P0 I2F.F32.S32 R0, R4;                        /* 0x1800000011202204 */
    /*0088*/   @!P0 FMUL R0, R0, R5;                           /* 0x5800000014002000 */
    /*0090*/   @!P0 ST.E [R2], R0;                             /* 0x9400000000202085 */
    /*0098*/        EXIT ;                                     /* 0x8000000000001de7 */
    .................................

EDIT

A follow-up of this post has appeared in Soft thresholding in CUDA.

这篇关于软阈值CUDA实现的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

10-15 07:20