- .headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM30 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM30)"
- .elftype @"ET_EXEC"
- //--------------------- .text._Z7testPtxPiPfPj --------------------------
- .section .text._Z7testPtxPiPfPj,"ax",@progbits
- .sectioninfo @"SHI_REGISTERS=12"  // register count recorded for this section (R0..R11 appear in the listing)
- .align 64
- .global _Z7testPtxPiPfPj  // mangled name demangles to testPtx(int*, float*, unsigned int*)
- .type _Z7testPtxPiPfPj,@function
- .size _Z7testPtxPiPfPj,(.L_39 - _Z7testPtxPiPfPj)  // .L_39 is the last label of the listing, so this size also spans the slow-path subroutine
- .other _Z7testPtxPiPfPj,@"STO_CUDA_ENTRY STV_DEFAULT"
- _Z7testPtxPiPfPj:  // Kernel entry: loads one 32-bit word between two clock reads, converts it to float, computes its reciprocal, then stores the reciprocal and the clock delta
- .text._Z7testPtxPiPfPj:
- /*0008*/ MOV R1, c[0x0][0x44];  // R1 is never read again in this listing; presumably the ABI stack-pointer setup - TODO confirm
- /*0010*/ S2R R0, SR_CLOCKLO;  // R0 = first clock sample (low 32 bits)
- /*0018*/ MOV R2, c[0x0][0x140];  // R2:R3 = 64-bit address from constant bank 0 (presumably the first kernel argument - confirm)
- /*0020*/ MOV R3, c[0x0][0x144];
- /*0028*/ LD.E.CV R2, [R2];  // R2 = 32-bit word at [R2:R3]; .CV is presumably the cache-as-volatile operator - confirm
- /*0030*/ S2R R3, SR_CLOCKLO;  // R3 = second clock sample; overwrites the high address half, which is no longer needed
- /*0038*/ I2F.F32.U32 R2, R2;  // x = (float)(unsigned)loaded value
- /*0048*/ IADD32I R4, R2, 0x1800000;  // add 3 to the exponent field (0x1800000 == 3 << 23)
- /*0050*/ LOP32I.AND R4, R4, 0x7f800000;  // keep only the 8 exponent bits
- /*0058*/ ISETP.GT.U32.AND P0, PT, R4, c[0x2][0x0], PT;  // P0 = biased exponent check against a threshold held in constant bank 2 (value not visible here)
- /*0060*/ @P0 BRA `(.L_10);  // P0 set: take the inline fast path
- /*0068*/ MOV R5, R2;  // slow path: operand is passed in R5
- /*0070*/ CAL `($_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath);
- /*0078*/ MOV R7, R4;  // slow path returns its result in R4; keep it in R7 like the fast path does
- /*0088*/ BRA `(.L_11);
- .L_10:
- /*0090*/ MUFU.RCP R7, R2;  // r = hardware reciprocal approximation of x
- /*0098*/ FFMA R2, R2, R7, c[0x2][0x4];  // e = x*r + c; c is presumably -1.0f, making e the residual x*r - 1 - TODO confirm
- /*00a0*/ FADD.FTZ R2, -R2, -RZ;  // e = -e
- /*00a8*/ FFMA R7, R7, R2, R7;  // r = r + r*e (one refinement step on the approximation)
- .L_11:
- /*00b0*/ IADD R6, -R0, R3;  // R6 = second clock sample - first clock sample
- /*00b8*/ MOV R4, c[0x0][0x148];  // R4:R5 = 64-bit address from constant bank 0 (presumably the second kernel argument - confirm)
- /*00c8*/ MOV R5, c[0x0][0x14c];
- /*00d0*/ MOV R2, c[0x0][0x150];  // R2:R3 = 64-bit address from constant bank 0 (presumably the third kernel argument - confirm)
- /*00d8*/ MOV R3, c[0x0][0x154];
- /*00e0*/ ST.E [R4], R7;  // store the reciprocal
- /*00e8*/ ST.E [R2], R6;  // store the clock delta
- /*00f0*/ EXIT;
- .weak $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath  // reciprocal slow path, called by the kernel when its exponent range check fails
- .type $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath,@function
- .size $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath,(.L_39 - $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath)
- $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath:  // In: R5 = float operand x. Out: R4 = result. Writes R2, R4-R11, P0, P1
- /*00f8*/ SHL R2, R5, 0x1;  // shift the sign bit out
- /*0108*/ PRMT R4, RZ, 0x7, R2;  // presumably extracts the top byte of R2, i.e. the 8-bit exponent field - confirm PRMT selector semantics
- /*0110*/ MOV R2, R5;  // R2 = x (working copy)
- /*0118*/ ISETP.NE.U32.AND P0, PT, R4, RZ, PT;
- /*0120*/ @P0 BRA `(.L_12);  // R4 nonzero: handled at .L_12
- /*0128*/ SHL R4, R2, 0x1;  // R4 == 0 case: test every bit except the sign
- /*0130*/ ISETP.NE.AND P0, PT, R4, RZ, PT;
- /*0138*/ @!P0 MUFU.RCP R4, R2;  // x is +0 or -0: return the hardware reciprocal directly
- /*0148*/ @!P0 RET;
- /*0150*/ FFMA R2, R2, 1.84467440737095516160e+19, RZ;  // remaining case (zero exponent field, nonzero mantissa): scale x by 2^64
- /*0158*/ MUFU.RCP R4, R2;  // r = reciprocal approximation of the scaled operand
- /*0160*/ FFMA R2, R2, R4, c[0x2][0x4];  // e = x*r + c; c is presumably -1.0f - TODO confirm
- /*0168*/ FADD.FTZ R2, -R2, -RZ;  // e = -e
- /*0170*/ FFMA R4, R4, R2, R4;  // r = r + r*e
- /*0178*/ FFMA R4, R4, 1.84467440737095516160e+19, RZ;  // multiply by 2^64 again to undo the operand scaling
- /*0188*/ RET;
- .L_12:
- /*0190*/ IADD32I R5, R4, -0xfd;  // R5 = R4 - 0xfd
- /*0198*/ ISETP.GT.U32.AND P0, PT, R5, 0x1, PT;
- /*01a0*/ @P0 BRA `(.L_13);  // only R4 == 0xfd or 0xfe continues below; everything else uses the plain MUFU.RCP at .L_13
- /*01a8*/ LOP32I.AND R11, R2, 0x7fffff;  // R11 = mantissa bits of x
- /*01b0*/ IADD32I R4, R4, -0xfc;  // R4 = final right-shift amount (1 or 2 on this path)
- /*01b8*/ MOV32I R10, 0x3;
- /*01c8*/ LOP32I.AND R2, R2, 0x80000000;  // R2 = sign bit of x
- /*01d0*/ LOP32I.OR R6, R11, 0x3f800000;  // R6 = mantissa with the exponent field of 1.0f
- /*01d8*/ ISETP.NE.U32.AND P1, PT, R11, RZ, PT;  // P1 = mantissa is nonzero
- /*01e0*/ SHL R10, R10, R5;  // R10 = 3 << R5, a two-bit mask (R5 is 0 or 1 here)
- /*01e8*/ MUFU.RCP R7, R6;  // r = reciprocal approximation of R6
- /*01f0*/ FFMA R6, R6, R7, c[0x2][0x4];  // e = m*r + c; c is presumably -1.0f - TODO confirm
- /*01f8*/ FADD.FTZ R6, -R6, -RZ;  // e = -e
- /*0208*/ FFMA.RM R8, R7, R6, R7;  // R8 = r + r*e with the .RM rounding modifier (presumably round toward -inf)
- /*0210*/ FFMA.RP R6, R7, R6, R7;  // R6 = r + r*e with the .RP rounding modifier (presumably round toward +inf)
- /*0218*/ LOP32I.AND R9, R8, 0x7fffff;  // R9 = mantissa bits of the .RM result
- /*0220*/ FSET.NEU.FTZ.AND R6, R8, R6, PT;  // R6 = flag: the two roundings differ (exact encoding of "true" not visible here)
- /*0228*/ LOP32I.OR R9, R9, 0x800000;  // set bit 23 (the implicit leading one)
- /*0230*/ IADD R6, -R6, RZ;  // R6 = -R6; still zero exactly when the flag was zero
- /*0238*/ LOP.AND R10, R10, R9;  // isolate the two masked bits of R9
- /*0248*/ SHR.U32 R4, R9, R4;  // R4 = R9 shifted right by the shift amount (unrounded result mantissa)
- /*0250*/ SHR.U32 R10, R10, R5;  // move the two isolated bits down to bits 1:0
- /*0258*/ LOP.AND R5, R5, R9;  // R5 = R5 & R9 (R5 is 0 or 1, so at most bit 0 of R9 survives)
- /*0260*/ LOP32I.AND R7, R10, 0x2;  // R7 = bit 1 of R10
- /*0268*/ LOP.OR R5, R6, R5;  // R5 = inexact flag OR the bit kept above
- /*0270*/ LOP32I.AND R10, R10, 0x1;  // R10 = bit 0 of R10
- /*0278*/ ISETP.NE.U32.AND P0, PT, R7, RZ, PT;  // P0 = (R7 != 0)
- /*0288*/ ISETP.NE.U32.OR P0, PT, R5, RZ, P0;  // P0 = (R5 != 0) || P0
- /*0290*/ ISETP.NE.U32.AND P0, PT, R10, RZ, P0;  // P0 = (R10 != 0) && P0; looks like a round-to-nearest decision with tie handling - confirm
- /*0298*/ @P0 IADD32I R4, R4, 0x1;  // round up by one unit
- /*02a0*/ @!P1 SHL R4, R4, 0x1;  // operand mantissa was zero: double the shifted value
- /*02a8*/ LOP.OR R4, R4, R2;  // reapply the operand's sign bit
- /*02b0*/ RET;
- .L_13:
- /*02b8*/ MUFU.RCP R4, R2;  // all other exponent values: return the hardware reciprocal as is
- /*02c8*/ RET;
- .L_14:
- /*02d0*/ BRA `(.L_14);  // branch-to-self after the final RET; no visible instruction jumps here
- .L_39:  // end label referenced by both .size directives
[text] SASS code using nvdisasm
Viewer
Editor
You can edit this paste and save as new: