Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F96429920
cg_cmm.ptx
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Dec 26, 16:47
Size
28 KB
Mime Type
text/x-asm
Expires
Sat, Dec 28, 16:47 (2 d)
Engine
blob
Format
Raw Data
Handle
23169864
Attached To
rLAMMPS lammps
cg_cmm.ptx
View Options
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009eb0_00000000-9_lal_cg_cmm.cpp3.i (/home/sjplimp/ccBI#.oK8Qzh)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009eb0_00000000-8_lal_cg_cmm.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_cg_cmm.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Module-scope texture reference. Both kernels in this file fetch four-component
// f32 records from it with tex.1d.v4.f32.s32, using an s32 element index.
.global .texref pos_tex;
//-----------------------------------------------------------
// kernel_pair
// Compiler-generated entry point for lal_cg_cmm.cu (see the ".loc 16" lines).
// For one atom index ii it walks a list of neighbor indices, accumulates a
// 3-component pair force (plus optional energy and 6 virial terms), combines
// the partial sums of the t_per_atom cooperating threads through shared
// memory, and lets the thread with lane offset 0 write the results.
//
// Parameters (as used by the instructions below):
//   x_              u64, declared but never loaded in this body; the
//                   four-component records are fetched through pos_tex.
//   lj1             u64 global pointer; 16-byte records indexed by
//                   (itype * lj_types + jtype): +0 cutoff compared against
//                   the squared distance, +4 selector (2.0 / 1.0 / other),
//                   +8 and +12 force coefficients.
//   lj3             u64 global pointer; same indexing, first three f32 of each
//                   record are read for the energy term.
//   lj_types        s32 row stride of the lj1/lj3 index.
//   sp_lj_in        u64 global pointer to four f32 scale factors.
//   dev_nbor        u64 global pointer to s32 data: row 0 holds the atom index,
//                   row 1 (nbor_pitch elements later) holds the neighbor count.
//   dev_packed      u64 global pointer; when it differs from dev_nbor the
//                   neighbor indices are read from it instead.
//   ans             u64 global pointer; one 16-byte record stored per ii.
//   __val_paramengv u64 global pointer to f32 outputs, stride inum elements.
//   eflag, vflag    s32; energy / virial work is performed only when > 0.
//   inum            s32; threads whose ii >= inum exit immediately.
//   nbor_pitch      s32 element stride between rows of dev_nbor.
//   t_per_atom      s32 number of threads cooperating on one ii.
//
// NOTE(review): the lane offset is computed as tid.x & (t_per_atom - 1), which
// matches tid.x % t_per_atom only for power-of-two t_per_atom -- confirm the
// host never launches with another value.
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair___val_paramengv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<63>;
.reg .f32 %f<111>;
.reg .pred %p<21>;
// Shared copy of the four scale factors read from sp_lj_in.
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
// Reduction scratch: six rows spaced 512 bytes (128 f32) apart, indexed by tid.x.
// NOTE(review): this layout presumably requires ntid.x <= 128 -- confirm with the launch code.
.shared .align 4 .b8 __cuda___cuda_local_var_32608_55_non_const_red_acc108[3072];
// __cuda_local_var_32543_10_non_const_f = 48
// __cuda_local_var_32545_9_non_const_virial = 16
.loc 16 31 0
$LDWbegin_kernel_pair:
// ---- Copy the four scale factors from global to shared memory.
// Every thread performs the same loads and the same vector store.
.loc 16 36 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 37 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 38 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 39 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
// ---- Zero the six virial accumulators (%f6,%f8,%f10,%f12,%f14,%f15/%f16).
.loc 16 46 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// ---- ii (%r8) = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); exit if ii >= inum.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
setp.ge.s32 %p1, %r8, %r9;
@%p1 bra $Lt_0_28930;
// ---- Locate this thread's slice of the neighbor data.
// %rd7 = &dev_nbor[ii], %rd8 = %rd7 + nbor_pitch elements, %r11 = count loaded from %rd8,
// %r13 = lane offset = tid.x & (t_per_atom - 1).
.loc 16 51 0
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
cvt.s64.s32 %rd4, %r8;
mul.wide.s32 %rd5, %r8, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r11, [%rd8+0];
sub.s32 %r12, %r1, 1;
and.b32 %r13, %r12, %r2;
cvt.s64.s32 %rd9, %r13;
mul.wide.s32 %rd10, %r13, 4;
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd11, %rd6;
@%p2 bra $Lt_0_20994;
// dev_packed == dev_nbor: indices live inside dev_nbor itself.
// Loop stride %r16 = nbor_pitch * t_per_atom elements; %rd20 = first entry for this
// lane, %rd19 = end pointer derived from the count.
cvt.s32.s64 %r14, %rd2;
mul.lo.s32 %r15, %r14, %r1;
mov.s32 %r16, %r15;
mul.lo.s32 %r17, %r12, %r8;
add.s32 %r18, %r14, %r17;
cvt.s64.s32 %rd12, %r18;
mul.wide.s32 %rd13, %r18, 4;
add.u64 %rd14, %rd8, %rd13;
and.b32 %r19, %r12, %r11;
cvt.s64.s32 %rd15, %r19;
div.s32 %r20, %r11, %r1;
mul.lo.s32 %r21, %r15, %r20;
cvt.s64.s32 %rd16, %r21;
add.u64 %rd17, %rd15, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd14, %rd18;
add.u64 %rd20, %rd10, %rd14;
bra.uni $Lt_0_20738;
$Lt_0_20994:
// dev_packed != dev_nbor: the third dev_nbor row holds an element offset into
// dev_packed; the list is contiguous there, so the loop stride is t_per_atom.
add.u64 %rd21, %rd3, %rd8;
ld.global.s32 %r22, [%rd21+0];
cvt.s64.s32 %rd22, %r22;
mul.wide.s32 %rd23, %r22, 4;
add.u64 %rd24, %rd11, %rd23;
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 4;
add.u64 %rd19, %rd24, %rd26;
mov.s32 %r16, %r1;
add.u64 %rd20, %rd10, %rd24;
$Lt_0_20738:
// ---- Fetch the record of atom i = dev_nbor[ii] from pos_tex into %f21..%f24.
.loc 16 54 0
ld.global.s32 %r23, [%rd7+0];
mov.u32 %r24, %r23;
mov.s32 %r25, 0;
mov.u32 %r26, %r25;
mov.s32 %r27, 0;
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// Skip the loop entirely when this lane's start pointer is already at/after the end.
setp.ge.u64 %p3, %rd20, %rd19;
@%p3 bra $Lt_0_30466;
// Loop invariants: %r33 = lj_types * (int)record_i[3], %rd28 = lj1 base,
// %f27/%f26/%f25 = force accumulators, %f28 = energy accumulator.
cvt.rzi.ftz.s32.f32 %r31, %f24;
cvt.s64.s32 %rd27, %r16;
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r33, %r32, %r31;
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
$Lt_0_21762:
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
// ---- Per-neighbor work. The top two bits of the packed index select one of the
// four shared scale factors (%f29); the low 30 bits are the index j.
.loc 16 60 0
ld.global.s32 %r34, [%rd20+0];
.loc 16 61 0
shr.s32 %r35, %r34, 30;
and.b32 %r36, %r35, 3;
cvt.s64.s32 %rd30, %r36;
mul.wide.s32 %rd31, %r36, 4;
add.u64 %rd32, %rd29, %rd31;
ld.shared.f32 %f29, [%rd32+0];
.loc 16 64 0
and.b32 %r37, %r34, 1073741823;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
mov.s32 %r43, 0;
mov.u32 %r44, %r43;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
// Differences i - j: %f39 (component 0), %f38 (component 1), %f40 (component 2);
// %f43 = squared distance; %rd35 = &lj1[%r33 + (int)record_j[3]].
cvt.rzi.ftz.s32.f32 %r45, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
add.s32 %r46, %r45, %r33;
cvt.s64.s32 %rd33, %r46;
mul.wide.s32 %rd34, %r46, 16;
add.u64 %rd35, %rd34, %rd28;
ld.global.f32 %f44, [%rd35+0];
// Only pairs whose squared distance is below lj1[..]+0 contribute.
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_24066;
// %f45 = 1 / squared distance (approximate). The selector at lj1[..]+4 picks the
// pair of powers (%f49, %f50) used by the force and energy expressions:
//   selector == 2.0 : %f49 = %f45^2,          %f50 = %f45^4
//   selector == 1.0 : %f50 = %f45*sqrt(%f45), %f49 = %f50^2
//   otherwise       : %f49 = %f50 = %f45^3
rcp.approx.ftz.f32 %f45, %f43;
ld.global.f32 %f46, [%rd35+4];
mov.f32 %f47, 0f40000000; // 2
setp.eq.ftz.f32 %p5, %f46, %f47;
@!%p5 bra $Lt_0_22786;
.loc 16 79 0
mul.ftz.f32 %f48, %f45, %f45;
mov.f32 %f49, %f48;
.loc 16 80 0
mul.ftz.f32 %f50, %f48, %f48;
bra.uni $Lt_0_23042;
$Lt_0_22786:
mov.f32 %f51, 0f3f800000; // 1
setp.eq.ftz.f32 %p6, %f46, %f51;
@!%p6 bra $Lt_0_23298;
.loc 16 82 0
sqrt.approx.ftz.f32 %f52, %f45;
mul.ftz.f32 %f53, %f45, %f52;
mov.f32 %f50, %f53;
.loc 16 83 0
mul.ftz.f32 %f49, %f53, %f53;
bra.uni $Lt_0_23042;
$Lt_0_23298:
.loc 16 85 0
mul.ftz.f32 %f54, %f45, %f45;
mul.ftz.f32 %f55, %f45, %f54;
mov.f32 %f49, %f55;
.loc 16 86 0
mov.f32 %f50, %f55;
$Lt_0_23042:
$Lt_0_22530:
// Scalar force %f62 = %f29 * %f45 * %f49 * (lj1[+8] * %f50 - lj1[+12]);
// the force accumulators gain difference * %f62 per component.
.loc 16 88 0
mul.ftz.f32 %f56, %f45, %f29;
mul.ftz.f32 %f57, %f49, %f56;
ld.global.v2.f32 {%f58,%f59}, [%rd35+8];
mul.ftz.f32 %f60, %f58, %f50;
sub.ftz.f32 %f61, %f60, %f59;
mul.ftz.f32 %f62, %f57, %f61;
.loc 16 90 0
fma.rn.ftz.f32 %f27, %f39, %f62, %f27;
.loc 16 91 0
fma.rn.ftz.f32 %f26, %f38, %f62, %f26;
.loc 16 92 0
fma.rn.ftz.f32 %f25, %f40, %f62, %f25;
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
mov.u32 %r48, 0;
setp.le.s32 %p7, %r47, %r48;
@%p7 bra $Lt_0_23554;
// eflag > 0: energy += %f29 * %f49 * (lj3[+0] * %f50 - lj3[+4]) - lj3[+8].
.loc 16 94 0
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
add.u64 %rd37, %rd36, %rd34;
ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd37+0];
mul.ftz.f32 %f66, %f29, %f49;
mul.ftz.f32 %f67, %f63, %f50;
sub.ftz.f32 %f68, %f67, %f64;
mul.ftz.f32 %f69, %f66, %f68;
sub.ftz.f32 %f70, %f69, %f65;
add.ftz.f32 %f28, %f28, %f70;
$Lt_0_23554:
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
mov.u32 %r50, 0;
setp.le.s32 %p8, %r49, %r50;
@%p8 bra $Lt_0_24066;
// vflag > 0: accumulate the six products of difference components times %f62,
// in the order d0*d0, d1*d1, d2*d2, d0*d1, d0*d2, d1*d2.
.loc 16 97 0
mov.f32 %f71, %f6;
mul.ftz.f32 %f72, %f39, %f39;
fma.rn.ftz.f32 %f73, %f62, %f72, %f71;
mov.f32 %f6, %f73;
.loc 16 98 0
mov.f32 %f74, %f8;
fma.rn.ftz.f32 %f75, %f62, %f41, %f74;
mov.f32 %f8, %f75;
.loc 16 99 0
mov.f32 %f76, %f10;
mul.ftz.f32 %f77, %f40, %f40;
fma.rn.ftz.f32 %f78, %f62, %f77, %f76;
mov.f32 %f10, %f78;
.loc 16 100 0
mov.f32 %f79, %f12;
mul.ftz.f32 %f80, %f38, %f39;
fma.rn.ftz.f32 %f81, %f62, %f80, %f79;
mov.f32 %f12, %f81;
.loc 16 101 0
mov.f32 %f82, %f14;
mul.ftz.f32 %f83, %f39, %f40;
fma.rn.ftz.f32 %f84, %f62, %f83, %f82;
mov.f32 %f14, %f84;
.loc 16 102 0
mul.ftz.f32 %f85, %f38, %f40;
fma.rn.ftz.f32 %f15, %f62, %f85, %f15;
mov.f32 %f16, %f15;
$Lt_0_24066:
$Lt_0_22018:
// Advance this lane's pointer by the stride (%rd27 elements) and loop while below the end.
.loc 16 58 0
mul.lo.u64 %rd38, %rd27, 4;
add.u64 %rd20, %rd20, %rd38;
setp.lt.u64 %p9, %rd20, %rd19;
@%p9 bra $Lt_0_21762;
bra.uni $Lt_0_21250;
$Lt_0_30466:
// Empty-list path: force and energy accumulators start (and stay) at zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_21250:
// ---- Combine partial sums across the t_per_atom lanes (skipped when t_per_atom <= 1).
mov.u32 %r51, 1;
setp.le.s32 %p10, %r1, %r51;
@%p10 bra $Lt_0_26882;
// Pass 1: the three force components and the energy, one 512-byte row each.
// NOTE(review): there is no bar.sync between the halving steps; this presumably
// relies on all cooperating lanes executing in lock-step within one warp -- confirm.
.loc 16 107 0
mov.u64 %rd39, __cuda___cuda_local_var_32608_55_non_const_red_acc108;
cvt.s64.s32 %rd40, %r2;
mul.wide.s32 %rd41, %r2, 4;
add.u64 %rd42, %rd39, %rd41;
mov.f32 %f86, %f27;
st.shared.f32 [%rd42+0], %f86;
mov.f32 %f87, %f26;
st.shared.f32 [%rd42+512], %f87;
mov.f32 %f88, %f25;
st.shared.f32 [%rd42+1024], %f88;
mov.f32 %f89, %f28;
st.shared.f32 [%rd42+1536], %f89;
// %r56 = t_per_atom / 2 (signed, rounding toward zero): the initial step width.
shr.s32 %r52, %r1, 31;
mov.s32 %r53, 1;
and.b32 %r54, %r52, %r53;
add.s32 %r55, %r54, %r1;
shr.s32 %r56, %r55, 1;
mov.s32 %r57, %r56;
mov.u32 %r58, 0;
setp.ne.u32 %p11, %r56, %r58;
@!%p11 bra $Lt_0_25346;
$Lt_0_25858:
// Lanes with offset < step add the values stored by the lane 'step' above them.
setp.ge.u32 %p12, %r13, %r57;
@%p12 bra $Lt_0_26114;
add.u32 %r59, %r2, %r57;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd39, %rd44;
ld.shared.f32 %f90, [%rd45+0];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd42+0], %f86;
ld.shared.f32 %f91, [%rd45+512];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd42+512], %f87;
ld.shared.f32 %f92, [%rd45+1024];
add.ftz.f32 %f88, %f92, %f88;
st.shared.f32 [%rd42+1024], %f88;
ld.shared.f32 %f93, [%rd45+1536];
add.ftz.f32 %f89, %f93, %f89;
st.shared.f32 [%rd42+1536], %f89;
$Lt_0_26114:
shr.u32 %r57, %r57, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p13, %r57, %r60;
@%p13 bra $Lt_0_25858;
$Lt_0_25346:
mov.f32 %f27, %f86;
mov.f32 %f26, %f87;
mov.f32 %f25, %f88;
mov.f32 %f28, %f89;
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
mov.u32 %r62, 0;
setp.le.s32 %p14, %r61, %r62;
@%p14 bra $Lt_0_26882;
// Pass 2 (vflag > 0 only): the six virial terms, reusing the same scratch rows.
mov.f32 %f86, %f6;
st.shared.f32 [%rd42+0], %f86;
mov.f32 %f87, %f8;
st.shared.f32 [%rd42+512], %f87;
mov.f32 %f88, %f10;
st.shared.f32 [%rd42+1024], %f88;
mov.f32 %f89, %f12;
st.shared.f32 [%rd42+1536], %f89;
mov.f32 %f94, %f14;
st.shared.f32 [%rd42+2048], %f94;
mov.f32 %f95, %f15;
st.shared.f32 [%rd42+2560], %f95;
mov.s32 %r63, %r56;
@!%p11 bra $Lt_0_27394;
$Lt_0_27906:
setp.ge.u32 %p15, %r13, %r63;
@%p15 bra $Lt_0_28162;
add.u32 %r64, %r2, %r63;
cvt.u64.u32 %rd46, %r64;
mul.wide.u32 %rd47, %r64, 4;
add.u64 %rd48, %rd39, %rd47;
ld.shared.f32 %f96, [%rd48+0];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd42+0], %f86;
ld.shared.f32 %f97, [%rd48+512];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd42+512], %f87;
ld.shared.f32 %f98, [%rd48+1024];
add.ftz.f32 %f88, %f98, %f88;
st.shared.f32 [%rd42+1024], %f88;
ld.shared.f32 %f99, [%rd48+1536];
add.ftz.f32 %f89, %f99, %f89;
st.shared.f32 [%rd42+1536], %f89;
ld.shared.f32 %f100, [%rd48+2048];
add.ftz.f32 %f94, %f100, %f94;
st.shared.f32 [%rd42+2048], %f94;
ld.shared.f32 %f101, [%rd48+2560];
add.ftz.f32 %f95, %f101, %f95;
st.shared.f32 [%rd42+2560], %f95;
$Lt_0_28162:
shr.u32 %r63, %r63, 1;
mov.u32 %r65, 0;
setp.ne.u32 %p16, %r63, %r65;
@%p16 bra $Lt_0_27906;
$Lt_0_27394:
mov.f32 %f6, %f86;
mov.f32 %f8, %f87;
mov.f32 %f10, %f88;
mov.f32 %f12, %f89;
mov.f32 %f14, %f94;
mov.f32 %f16, %f95;
$Lt_0_26882:
$Lt_0_24834:
// ---- Output: only the lane with offset 0 writes.
mov.u32 %r66, 0;
setp.ne.s32 %p17, %r13, %r66;
@%p17 bra $Lt_0_28930;
// %rd50 walks __val_paramengv starting at element ii, advancing inum elements per value.
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
add.u64 %rd50, %rd49, %rd5;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p18, %r67, %r68;
@%p18 bra $Lt_0_29442;
st.global.f32 [%rd50+0], %f28;
cvt.s64.s32 %rd51, %r9;
mul.wide.s32 %rd52, %r9, 4;
add.u64 %rd50, %rd50, %rd52;
$Lt_0_29442:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_0_29954;
mov.f32 %f102, %f6;
st.global.f32 [%rd50+0], %f102;
cvt.s64.s32 %rd53, %r9;
mul.wide.s32 %rd54, %r9, 4;
add.u64 %rd55, %rd54, %rd50;
mov.f32 %f103, %f8;
st.global.f32 [%rd55+0], %f103;
add.u64 %rd56, %rd54, %rd55;
mov.f32 %f104, %f10;
st.global.f32 [%rd56+0], %f104;
add.u64 %rd57, %rd54, %rd56;
mov.f32 %f105, %f12;
st.global.f32 [%rd57+0], %f105;
add.u64 %rd50, %rd54, %rd57;
mov.f32 %f106, %f14;
st.global.f32 [%rd50+0], %f106;
mov.f32 %f107, %f16;
add.u64 %rd58, %rd54, %rd50;
st.global.f32 [%rd58+0], %f107;
$Lt_0_29954:
// Store the force record at ans + ii*16. The fourth component used to be copied
// from %f109, a register this kernel never writes, so an undefined value reached
// global memory; an explicit zero keeps the output deterministic.
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd60, %rd4, 16;
add.u64 %rd61, %rd59, %rd60;
mov.f32 %f108, 0f00000000; // 0
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f108};
$Lt_0_28930:
$Lt_0_20226:
.loc 16 110 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------
// kernel_pair_fast
// Compiler-generated entry point for lal_cg_cmm.cu (see the ".loc 16" lines).
// Same per-atom accumulation as the global-memory kernel, but the coefficient
// records are first staged in shared memory and indexed with a fixed row
// stride of 11 (the shared tables hold 121 16-byte records = 1936 bytes).
//
// Parameters (as used by the instructions below):
//   x_              u64, declared but never loaded in this body; the
//                   four-component records are fetched through pos_tex.
//   lj1_in          u64 global pointer; threads with tid.x <= 120 each copy one
//                   16-byte record into the shared lj1 table.
//   lj3_in          u64 global pointer; copied the same way, only when eflag > 0.
//   sp_lj_in        u64 global pointer to four f32 scale factors, copied by
//                   threads with tid.x <= 3.
//   dev_nbor        u64 global pointer to s32 data: row 0 holds the atom index,
//                   row 1 (nbor_pitch elements later) holds the neighbor count.
//   dev_packed      u64 global pointer; when it differs from dev_nbor the
//                   neighbor indices are read from it instead.
//   ans             u64 global pointer; one 16-byte record stored per ii.
//   __val_paramengv u64 global pointer to f32 outputs, stride inum elements.
//   eflag, vflag    s32; energy / virial work is performed only when > 0.
//   inum            s32; threads whose ii >= inum exit after the barrier.
//   nbor_pitch      s32 element stride between rows of dev_nbor.
//   t_per_atom      s32 number of threads cooperating on one ii.
//
// NOTE(review): filling all 121 table records needs at least 121 threads per
// block, and the lane offset tid.x & (t_per_atom - 1) matches a modulo only for
// power-of-two t_per_atom -- confirm both against the launch code.
//-----------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<75>;
.reg .f32 %f<118>;
.reg .pred %p<24>;
// Shared copies: four scale factors, then the lj1 and lj3 tables (121 records each).
.shared .align 4 .b8 __cuda___cuda_local_var_32625_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32623_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32624_34_non_const_lj35232[1936];
// Reduction scratch: six rows spaced 512 bytes (128 f32) apart, indexed by tid.x.
// NOTE(review): this layout presumably requires ntid.x <= 128 -- confirm with the launch code.
.shared .align 4 .b8 __cuda___cuda_local_var_32702_55_non_const_red_acc7168[3072];
// __cuda_local_var_32635_10_non_const_f = 48
// __cuda_local_var_32637_9_non_const_virial = 16
.loc 16 118 0
$LDWbegin_kernel_pair_fast:
// ---- Stage 1: threads 0..3 copy one scale factor each into shared memory.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_22530;
.loc 16 126 0
mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_22530:
mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;
// ---- Stage 2: threads 0..120 copy one lj1 record each (and one lj3 record when eflag > 0).
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_23042;
.loc 16 128 0
mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_23554;
.loc 16 130 0
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_23554:
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
$Lt_1_23042:
// The table base addresses are re-materialised here so they are defined on every path.
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;
// ---- Zero the six virial accumulators (%f11,%f13,%f15,%f17,%f19,%f20/%f21).
.loc 16 138 0
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
// Barrier: the shared tables must be complete before any thread reads them.
// It is placed before the ii >= inum early exit, so every thread reaches it.
.loc 16 140 0
bar.sync 0;
// ---- ii (%r12) = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); exit if ii >= inum.
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
cvt.s32.u32 %r10, %ctaid.x;
mul.lo.s32 %r11, %r10, %r9;
add.s32 %r12, %r7, %r11;
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
setp.ge.s32 %p4, %r12, %r13;
@%p4 bra $Lt_1_32770;
// ---- Locate this thread's slice of the neighbor data.
// %rd22 = &dev_nbor[ii], %rd23 = %rd22 + nbor_pitch elements, %r15 = count loaded from %rd23,
// %r17 = lane offset = tid.x & (t_per_atom - 1).
.loc 16 145 0
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r14;
mul.wide.s32 %rd18, %r14, 4;
cvt.s64.s32 %rd19, %r12;
mul.wide.s32 %rd20, %r12, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r15, [%rd23+0];
sub.s32 %r16, %r6, 1;
and.b32 %r17, %r16, %r1;
cvt.s64.s32 %rd24, %r17;
mul.wide.s32 %rd25, %r17, 4;
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd26, %rd21;
@%p5 bra $Lt_1_24834;
// dev_packed == dev_nbor: indices live inside dev_nbor itself.
// Loop stride %r20 = nbor_pitch * t_per_atom elements; %rd35 = first entry for this
// lane, %rd34 = end pointer derived from the count.
cvt.s32.s64 %r18, %rd17;
mul.lo.s32 %r19, %r18, %r6;
mov.s32 %r20, %r19;
mul.lo.s32 %r21, %r16, %r12;
add.s32 %r22, %r18, %r21;
cvt.s64.s32 %rd27, %r22;
mul.wide.s32 %rd28, %r22, 4;
add.u64 %rd29, %rd23, %rd28;
and.b32 %r23, %r16, %r15;
cvt.s64.s32 %rd30, %r23;
div.s32 %r24, %r15, %r6;
mul.lo.s32 %r25, %r19, %r24;
cvt.s64.s32 %rd31, %r25;
add.u64 %rd32, %rd30, %rd31;
mul.lo.u64 %rd33, %rd32, 4;
add.u64 %rd34, %rd29, %rd33;
add.u64 %rd35, %rd25, %rd29;
bra.uni $Lt_1_24578;
$Lt_1_24834:
// dev_packed != dev_nbor: the third dev_nbor row holds an element offset into
// dev_packed; the list is contiguous there, so the loop stride is t_per_atom.
add.u64 %rd36, %rd18, %rd23;
ld.global.s32 %r26, [%rd36+0];
cvt.s64.s32 %rd37, %r26;
mul.wide.s32 %rd38, %r26, 4;
add.u64 %rd39, %rd26, %rd38;
cvt.s64.s32 %rd40, %r15;
mul.wide.s32 %rd41, %r15, 4;
add.u64 %rd34, %rd39, %rd41;
mov.s32 %r20, %r6;
add.u64 %rd35, %rd25, %rd39;
$Lt_1_24578:
// ---- Fetch the record of atom i = dev_nbor[ii] from pos_tex into %f26..%f29.
.loc 16 148 0
ld.global.s32 %r27, [%rd22+0];
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
mov.s32 %r31, 0;
mov.u32 %r32, %r31;
mov.s32 %r33, 0;
mov.u32 %r34, %r33;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
// Skip the loop entirely when this lane's start pointer is already at/after the end.
setp.ge.u64 %p6, %rd35, %rd34;
@%p6 bra $Lt_1_34306;
// Loop invariants: %f30 = (float)((int)record_i[3] * 11) -- the table row base is
// kept as a float and added to record_j[3] before conversion to an integer index;
// %f33/%f32/%f31 = force accumulators, %f34 = energy accumulator.
cvt.rzi.ftz.s32.f32 %r35, %f29;
cvt.s64.s32 %rd42, %r20;
mul.lo.s32 %r36, %r35, 11;
cvt.rn.f32.s32 %f30, %r36;
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_25602:
//<loop> Loop body line 148, nesting depth: 1, estimated iterations: unknown
// ---- Per-neighbor work. The top two bits of the packed index select one of the
// four shared scale factors (%f35); the low 30 bits are the index j.
.loc 16 155 0
ld.global.s32 %r37, [%rd35+0];
.loc 16 156 0
shr.s32 %r38, %r37, 30;
and.b32 %r39, %r38, 3;
cvt.s64.s32 %rd43, %r39;
mul.wide.s32 %rd44, %r39, 4;
add.u64 %rd45, %rd1, %rd44;
ld.shared.f32 %f35, [%rd45+0];
.loc 16 159 0
and.b32 %r40, %r37, 1073741823;
mov.u32 %r41, %r40;
mov.s32 %r42, 0;
mov.u32 %r43, %r42;
mov.s32 %r44, 0;
mov.u32 %r45, %r44;
mov.s32 %r46, 0;
mov.u32 %r47, %r46;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
// Differences i - j: %f45 (component 0), %f44 (component 1), %f46 (component 2);
// %f49 = squared distance; %rd48 = &shared_lj1[(int)(%f30 + record_j[3])].
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r48, %f50;
cvt.s64.s32 %rd46, %r48;
mul.wide.s32 %rd47, %r48, 16;
add.u64 %rd48, %rd47, %rd7;
ld.shared.f32 %f51, [%rd48+0];
// Only pairs whose squared distance is below the record's first value contribute.
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_27906;
// %f52 = 1 / squared distance (approximate). The selector at record+4 picks the
// pair of powers (%f56, %f57) used by the force and energy expressions:
//   selector == 2.0 : %f56 = %f52^2,          %f57 = %f52^4
//   selector == 1.0 : %f57 = %f52*sqrt(%f52), %f56 = %f57^2
//   otherwise       : %f56 = %f57 = %f52^3
rcp.approx.ftz.f32 %f52, %f49;
ld.shared.f32 %f53, [%rd48+4];
mov.f32 %f54, 0f40000000; // 2
setp.eq.ftz.f32 %p8, %f53, %f54;
@!%p8 bra $Lt_1_26626;
.loc 16 173 0
mul.ftz.f32 %f55, %f52, %f52;
mov.f32 %f56, %f55;
.loc 16 174 0
mul.ftz.f32 %f57, %f55, %f55;
bra.uni $Lt_1_26882;
$Lt_1_26626:
mov.f32 %f58, 0f3f800000; // 1
setp.eq.ftz.f32 %p9, %f53, %f58;
@!%p9 bra $Lt_1_27138;
.loc 16 176 0
sqrt.approx.ftz.f32 %f59, %f52;
mul.ftz.f32 %f60, %f52, %f59;
mov.f32 %f57, %f60;
.loc 16 177 0
mul.ftz.f32 %f56, %f60, %f60;
bra.uni $Lt_1_26882;
$Lt_1_27138:
.loc 16 179 0
mul.ftz.f32 %f61, %f52, %f52;
mul.ftz.f32 %f62, %f52, %f61;
mov.f32 %f56, %f62;
.loc 16 180 0
mov.f32 %f57, %f62;
$Lt_1_26882:
$Lt_1_26370:
// Scalar force %f69 = %f35 * %f52 * %f56 * (record[+8] * %f57 - record[+12]);
// the force accumulators gain difference * %f69 per component.
.loc 16 182 0
mul.ftz.f32 %f63, %f52, %f35;
mul.ftz.f32 %f64, %f56, %f63;
ld.shared.v2.f32 {%f65,%f66}, [%rd48+8];
mul.ftz.f32 %f67, %f65, %f57;
sub.ftz.f32 %f68, %f67, %f66;
mul.ftz.f32 %f69, %f64, %f68;
.loc 16 184 0
fma.rn.ftz.f32 %f33, %f45, %f69, %f33;
.loc 16 185 0
fma.rn.ftz.f32 %f32, %f44, %f69, %f32;
.loc 16 186 0
fma.rn.ftz.f32 %f31, %f46, %f69, %f31;
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r50, 0;
setp.le.s32 %p10, %r49, %r50;
@%p10 bra $Lt_1_27394;
// eflag > 0: energy += %f35 * %f56 * (lj3[+0] * %f57 - lj3[+4]) - lj3[+8],
// read from the shared lj3 table at the same record index.
.loc 16 188 0
add.u64 %rd49, %rd47, %rd13;
ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd49+0];
mul.ftz.f32 %f73, %f35, %f56;
mul.ftz.f32 %f74, %f70, %f57;
sub.ftz.f32 %f75, %f74, %f71;
mul.ftz.f32 %f76, %f73, %f75;
sub.ftz.f32 %f77, %f76, %f72;
add.ftz.f32 %f34, %f34, %f77;
$Lt_1_27394:
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r52, 0;
setp.le.s32 %p11, %r51, %r52;
@%p11 bra $Lt_1_27906;
// vflag > 0: accumulate the six products of difference components times %f69,
// in the order d0*d0, d1*d1, d2*d2, d0*d1, d0*d2, d1*d2.
.loc 16 191 0
mov.f32 %f78, %f11;
mul.ftz.f32 %f79, %f45, %f45;
fma.rn.ftz.f32 %f80, %f69, %f79, %f78;
mov.f32 %f11, %f80;
.loc 16 192 0
mov.f32 %f81, %f13;
fma.rn.ftz.f32 %f82, %f69, %f47, %f81;
mov.f32 %f13, %f82;
.loc 16 193 0
mov.f32 %f83, %f15;
mul.ftz.f32 %f84, %f46, %f46;
fma.rn.ftz.f32 %f85, %f69, %f84, %f83;
mov.f32 %f15, %f85;
.loc 16 194 0
mov.f32 %f86, %f17;
mul.ftz.f32 %f87, %f44, %f45;
fma.rn.ftz.f32 %f88, %f69, %f87, %f86;
mov.f32 %f17, %f88;
.loc 16 195 0
mov.f32 %f89, %f19;
mul.ftz.f32 %f90, %f45, %f46;
fma.rn.ftz.f32 %f91, %f69, %f90, %f89;
mov.f32 %f19, %f91;
.loc 16 196 0
mul.ftz.f32 %f92, %f44, %f46;
fma.rn.ftz.f32 %f20, %f69, %f92, %f20;
mov.f32 %f21, %f20;
$Lt_1_27906:
$Lt_1_25858:
// Advance this lane's pointer by the stride (%rd42 elements) and loop while below the end.
.loc 16 153 0
mul.lo.u64 %rd50, %rd42, 4;
add.u64 %rd35, %rd35, %rd50;
setp.lt.u64 %p12, %rd35, %rd34;
@%p12 bra $Lt_1_25602;
bra.uni $Lt_1_25090;
$Lt_1_34306:
// Empty-list path: force and energy accumulators start (and stay) at zero.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_25090:
// ---- Combine partial sums across the t_per_atom lanes (skipped when t_per_atom <= 1).
mov.u32 %r53, 1;
setp.le.s32 %p13, %r6, %r53;
@%p13 bra $Lt_1_30722;
// Pass 1: the three force components and the energy, one 512-byte row each.
// NOTE(review): there is no bar.sync between the halving steps; this presumably
// relies on all cooperating lanes executing in lock-step within one warp -- confirm.
.loc 16 201 0
mov.u64 %rd51, __cuda___cuda_local_var_32702_55_non_const_red_acc7168;
cvt.s64.s32 %rd52, %r1;
mul.wide.s32 %rd53, %r1, 4;
add.u64 %rd54, %rd51, %rd53;
mov.f32 %f93, %f33;
st.shared.f32 [%rd54+0], %f93;
mov.f32 %f94, %f32;
st.shared.f32 [%rd54+512], %f94;
mov.f32 %f95, %f31;
st.shared.f32 [%rd54+1024], %f95;
mov.f32 %f96, %f34;
st.shared.f32 [%rd54+1536], %f96;
// %r58 = t_per_atom / 2 (signed, rounding toward zero): the initial step width.
shr.s32 %r54, %r6, 31;
mov.s32 %r55, 1;
and.b32 %r56, %r54, %r55;
add.s32 %r57, %r56, %r6;
shr.s32 %r58, %r57, 1;
mov.s32 %r59, %r58;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@!%p14 bra $Lt_1_29186;
$Lt_1_29698:
// Lanes with offset < step add the values stored by the lane 'step' above them.
setp.ge.u32 %p15, %r17, %r59;
@%p15 bra $Lt_1_29954;
add.u32 %r61, %r1, %r59;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd51, %rd56;
ld.shared.f32 %f97, [%rd57+0];
add.ftz.f32 %f93, %f97, %f93;
st.shared.f32 [%rd54+0], %f93;
ld.shared.f32 %f98, [%rd57+512];
add.ftz.f32 %f94, %f98, %f94;
st.shared.f32 [%rd54+512], %f94;
ld.shared.f32 %f99, [%rd57+1024];
add.ftz.f32 %f95, %f99, %f95;
st.shared.f32 [%rd54+1024], %f95;
ld.shared.f32 %f100, [%rd57+1536];
add.ftz.f32 %f96, %f100, %f96;
st.shared.f32 [%rd54+1536], %f96;
$Lt_1_29954:
shr.u32 %r59, %r59, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p16, %r59, %r62;
@%p16 bra $Lt_1_29698;
$Lt_1_29186:
mov.f32 %f33, %f93;
mov.f32 %f32, %f94;
mov.f32 %f31, %f95;
mov.f32 %f34, %f96;
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r64, 0;
setp.le.s32 %p17, %r63, %r64;
@%p17 bra $Lt_1_30722;
// Pass 2 (vflag > 0 only): the six virial terms, reusing the same scratch rows.
mov.f32 %f93, %f11;
st.shared.f32 [%rd54+0], %f93;
mov.f32 %f94, %f13;
st.shared.f32 [%rd54+512], %f94;
mov.f32 %f95, %f15;
st.shared.f32 [%rd54+1024], %f95;
mov.f32 %f96, %f17;
st.shared.f32 [%rd54+1536], %f96;
mov.f32 %f101, %f19;
st.shared.f32 [%rd54+2048], %f101;
mov.f32 %f102, %f20;
st.shared.f32 [%rd54+2560], %f102;
mov.s32 %r65, %r58;
@!%p14 bra $Lt_1_31234;
$Lt_1_31746:
setp.ge.u32 %p18, %r17, %r65;
@%p18 bra $Lt_1_32002;
add.u32 %r66, %r1, %r65;
cvt.u64.u32 %rd58, %r66;
mul.wide.u32 %rd59, %r66, 4;
add.u64 %rd60, %rd51, %rd59;
ld.shared.f32 %f103, [%rd60+0];
add.ftz.f32 %f93, %f103, %f93;
st.shared.f32 [%rd54+0], %f93;
ld.shared.f32 %f104, [%rd60+512];
add.ftz.f32 %f94, %f104, %f94;
st.shared.f32 [%rd54+512], %f94;
ld.shared.f32 %f105, [%rd60+1024];
add.ftz.f32 %f95, %f105, %f95;
st.shared.f32 [%rd54+1024], %f95;
ld.shared.f32 %f106, [%rd60+1536];
add.ftz.f32 %f96, %f106, %f96;
st.shared.f32 [%rd54+1536], %f96;
ld.shared.f32 %f107, [%rd60+2048];
add.ftz.f32 %f101, %f107, %f101;
st.shared.f32 [%rd54+2048], %f101;
ld.shared.f32 %f108, [%rd60+2560];
add.ftz.f32 %f102, %f108, %f102;
st.shared.f32 [%rd54+2560], %f102;
$Lt_1_32002:
shr.u32 %r65, %r65, 1;
mov.u32 %r67, 0;
setp.ne.u32 %p19, %r65, %r67;
@%p19 bra $Lt_1_31746;
$Lt_1_31234:
mov.f32 %f11, %f93;
mov.f32 %f13, %f94;
mov.f32 %f15, %f95;
mov.f32 %f17, %f96;
mov.f32 %f19, %f101;
mov.f32 %f21, %f102;
$Lt_1_30722:
$Lt_1_28674:
// ---- Output: only the lane with offset 0 writes.
mov.u32 %r68, 0;
setp.ne.s32 %p20, %r17, %r68;
@%p20 bra $Lt_1_32770;
// %rd62 walks __val_paramengv starting at element ii, advancing inum elements per value.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
add.u64 %rd62, %rd61, %rd20;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p21, %r69, %r70;
@%p21 bra $Lt_1_33282;
st.global.f32 [%rd62+0], %f34;
cvt.s64.s32 %rd63, %r13;
mul.wide.s32 %rd64, %r13, 4;
add.u64 %rd62, %rd62, %rd64;
$Lt_1_33282:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p22, %r71, %r72;
@%p22 bra $Lt_1_33794;
mov.f32 %f109, %f11;
st.global.f32 [%rd62+0], %f109;
cvt.s64.s32 %rd65, %r13;
mul.wide.s32 %rd66, %r13, 4;
add.u64 %rd67, %rd66, %rd62;
mov.f32 %f110, %f13;
st.global.f32 [%rd67+0], %f110;
add.u64 %rd68, %rd66, %rd67;
mov.f32 %f111, %f15;
st.global.f32 [%rd68+0], %f111;
add.u64 %rd69, %rd66, %rd68;
mov.f32 %f112, %f17;
st.global.f32 [%rd69+0], %f112;
add.u64 %rd62, %rd66, %rd69;
mov.f32 %f113, %f19;
st.global.f32 [%rd62+0], %f113;
mov.f32 %f114, %f21;
add.u64 %rd70, %rd66, %rd62;
st.global.f32 [%rd70+0], %f114;
$Lt_1_33794:
// Store the force record at ans + ii*16. The fourth component used to be copied
// from %f116, a register this kernel never writes, so an undefined value reached
// global memory; an explicit zero keeps the output deterministic.
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd72, %rd19, 16;
add.u64 %rd73, %rd71, %rd72;
mov.f32 %f115, 0f00000000; // 0
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f115};
$Lt_1_32770:
$Lt_1_24066:
.loc 16 204 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast
Event Timeline
Log In to Comment