Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F96436225
charmm_long.ptx
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Dec 26, 18:13
Size
37 KB
Mime Type
text/x-asm
Expires
Sat, Dec 28, 18:13 (1 d, 9 h)
Engine
blob
Format
Raw Data
Handle
23169713
Attached To
rLAMMPS lammps
charmm_long.ptx
View Options
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009e6b_00000000-9_lal_charmm_long.cpp3.i (/home/sjplimp/ccBI#.BwX2xw)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009e6b_00000000-8_lal_charmm_long.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_charmm_long.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
.global .texref q_tex;
//-----------------------------------------------------------
// kernel_pair
//
// For each entry ii < inum, t_per_atom consecutive threads walk the
// neighbor list of that entry, accumulate a pairwise force (a
// Lennard-Jones term with a switching function plus a Coulomb term
// built from a polynomial erfc approximation), optionally accumulate
// two energy terms (eflag > 0) and six virial terms (vflag > 0),
// combine the per-thread partial sums through shared memory, and the
// thread with offset 0 writes the results to engv and ans.
//
// Parameters (as used below):
//   x_, q_        : not read directly; positions/charges are fetched
//                   through the pos_tex / q_tex texture references
//   lj1           : table of 16-byte records, indexed itype*lj_types+jtype
//   lj_types      : row length of the lj1 table
//   sp_lj_in      : 8 floats copied into shared memory at kernel start
//   dev_nbor      : neighbor header/list storage
//   dev_packed    : packed neighbor list; compared with dev_nbor to
//                   select the list layout
//   ans           : 16-byte record per ii receiving the force
//   engv          : energy / virial output, strided by inum floats
//   eflag, vflag  : > 0 enables energy / virial accumulation and output
//   inum          : number of entries processed
//   nbor_pitch    : stride (in ints) between rows of dev_nbor
//   cut_* , denom_lj, qqrd2e, g_ewald : scalar coefficients
//   t_per_atom    : threads cooperating on one entry
//
// NOTE(review): the offset within a group is computed as
// tid & (t_per_atom-1), which matches tid % t_per_atom only when
// t_per_atom is a power of two -- confirm against the host launcher.
// NOTE(review): red_acc holds 6 rows of 128 floats indexed by tid.x and
// the reductions contain no bar.sync, so this presumably relies on
// ntid.x <= 128 and on each t_per_atom group living inside one warp --
// confirm against the host launcher.
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair___val_paramengv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .u64 __cudaparm_kernel_pair_q_,
.param .f32 __cudaparm_kernel_pair_cut_coulsq,
.param .f32 __cudaparm_kernel_pair_qqrd2e,
.param .f32 __cudaparm_kernel_pair_g_ewald,
.param .f32 __cudaparm_kernel_pair_denom_lj,
.param .f32 __cudaparm_kernel_pair_cut_bothsq,
.param .f32 __cudaparm_kernel_pair_cut_ljsq,
.param .f32 __cudaparm_kernel_pair_cut_lj_innersq,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<91>;
.reg .u64 %rd<65>;
.reg .f32 %f<190>;
.reg .pred %p<23>;
// sp_lj: 8 floats (32 bytes) of per-neighbor-class scale factors.
.shared .align 16 .b8 __cuda___cuda_local_var_32542_33_non_const_sp_lj120[32];
// red_acc: 6 rows x 128 floats (row stride 512 bytes) used by the reductions.
.shared .align 4 .b8 __cuda___cuda_local_var_32646_55_non_const_red_acc152[3072];
// __cuda_local_var_32554_10_non_const_f = 64
// __cuda_local_var_32556_9_non_const_virial = 16
// __cuda_local_var_32590_43_non_const_r6inv = 40
// __cuda_local_var_32590_50_non_const_prefactor = 52
// __cuda_local_var_32590_61_non_const__erfc = 48
// __cuda_local_var_32590_68_non_const_switch1 = 44
.loc 16 37 0
$LDWbegin_kernel_pair:
// ---- Copy sp_lj_in[0..7] from global to shared sp_lj (every thread
// ---- performs the same eight loads and two vector stores).
.loc 16 42 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 43 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 44 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 45 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32542_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4};
.loc 16 46 0
ld.global.f32 %f5, [%rd1+16];
.loc 16 47 0
ld.global.f32 %f6, [%rd1+20];
.loc 16 48 0
ld.global.f32 %f7, [%rd1+24];
.loc 16 49 0
ld.global.f32 %f8, [%rd1+28];
st.shared.v4.f32 [__cuda___cuda_local_var_32542_33_non_const_sp_lj120+16], {%f5,%f6,%f7,%f8};
// ---- Zero the six virial accumulators (%f10,%f12,%f14,%f16,%f18,%f20).
.loc 16 57 0
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
mov.f32 %f17, 0f00000000; // 0
mov.f32 %f18, %f17;
mov.f32 %f19, 0f00000000; // 0
mov.f32 %f20, %f19;
// ---- Work assignment:
// ----   %r2 = tid.x, %r1 = t_per_atom
// ----   %r8 = ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// ---- Threads with ii >= inum exit immediately.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
setp.ge.s32 %p1, %r8, %r9;
@%p1 bra $Lt_0_33026;
// ---- Neighbor header:
// ----   %r10 = dev_nbor[ii]               (index used for the i fetches)
// ----   %r12 = dev_nbor[ii + nbor_pitch]  (neighbor count)
// ----   %r14 = tid.x & (t_per_atom-1)     (this thread's offset in its group)
.loc 16 62 0
cvt.s64.s32 %rd2, %r8;
mul.wide.s32 %rd3, %r8, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd5, %rd3, %rd4;
ld.global.s32 %r10, [%rd5+0];
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd6, %r11;
mul.wide.s32 %rd7, %r11, 4;
add.u64 %rd8, %rd7, %rd5;
ld.global.s32 %r12, [%rd8+0];
sub.s32 %r13, %r1, 1;
and.b32 %r14, %r13, %r2;
cvt.s64.s32 %rd9, %r14;
mul.wide.s32 %rd10, %r14, 4;
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd11, %rd4;
@%p2 bra $Lt_0_24066;
// ---- dev_packed == dev_nbor: the list is stored inside dev_nbor.
// ----   stride (%r17) = nbor_pitch * t_per_atom elements
// ----   %rd20 = first element for this thread, %rd19 = end of list
cvt.s32.s64 %r15, %rd6;
mul.lo.s32 %r16, %r15, %r1;
mov.s32 %r17, %r16;
mul.lo.s32 %r18, %r13, %r8;
add.s32 %r19, %r15, %r18;
cvt.s64.s32 %rd12, %r19;
mul.wide.s32 %rd13, %r19, 4;
add.u64 %rd14, %rd8, %rd13;
and.b32 %r20, %r13, %r12;
cvt.s64.s32 %rd15, %r20;
div.s32 %r21, %r12, %r1;
mul.lo.s32 %r22, %r16, %r21;
cvt.s64.s32 %rd16, %r22;
add.u64 %rd17, %rd15, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd14, %rd18;
add.u64 %rd20, %rd10, %rd14;
bra.uni $Lt_0_23810;
$Lt_0_24066:
// ---- dev_packed != dev_nbor: dev_nbor[ii + 2*nbor_pitch] holds the start
// ---- index into dev_packed; the list is contiguous with stride t_per_atom.
add.u64 %rd21, %rd7, %rd8;
ld.global.s32 %r23, [%rd21+0];
cvt.s64.s32 %rd22, %r23;
mul.wide.s32 %rd23, %r23, 4;
add.u64 %rd24, %rd11, %rd23;
cvt.s64.s32 %rd25, %r12;
mul.wide.s32 %rd26, %r12, 4;
add.u64 %rd19, %rd24, %rd26;
mov.s32 %r17, %r1;
add.u64 %rd20, %rd10, %rd24;
$Lt_0_23810:
// ---- Fetch the i record from pos_tex into %f25..%f28
// ---- (%f25,%f26,%f27 are used as coordinates, %f28 as the type index).
.loc 16 65 0
mov.u32 %r24, %r10;
mov.s32 %r25, 0;
mov.u32 %r26, %r25;
mov.s32 %r27, 0;
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}];
mov.f32 %f25, %f21;
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
// ---- Fetch the i value from q_tex into %f33.
.loc 16 66 0
mov.u32 %r31, %r10;
mov.s32 %r32, 0;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}];
mov.f32 %f33, %f29;
// ---- Skip the loop entirely when this thread has no neighbors to visit.
setp.ge.u64 %p3, %rd20, %rd19;
@%p3 bra $Lt_0_34562;
// ---- Loop set-up: %rd27 = stride, %f34 = cut_bothsq,
// ---- force sums %f37/%f36/%f35 (x/y/z), energy sums %f39 and %f38.
cvt.s64.s32 %rd27, %r17;
ld.param.f32 %f34, [__cudaparm_kernel_pair_cut_bothsq];
mov.f32 %f35, 0f00000000; // 0
mov.f32 %f36, 0f00000000; // 0
mov.f32 %f37, 0f00000000; // 0
mov.f32 %f38, 0f00000000; // 0
mov.f32 %f39, 0f00000000; // 0
mov.u64 %rd28, __cuda___cuda_local_var_32542_33_non_const_sp_lj120;
$Lt_0_24834:
//<loop> Loop body line 66, nesting depth: 1, estimated iterations: unknown
// ---- Load the packed neighbor word; its top two bits select the
// ---- sp_lj entries, the low 30 bits are the neighbor index.
.loc 16 70 0
ld.global.s32 %r38, [%rd20+0];
.loc 16 73 0
shr.s32 %r39, %r38, 30;
and.b32 %r40, %r39, 3;
cvt.s64.s32 %rd29, %r40;
mul.wide.s32 %rd30, %r40, 4;
add.u64 %rd31, %rd28, %rd30;
// %f40 = sp_lj[bits]       (scale applied to the LJ term)
ld.shared.f32 %f40, [%rd31+0];
.loc 16 74 0
// %f43 = 1 - sp_lj[bits+4] (subtracted from the Coulomb term)
mov.f32 %f41, 0f3f800000; // 1
ld.shared.f32 %f42, [%rd31+16];
sub.ftz.f32 %f43, %f41, %f42;
.loc 16 77 0
and.b32 %r41, %r38, 1073741823;
mov.u32 %r42, %r41;
mov.s32 %r43, 0;
mov.u32 %r44, %r43;
mov.s32 %r45, 0;
mov.u32 %r46, %r45;
mov.s32 %r47, 0;
mov.u32 %r48, %r47;
tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r42,%r44,%r46,%r48}];
mov.f32 %f48, %f44;
mov.f32 %f49, %f45;
mov.f32 %f50, %f46;
mov.f32 %f51, %f47;
// ---- Separation: %f53 = dx, %f52 = dy, %f54 = dz, %f57 = dx^2+dy^2+dz^2.
sub.ftz.f32 %f52, %f26, %f49;
sub.ftz.f32 %f53, %f25, %f48;
sub.ftz.f32 %f54, %f27, %f50;
mul.ftz.f32 %f55, %f52, %f52;
fma.rn.ftz.f32 %f56, %f53, %f53, %f55;
fma.rn.ftz.f32 %f57, %f54, %f54, %f56;
// ---- Pairs at or beyond cut_bothsq contribute nothing.
setp.lt.ftz.f32 %p4, %f57, %f34;
@!%p4 bra $Lt_0_28162;
// ---- %p5 = (rsq < cut_ljsq); %f59 = 1/rsq.
ld.param.f32 %f58, [__cudaparm_kernel_pair_cut_ljsq];
setp.lt.ftz.f32 %p5, %f57, %f58;
rcp.approx.ftz.f32 %f59, %f57;
@!%p5 bra $Lt_0_25858;
// ---- LJ term: %f61 = (1/rsq)^3 (r6inv), table row = itype*lj_types+jtype,
// ---- %f68 = sp_lj * r6inv * (lj1.x * r6inv - lj1.y).
.loc 16 92 0
mul.ftz.f32 %f60, %f59, %f59;
mul.ftz.f32 %f61, %f59, %f60;
mov.f32 %f62, %f61;
.loc 16 93 0
cvt.rzi.ftz.s32.f32 %r49, %f51;
cvt.rzi.ftz.s32.f32 %r50, %f28;
ld.param.u64 %rd32, [__cudaparm_kernel_pair_lj1];
ld.param.s32 %r51, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r52, %r51, %r50;
add.s32 %r53, %r49, %r52;
cvt.s64.s32 %rd33, %r53;
mul.wide.s32 %rd34, %r53, 16;
add.u64 %rd35, %rd32, %rd34;
mul.ftz.f32 %f63, %f61, %f40;
ld.global.v2.f32 {%f64,%f65}, [%rd35+0];
mul.ftz.f32 %f66, %f64, %f61;
sub.ftz.f32 %f67, %f66, %f65;
mul.ftz.f32 %f68, %f63, %f67;
// ---- Between cut_lj_innersq and cut_ljsq apply the switching function.
ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_lj_innersq];
setp.gt.ftz.f32 %p6, %f57, %f69;
@!%p6 bra $Lt_0_25602;
.loc 16 99 0
// switch1 (%f79/%f80) =
//   (cut_ljsq-rsq)^2 * (cut_ljsq + 2*rsq - 3*cut_lj_innersq) / denom_lj
add.ftz.f32 %f70, %f57, %f57;
sub.ftz.f32 %f71, %f58, %f57;
add.ftz.f32 %f72, %f70, %f58;
mul.ftz.f32 %f73, %f71, %f71;
mov.f32 %f74, 0f40400000; // 3
mul.ftz.f32 %f75, %f74, %f69;
sub.ftz.f32 %f76, %f72, %f75;
ld.param.f32 %f77, [__cudaparm_kernel_pair_denom_lj];
div.approx.ftz.f32 %f78, %f76, %f77;
mul.ftz.f32 %f79, %f73, %f78;
mov.f32 %f80, %f79;
.loc 16 102 0
// %f86 = 12*rsq*(cut_ljsq-rsq)*(rsq-cut_lj_innersq)/denom_lj
// %f68 = %f68*switch1 + %f86 * r6inv*(lj1.z*r6inv - lj1.w)
mov.f32 %f81, 0f41400000; // 12
mul.ftz.f32 %f82, %f57, %f81;
mul.ftz.f32 %f83, %f71, %f82;
sub.ftz.f32 %f84, %f57, %f69;
mul.ftz.f32 %f85, %f83, %f84;
div.approx.ftz.f32 %f86, %f85, %f77;
ld.global.v2.f32 {%f87,%f88}, [%rd35+8];
mul.ftz.f32 %f89, %f87, %f61;
sub.ftz.f32 %f90, %f89, %f88;
mul.ftz.f32 %f91, %f61, %f90;
mul.ftz.f32 %f92, %f86, %f91;
fma.rn.ftz.f32 %f68, %f68, %f79, %f92;
bra.uni $Lt_0_25602;
$Lt_0_25858:
// ---- rsq >= cut_ljsq: no LJ contribution.
.loc 16 105 0
mov.f32 %f68, 0f00000000; // 0
$Lt_0_25602:
// ---- Coulomb term, only when rsq < cut_coulsq (%p7).
ld.param.f32 %f93, [__cudaparm_kernel_pair_cut_coulsq];
setp.gt.ftz.f32 %p7, %f93, %f57;
@!%p7 bra $Lt_0_26882;
.loc 20 518 0
// %f94 = r = rsqrt(1/rsq); %f96 = g_ewald*r;
// %f101 = 2^(-(g_ewald*r)^2 * 1.4427) = exp(-(g_ewald*r)^2)
rsqrt.approx.ftz.f32 %f94, %f59;
ld.param.f32 %f95, [__cudaparm_kernel_pair_g_ewald];
mul.ftz.f32 %f96, %f95, %f94;
mul.ftz.f32 %f97, %f96, %f96;
neg.ftz.f32 %f98, %f97;
mov.f32 %f99, 0f3fb8aa3b; // 1.4427
mul.ftz.f32 %f100, %f98, %f99;
ex2.approx.ftz.f32 %f101, %f100;
.loc 16 112 0
// t = 1/(1 + 0.327591*g_ewald*r); _erfc (%f116/%f117) is a degree-5
// polynomial in t (Horner form below) multiplied by %f101.
mov.f32 %f102, 0f3f800000; // 1
mov.f32 %f103, 0f3ea7ba05; // 0.327591
fma.rn.ftz.f32 %f104, %f103, %f96, %f102;
rcp.approx.ftz.f32 %f105, %f104;
mov.f32 %f106, 0f3e827906; // 0.25483
mov.f32 %f107, 0fbe91a98e; // -0.284497
mov.f32 %f108, 0f3fb5f0e3; // 1.42141
mov.f32 %f109, 0fbfba00e3; // -1.45315
mov.f32 %f110, 0f3f87dc22; // 1.06141
fma.rn.ftz.f32 %f111, %f110, %f105, %f109;
fma.rn.ftz.f32 %f112, %f105, %f111, %f108;
fma.rn.ftz.f32 %f113, %f105, %f112, %f107;
fma.rn.ftz.f32 %f114, %f105, %f113, %f106;
mul.ftz.f32 %f115, %f105, %f114;
mul.ftz.f32 %f116, %f101, %f115;
mov.f32 %f117, %f116;
.loc 16 113 0
// prefactor (%f126/%f127) = qqrd2e * q_tex[i] * q_tex[j] / r
mov.u32 %r54, %r41;
mov.s32 %r55, 0;
mov.u32 %r56, %r55;
mov.s32 %r57, 0;
mov.u32 %r58, %r57;
mov.s32 %r59, 0;
mov.u32 %r60, %r59;
tex.1d.v4.f32.s32 {%f118,%f119,%f120,%f121},[q_tex,{%r54,%r56,%r58,%r60}];
mov.f32 %f122, %f118;
ld.param.f32 %f123, [__cudaparm_kernel_pair_qqrd2e];
mul.ftz.f32 %f124, %f123, %f33;
mul.ftz.f32 %f125, %f124, %f122;
div.approx.ftz.f32 %f126, %f125, %f94;
mov.f32 %f127, %f126;
.loc 16 114 0
// %f132 = prefactor * (_erfc + 1.12838*g_ewald*r*exp(-(g_ewald*r)^2) - %f43)
mov.f32 %f128, 0f3f906ebb; // 1.12838
mul.ftz.f32 %f129, %f96, %f128;
fma.rn.ftz.f32 %f130, %f101, %f129, %f116;
sub.ftz.f32 %f131, %f130, %f43;
mul.ftz.f32 %f132, %f126, %f131;
bra.uni $Lt_0_26626;
$Lt_0_26882:
// ---- rsq >= cut_coulsq: no Coulomb contribution.
.loc 16 116 0
mov.f32 %f132, 0f00000000; // 0
$Lt_0_26626:
// ---- Total pair force scalar %f134 = (coulomb + lj) / rsq, accumulated
// ---- along dx, dy, dz into %f37, %f36, %f35.
.loc 16 120 0
add.ftz.f32 %f133, %f132, %f68;
mul.ftz.f32 %f134, %f133, %f59;
fma.rn.ftz.f32 %f37, %f53, %f134, %f37;
.loc 16 121 0
fma.rn.ftz.f32 %f36, %f52, %f134, %f36;
.loc 16 122 0
fma.rn.ftz.f32 %f35, %f54, %f134, %f35;
// ---- Energy accumulation, only when eflag > 0.
ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag];
mov.u32 %r62, 0;
setp.le.s32 %p8, %r61, %r62;
@%p8 bra $Lt_0_27650;
.loc 16 125 0
// %f38 += prefactor*(_erfc - %f43), kept only when %p7 (rsq < cut_coulsq);
// the selp discards the sum when the Coulomb block above was skipped.
mov.f32 %f135, %f127;
mov.f32 %f136, %f117;
sub.ftz.f32 %f137, %f136, %f43;
fma.rn.ftz.f32 %f138, %f135, %f137, %f38;
selp.f32 %f38, %f138, %f38, %p7;
@!%p5 bra $Lt_0_27650;
.loc 16 128 0
// %f39 += sp_lj * r6inv*(lj1.z*r6inv - lj1.w), multiplied by switch1
// when rsq > cut_lj_innersq.
cvt.rzi.ftz.s32.f32 %r63, %f51;
cvt.rzi.ftz.s32.f32 %r64, %f28;
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj1];
ld.param.s32 %r65, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r66, %r65, %r64;
add.s32 %r67, %r63, %r66;
cvt.s64.s32 %rd37, %r67;
mul.wide.s32 %rd38, %r67, 16;
add.u64 %rd35, %rd36, %rd38;
mov.f32 %f139, %f62;
ld.global.v2.f32 {%f140,%f141}, [%rd35+8];
mul.ftz.f32 %f142, %f140, %f139;
sub.ftz.f32 %f143, %f142, %f141;
mul.ftz.f32 %f144, %f139, %f143;
mov.f32 %f145, %f80;
mul.ftz.f32 %f146, %f145, %f144;
ld.param.f32 %f147, [__cudaparm_kernel_pair_cut_lj_innersq];
setp.lt.ftz.f32 %p9, %f147, %f57;
selp.f32 %f148, %f146, %f144, %p9;
.loc 16 131 0
fma.rn.ftz.f32 %f39, %f40, %f148, %f39;
$Lt_0_27650:
$Lt_0_27138:
// ---- Virial accumulation, only when vflag > 0:
// ---- xx, yy, zz, xy, xz, yz products of the deltas times %f134.
ld.param.s32 %r68, [__cudaparm_kernel_pair_vflag];
mov.u32 %r69, 0;
setp.le.s32 %p10, %r68, %r69;
@%p10 bra $Lt_0_28162;
.loc 16 135 0
mov.f32 %f149, %f10;
mul.ftz.f32 %f150, %f53, %f53;
fma.rn.ftz.f32 %f151, %f134, %f150, %f149;
mov.f32 %f10, %f151;
.loc 16 136 0
mov.f32 %f152, %f12;
fma.rn.ftz.f32 %f153, %f134, %f55, %f152;
mov.f32 %f12, %f153;
.loc 16 137 0
mov.f32 %f154, %f14;
mul.ftz.f32 %f155, %f54, %f54;
fma.rn.ftz.f32 %f156, %f134, %f155, %f154;
mov.f32 %f14, %f156;
.loc 16 138 0
mov.f32 %f157, %f16;
mul.ftz.f32 %f158, %f52, %f53;
fma.rn.ftz.f32 %f159, %f134, %f158, %f157;
mov.f32 %f16, %f159;
.loc 16 139 0
mov.f32 %f160, %f18;
mul.ftz.f32 %f161, %f53, %f54;
fma.rn.ftz.f32 %f162, %f134, %f161, %f160;
mov.f32 %f18, %f162;
.loc 16 140 0
mul.ftz.f32 %f163, %f52, %f54;
fma.rn.ftz.f32 %f19, %f134, %f163, %f19;
mov.f32 %f20, %f19;
$Lt_0_28162:
$Lt_0_25090:
// ---- Advance to this thread's next neighbor (stride elements of 4 bytes)
// ---- and loop while still inside the list.
.loc 16 69 0
mul.lo.u64 %rd39, %rd27, 4;
add.u64 %rd20, %rd20, %rd39;
setp.lt.u64 %p11, %rd20, %rd19;
@%p11 bra $Lt_0_24834;
bra.uni $Lt_0_24322;
$Lt_0_34562:
// ---- Empty-list path: force and energy sums start (and stay) at zero.
mov.f32 %f35, 0f00000000; // 0
mov.f32 %f36, 0f00000000; // 0
mov.f32 %f37, 0f00000000; // 0
mov.f32 %f38, 0f00000000; // 0
mov.f32 %f39, 0f00000000; // 0
$Lt_0_24322:
// ---- With t_per_atom <= 1 there is nothing to combine.
mov.u32 %r70, 1;
setp.le.s32 %p12, %r1, %r70;
@%p12 bra $Lt_0_30978;
// ---- Reduction 1: force (x,y,z) and the two energy sums.
// ---- Each thread publishes its partials at red_acc[row][tid.x],
// ---- rows being 512 bytes apart.
.loc 16 145 0
mov.u64 %rd40, __cuda___cuda_local_var_32646_55_non_const_red_acc152;
cvt.s64.s32 %rd41, %r2;
mul.wide.s32 %rd42, %r2, 4;
add.u64 %rd43, %rd40, %rd42;
mov.f32 %f164, %f37;
st.shared.f32 [%rd43+0], %f164;
mov.f32 %f165, %f36;
st.shared.f32 [%rd43+512], %f165;
mov.f32 %f166, %f35;
st.shared.f32 [%rd43+1024], %f166;
mov.f32 %f167, %f39;
st.shared.f32 [%rd43+1536], %f167;
mov.f32 %f168, %f38;
st.shared.f32 [%rd43+2048], %f168;
// %r75 = t_per_atom / 2 (signed divide by two, rounding toward zero).
shr.s32 %r71, %r1, 31;
mov.s32 %r72, 1;
and.b32 %r73, %r71, %r72;
add.s32 %r74, %r73, %r1;
shr.s32 %r75, %r74, 1;
mov.s32 %r76, %r75;
mov.u32 %r77, 0;
setp.ne.u32 %p13, %r75, %r77;
@!%p13 bra $Lt_0_29442;
$Lt_0_29954:
// ---- Halving step: threads whose offset < s add in the partials held
// ---- by thread tid.x + s, then s is halved until it reaches zero.
setp.ge.u32 %p14, %r14, %r76;
@%p14 bra $Lt_0_30210;
add.u32 %r78, %r2, %r76;
cvt.u64.u32 %rd44, %r78;
mul.wide.u32 %rd45, %r78, 4;
add.u64 %rd46, %rd40, %rd45;
ld.shared.f32 %f169, [%rd46+0];
add.ftz.f32 %f164, %f169, %f164;
st.shared.f32 [%rd43+0], %f164;
ld.shared.f32 %f170, [%rd46+512];
add.ftz.f32 %f165, %f170, %f165;
st.shared.f32 [%rd43+512], %f165;
ld.shared.f32 %f171, [%rd46+1024];
add.ftz.f32 %f166, %f171, %f166;
st.shared.f32 [%rd43+1024], %f166;
ld.shared.f32 %f172, [%rd46+1536];
add.ftz.f32 %f167, %f172, %f167;
st.shared.f32 [%rd43+1536], %f167;
ld.shared.f32 %f173, [%rd46+2048];
add.ftz.f32 %f168, %f173, %f168;
st.shared.f32 [%rd43+2048], %f168;
$Lt_0_30210:
shr.u32 %r76, %r76, 1;
mov.u32 %r79, 0;
setp.ne.u32 %p15, %r76, %r79;
@%p15 bra $Lt_0_29954;
$Lt_0_29442:
// ---- Copy the reduced values back into the force / energy registers.
mov.f32 %f37, %f164;
mov.f32 %f36, %f165;
mov.f32 %f35, %f166;
mov.f32 %f39, %f167;
mov.f32 %f38, %f168;
// ---- Reduction 2: the six virial sums, only when vflag > 0.
ld.param.s32 %r80, [__cudaparm_kernel_pair_vflag];
mov.u32 %r81, 0;
setp.le.s32 %p16, %r80, %r81;
@%p16 bra $Lt_0_30978;
mov.f32 %f164, %f10;
st.shared.f32 [%rd43+0], %f164;
mov.f32 %f165, %f12;
st.shared.f32 [%rd43+512], %f165;
mov.f32 %f166, %f14;
st.shared.f32 [%rd43+1024], %f166;
mov.f32 %f167, %f16;
st.shared.f32 [%rd43+1536], %f167;
mov.f32 %f168, %f18;
st.shared.f32 [%rd43+2048], %f168;
mov.f32 %f174, %f19;
st.shared.f32 [%rd43+2560], %f174;
mov.s32 %r82, %r75;
@!%p13 bra $Lt_0_31490;
$Lt_0_32002:
setp.ge.u32 %p17, %r14, %r82;
@%p17 bra $Lt_0_32258;
add.u32 %r83, %r2, %r82;
cvt.u64.u32 %rd47, %r83;
mul.wide.u32 %rd48, %r83, 4;
add.u64 %rd49, %rd40, %rd48;
ld.shared.f32 %f175, [%rd49+0];
add.ftz.f32 %f164, %f175, %f164;
st.shared.f32 [%rd43+0], %f164;
ld.shared.f32 %f176, [%rd49+512];
add.ftz.f32 %f165, %f176, %f165;
st.shared.f32 [%rd43+512], %f165;
ld.shared.f32 %f177, [%rd49+1024];
add.ftz.f32 %f166, %f177, %f166;
st.shared.f32 [%rd43+1024], %f166;
ld.shared.f32 %f178, [%rd49+1536];
add.ftz.f32 %f167, %f178, %f167;
st.shared.f32 [%rd43+1536], %f167;
ld.shared.f32 %f179, [%rd49+2048];
add.ftz.f32 %f168, %f179, %f168;
st.shared.f32 [%rd43+2048], %f168;
ld.shared.f32 %f180, [%rd49+2560];
add.ftz.f32 %f174, %f180, %f174;
st.shared.f32 [%rd43+2560], %f174;
$Lt_0_32258:
shr.u32 %r82, %r82, 1;
mov.u32 %r84, 0;
setp.ne.u32 %p18, %r82, %r84;
@%p18 bra $Lt_0_32002;
$Lt_0_31490:
mov.f32 %f10, %f164;
mov.f32 %f12, %f165;
mov.f32 %f14, %f166;
mov.f32 %f16, %f167;
mov.f32 %f18, %f168;
mov.f32 %f20, %f174;
$Lt_0_30978:
$Lt_0_28930:
// ---- Only the thread with offset 0 in each group writes results.
mov.u32 %r85, 0;
setp.ne.s32 %p19, %r14, %r85;
@%p19 bra $Lt_0_33026;
// ---- engv output starts at engv[ii]; successive values are inum floats apart.
ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv];
add.u64 %rd51, %rd50, %rd3;
ld.param.s32 %r86, [__cudaparm_kernel_pair_eflag];
mov.u32 %r87, 0;
setp.le.s32 %p20, %r86, %r87;
@%p20 bra $Lt_0_33538;
// eflag > 0: write %f39 then %f38, and advance past both slots.
st.global.f32 [%rd51+0], %f39;
cvt.s64.s32 %rd52, %r9;
mul.wide.s32 %rd53, %r9, 4;
add.u64 %rd54, %rd53, %rd51;
st.global.f32 [%rd54+0], %f38;
add.u64 %rd51, %rd53, %rd54;
$Lt_0_33538:
ld.param.s32 %r88, [__cudaparm_kernel_pair_vflag];
mov.u32 %r89, 0;
setp.le.s32 %p21, %r88, %r89;
@%p21 bra $Lt_0_34050;
// vflag > 0: write the six virial sums in accumulation order.
mov.f32 %f181, %f10;
st.global.f32 [%rd51+0], %f181;
cvt.s64.s32 %rd55, %r9;
mul.wide.s32 %rd56, %r9, 4;
add.u64 %rd57, %rd56, %rd51;
mov.f32 %f182, %f12;
st.global.f32 [%rd57+0], %f182;
add.u64 %rd58, %rd56, %rd57;
mov.f32 %f183, %f14;
st.global.f32 [%rd58+0], %f183;
add.u64 %rd59, %rd56, %rd58;
mov.f32 %f184, %f16;
st.global.f32 [%rd59+0], %f184;
add.u64 %rd51, %rd56, %rd59;
mov.f32 %f185, %f18;
st.global.f32 [%rd51+0], %f185;
mov.f32 %f186, %f20;
add.u64 %rd60, %rd56, %rd51;
st.global.f32 [%rd60+0], %f186;
$Lt_0_34050:
// ---- ans[ii] = {fx, fy, fz, 0}.
// The fourth lane used to be copied from %f188, a register that is never
// written in this kernel, so an undefined value reached global memory.
// It is now a defined zero; the x/y/z lanes are unchanged.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd62, %rd2, 16;
add.u64 %rd63, %rd61, %rd62;
mov.f32 %f187, 0f00000000; // 0
st.global.v4.f32 [%rd63+0], {%f37,%f36,%f35,%f187};
$Lt_0_33026:
$Lt_0_23298:
.loc 16 148 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_ljd_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .u64 __cudaparm_kernel_pair_fast_q_,
.param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,
.param .f32 __cudaparm_kernel_pair_fast_qqrd2e,
.param .f32 __cudaparm_kernel_pair_fast_g_ewald,
.param .f32 __cudaparm_kernel_pair_fast_denom_lj,
.param .f32 __cudaparm_kernel_pair_fast_cut_bothsq,
.param .f32 __cudaparm_kernel_pair_fast_cut_ljsq,
.param .f32 __cudaparm_kernel_pair_fast_cut_lj_innersq,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<86>;
.reg .u64 %rd<72>;
.reg .f32 %f<196>;
.reg .pred %p<25>;
.shared .align 4 .b8 __cuda___cuda_local_var_32666_33_non_const_sp_lj3336[32];
.shared .align 8 .b8 __cuda___cuda_local_var_32665_34_non_const_ljd3368[1024];
.shared .align 4 .b8 __cuda___cuda_local_var_32775_55_non_const_red_acc4392[3072];
// __cuda_local_var_32675_10_non_const_f = 64
// __cuda_local_var_32677_9_non_const_virial = 16
// __cuda_local_var_32712_43_non_const_prefactor = 56
// __cuda_local_var_32712_54_non_const__erfc = 52
// __cuda_local_var_32712_61_non_const_switch1 = 48
// __cuda_local_var_32713_15_non_const_lj3 = 44
// __cuda_local_var_32713_20_non_const_lj4 = 40
.loc 16 160 0
$LDWbegin_kernel_pair_fast:
cvt.s32.u32 %r1, %tid.x;
cvt.s64.s32 %rd1, %r1;
mov.u32 %r2, 7;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_24834;
.loc 16 167 0
mov.u64 %rd2, __cuda___cuda_local_var_32666_33_non_const_sp_lj3336;
mul.lo.u64 %rd3, %rd1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd2;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_24834:
mov.u64 %rd7, __cuda___cuda_local_var_32665_34_non_const_ljd3368;
mov.u64 %rd2, __cuda___cuda_local_var_32666_33_non_const_sp_lj3336;
.loc 16 168 0
mul.lo.u64 %rd8, %rd1, 8;
ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_ljd_in];
add.u64 %rd10, %rd9, %rd8;
add.u64 %rd11, %rd8, %rd7;
ld.global.v2.f32 {%f2,%f3}, [%rd10+0];
st.shared.v2.f32 [%rd11+0], {%f2,%f3};
add.s32 %r3, %r1, 128;
mov.u32 %r4, 127;
setp.gt.s32 %p2, %r3, %r4;
@%p2 bra $Lt_1_25346;
ld.global.v2.f32 {%f4,%f5}, [%rd10+1024];
st.shared.v2.f32 [%rd11+1024], {%f4,%f5};
$Lt_1_25346:
.loc 16 178 0
mov.f32 %f6, 0f00000000; // 0
mov.f32 %f7, %f6;
mov.f32 %f8, 0f00000000; // 0
mov.f32 %f9, %f8;
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
.loc 16 180 0
bar.sync 0;
ld.param.s32 %r5, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r6, %r1, %r5;
cvt.s32.u32 %r7, %ntid.x;
div.s32 %r8, %r7, %r5;
cvt.s32.u32 %r9, %ctaid.x;
mul.lo.s32 %r10, %r9, %r8;
add.s32 %r11, %r6, %r10;
ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_inum];
setp.ge.s32 %p3, %r11, %r12;
@%p3 bra $Lt_1_35586;
.loc 16 185 0
cvt.s64.s32 %rd12, %r11;
mul.wide.s32 %rd13, %r11, 4;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd15, %rd13, %rd14;
ld.global.s32 %r13, [%rd15+0];
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd16, %r14;
mul.wide.s32 %rd17, %r14, 4;
add.u64 %rd18, %rd17, %rd15;
ld.global.s32 %r15, [%rd18+0];
sub.s32 %r16, %r5, 1;
and.b32 %r17, %r16, %r1;
cvt.s64.s32 %rd19, %r17;
mul.wide.s32 %rd20, %r17, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p4, %rd21, %rd14;
@%p4 bra $Lt_1_26626;
cvt.s32.s64 %r18, %rd16;
mul.lo.s32 %r19, %r18, %r5;
mov.s32 %r20, %r19;
mul.lo.s32 %r21, %r16, %r11;
add.s32 %r22, %r18, %r21;
cvt.s64.s32 %rd22, %r22;
mul.wide.s32 %rd23, %r22, 4;
add.u64 %rd24, %rd18, %rd23;
and.b32 %r23, %r16, %r15;
cvt.s64.s32 %rd25, %r23;
div.s32 %r24, %r15, %r5;
mul.lo.s32 %r25, %r19, %r24;
cvt.s64.s32 %rd26, %r25;
add.u64 %rd27, %rd25, %rd26;
mul.lo.u64 %rd28, %rd27, 4;
add.u64 %rd29, %rd24, %rd28;
add.u64 %rd30, %rd20, %rd24;
bra.uni $Lt_1_26370;
$Lt_1_26626:
add.u64 %rd31, %rd17, %rd18;
ld.global.s32 %r26, [%rd31+0];
cvt.s64.s32 %rd32, %r26;
mul.wide.s32 %rd33, %r26, 4;
add.u64 %rd34, %rd21, %rd33;
cvt.s64.s32 %rd35, %r15;
mul.wide.s32 %rd36, %r15, 4;
add.u64 %rd29, %rd34, %rd36;
mov.s32 %r20, %r5;
add.u64 %rd30, %rd20, %rd34;
$Lt_1_26370:
.loc 16 188 0
mov.u32 %r27, %r13;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
mov.s32 %r30, 0;
mov.u32 %r31, %r30;
mov.s32 %r32, 0;
mov.u32 %r33, %r32;
tex.1d.v4.f32.s32 {%f18,%f19,%f20,%f21},[pos_tex,{%r27,%r29,%r31,%r33}];
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
mov.f32 %f25, %f21;
.loc 16 189 0
mov.u32 %r34, %r13;
mov.s32 %r35, 0;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
tex.1d.v4.f32.s32 {%f26,%f27,%f28,%f29},[q_tex,{%r34,%r36,%r38,%r40}];
mov.f32 %f30, %f26;
setp.ge.u64 %p5, %rd30, %rd29;
@%p5 bra $Lt_1_37122;
cvt.rzi.ftz.s32.f32 %r41, %f25;
cvt.s64.s32 %rd37, %r20;
ld.param.f32 %f31, [__cudaparm_kernel_pair_fast_cut_bothsq];
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
mov.f32 %f35, 0f00000000; // 0
mov.f32 %f36, 0f00000000; // 0
$Lt_1_27394:
//<loop> Loop body line 189, nesting depth: 1, estimated iterations: unknown
.loc 16 193 0
ld.global.s32 %r42, [%rd30+0];
.loc 16 196 0
shr.s32 %r43, %r42, 30;
and.b32 %r44, %r43, 3;
cvt.s64.s32 %rd38, %r44;
mul.wide.s32 %rd39, %r44, 4;
add.u64 %rd40, %rd2, %rd39;
ld.shared.f32 %f37, [%rd40+0];
.loc 16 197 0
mov.f32 %f38, 0f3f800000; // 1
ld.shared.f32 %f39, [%rd40+16];
sub.ftz.f32 %f40, %f38, %f39;
.loc 16 200 0
and.b32 %r45, %r42, 1073741823;
mov.u32 %r46, %r45;
mov.s32 %r47, 0;
mov.u32 %r48, %r47;
mov.s32 %r49, 0;
mov.u32 %r50, %r49;
mov.s32 %r51, 0;
mov.u32 %r52, %r51;
tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r46,%r48,%r50,%r52}];
mov.f32 %f45, %f41;
mov.f32 %f46, %f42;
mov.f32 %f47, %f43;
mov.f32 %f48, %f44;
sub.ftz.f32 %f49, %f23, %f46;
sub.ftz.f32 %f50, %f22, %f45;
sub.ftz.f32 %f51, %f24, %f47;
mul.ftz.f32 %f52, %f49, %f49;
fma.rn.ftz.f32 %f53, %f50, %f50, %f52;
fma.rn.ftz.f32 %f54, %f51, %f51, %f53;
setp.lt.ftz.f32 %p6, %f54, %f31;
@!%p6 bra $Lt_1_30722;
ld.param.f32 %f55, [__cudaparm_kernel_pair_fast_cut_ljsq];
setp.lt.ftz.f32 %p7, %f54, %f55;
rcp.approx.ftz.f32 %f56, %f54;
@!%p7 bra $Lt_1_28418;
.loc 16 215 0
cvt.rzi.ftz.s32.f32 %r53, %f48;
cvt.s64.s32 %rd41, %r41;
mul.wide.s32 %rd42, %r41, 8;
add.u64 %rd43, %rd7, %rd42;
cvt.s64.s32 %rd44, %r53;
mul.wide.s32 %rd45, %r53, 8;
add.u64 %rd46, %rd7, %rd45;
ld.shared.v2.f32 {%f57,%f58}, [%rd43+0];
ld.shared.v2.f32 {%f59,%f60}, [%rd46+0];
mul.ftz.f32 %f61, %f57, %f59;
.loc 16 216 0
add.ftz.f32 %f62, %f58, %f60;
mov.f32 %f63, 0f3f000000; // 0.5
mul.ftz.f32 %f64, %f62, %f63;
.loc 16 220 0
mul.ftz.f32 %f65, %f64, %f64;
sqrt.approx.ftz.f32 %f66, %f61;
mov.f32 %f67, 0f40800000; // 4
mul.ftz.f32 %f68, %f66, %f67;
mul.ftz.f32 %f69, %f65, %f56;
mul.ftz.f32 %f70, %f69, %f69;
mul.ftz.f32 %f71, %f69, %f70;
mul.ftz.f32 %f72, %f68, %f71;
mov.f32 %f73, %f72;
.loc 16 221 0
mul.ftz.f32 %f74, %f71, %f72;
mov.f32 %f75, %f74;
.loc 16 222 0
mov.f32 %f76, 0f40c00000; // 6
mul.ftz.f32 %f77, %f72, %f76;
mov.f32 %f78, 0f41400000; // 12
mul.ftz.f32 %f79, %f78, %f74;
sub.ftz.f32 %f80, %f79, %f77;
mul.ftz.f32 %f81, %f37, %f80;
ld.param.f32 %f82, [__cudaparm_kernel_pair_fast_cut_lj_innersq];
setp.gt.ftz.f32 %p8, %f54, %f82;
@!%p8 bra $Lt_1_28162;
.loc 16 228 0
add.ftz.f32 %f83, %f54, %f54;
sub.ftz.f32 %f84, %f55, %f54;
add.ftz.f32 %f85, %f83, %f55;
mul.ftz.f32 %f86, %f84, %f84;
mov.f32 %f87, 0f40400000; // 3
mul.ftz.f32 %f88, %f87, %f82;
sub.ftz.f32 %f89, %f85, %f88;
ld.param.f32 %f90, [__cudaparm_kernel_pair_fast_denom_lj];
div.approx.ftz.f32 %f91, %f89, %f90;
mul.ftz.f32 %f92, %f86, %f91;
mov.f32 %f93, %f92;
.loc 16 231 0
mov.f32 %f94, 0f41400000; // 12
mul.ftz.f32 %f95, %f54, %f94;
mul.ftz.f32 %f96, %f84, %f95;
sub.ftz.f32 %f97, %f54, %f82;
mul.ftz.f32 %f98, %f96, %f97;
div.approx.ftz.f32 %f99, %f98, %f90;
sub.ftz.f32 %f100, %f74, %f72;
mul.ftz.f32 %f101, %f99, %f100;
fma.rn.ftz.f32 %f81, %f81, %f92, %f101;
bra.uni $Lt_1_28162;
$Lt_1_28418:
.loc 16 234 0
mov.f32 %f81, 0f00000000; // 0
$Lt_1_28162:
ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_cut_coulsq];
setp.gt.ftz.f32 %p9, %f102, %f54;
@!%p9 bra $Lt_1_29442;
.loc 20 518 0
rsqrt.approx.ftz.f32 %f103, %f56;
ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_g_ewald];
mul.ftz.f32 %f105, %f104, %f103;
mul.ftz.f32 %f106, %f105, %f105;
neg.ftz.f32 %f107, %f106;
mov.f32 %f108, 0f3fb8aa3b; // 1.4427
mul.ftz.f32 %f109, %f107, %f108;
ex2.approx.ftz.f32 %f110, %f109;
.loc 16 241 0
mov.f32 %f111, 0f3f800000; // 1
mov.f32 %f112, 0f3ea7ba05; // 0.327591
fma.rn.ftz.f32 %f113, %f112, %f105, %f111;
rcp.approx.ftz.f32 %f114, %f113;
mov.f32 %f115, 0f3e827906; // 0.25483
mov.f32 %f116, 0fbe91a98e; // -0.284497
mov.f32 %f117, 0f3fb5f0e3; // 1.42141
mov.f32 %f118, 0fbfba00e3; // -1.45315
mov.f32 %f119, 0f3f87dc22; // 1.06141
fma.rn.ftz.f32 %f120, %f119, %f114, %f118;
fma.rn.ftz.f32 %f121, %f114, %f120, %f117;
fma.rn.ftz.f32 %f122, %f114, %f121, %f116;
fma.rn.ftz.f32 %f123, %f114, %f122, %f115;
mul.ftz.f32 %f124, %f114, %f123;
mul.ftz.f32 %f125, %f110, %f124;
mov.f32 %f126, %f125;
.loc 16 242 0
mov.u32 %r54, %r45;
mov.s32 %r55, 0;
mov.u32 %r56, %r55;
mov.s32 %r57, 0;
mov.u32 %r58, %r57;
mov.s32 %r59, 0;
mov.u32 %r60, %r59;
tex.1d.v4.f32.s32 {%f127,%f128,%f129,%f130},[q_tex,{%r54,%r56,%r58,%r60}];
mov.f32 %f131, %f127;
ld.param.f32 %f132, [__cudaparm_kernel_pair_fast_qqrd2e];
mul.ftz.f32 %f133, %f132, %f30;
mul.ftz.f32 %f134, %f133, %f131;
div.approx.ftz.f32 %f135, %f134, %f103;
mov.f32 %f136, %f135;
.loc 16 243 0
mov.f32 %f137, 0f3f906ebb; // 1.12838
mul.ftz.f32 %f138, %f105, %f137;
fma.rn.ftz.f32 %f139, %f110, %f138, %f125;
sub.ftz.f32 %f140, %f139, %f40;
mul.ftz.f32 %f141, %f135, %f140;
bra.uni $Lt_1_29186;
$Lt_1_29442:
.loc 16 245 0
mov.f32 %f141, 0f00000000; // 0
$Lt_1_29186:
.loc 16 249 0
add.ftz.f32 %f142, %f141, %f81;
mul.ftz.f32 %f143, %f142, %f56;
fma.rn.ftz.f32 %f34, %f50, %f143, %f34;
.loc 16 250 0
fma.rn.ftz.f32 %f33, %f49, %f143, %f33;
.loc 16 251 0
fma.rn.ftz.f32 %f32, %f51, %f143, %f32;
ld.param.s32 %r61, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r62, 0;
setp.le.s32 %p10, %r61, %r62;
@%p10 bra $Lt_1_30210;
.loc 16 254 0
mov.f32 %f144, %f136;
mov.f32 %f145, %f126;
sub.ftz.f32 %f146, %f145, %f40;
fma.rn.ftz.f32 %f147, %f144, %f146, %f35;
selp.f32 %f35, %f147, %f35, %p9;
@!%p7 bra $Lt_1_30210;
.loc 16 260 0
mov.f32 %f148, %f75;
mov.f32 %f149, %f73;
sub.ftz.f32 %f150, %f148, %f149;
mov.f32 %f151, %f93;
mul.ftz.f32 %f152, %f151, %f150;
ld.param.f32 %f153, [__cudaparm_kernel_pair_fast_cut_lj_innersq];
setp.lt.ftz.f32 %p11, %f153, %f54;
selp.f32 %f154, %f152, %f150, %p11;
fma.rn.ftz.f32 %f36, %f37, %f154, %f36;
$Lt_1_30210:
$Lt_1_29698:
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r64, 0;
setp.le.s32 %p12, %r63, %r64;
@%p12 bra $Lt_1_30722;
.loc 16 264 0
mov.f32 %f155, %f7;
mul.ftz.f32 %f156, %f50, %f50;
fma.rn.ftz.f32 %f157, %f143, %f156, %f155;
mov.f32 %f7, %f157;
.loc 16 265 0
mov.f32 %f158, %f9;
fma.rn.ftz.f32 %f159, %f143, %f52, %f158;
mov.f32 %f9, %f159;
.loc 16 266 0
mov.f32 %f160, %f11;
mul.ftz.f32 %f161, %f51, %f51;
fma.rn.ftz.f32 %f162, %f143, %f161, %f160;
mov.f32 %f11, %f162;
.loc 16 267 0
mov.f32 %f163, %f13;
mul.ftz.f32 %f164, %f49, %f50;
fma.rn.ftz.f32 %f165, %f143, %f164, %f163;
mov.f32 %f13, %f165;
.loc 16 268 0
mov.f32 %f166, %f15;
mul.ftz.f32 %f167, %f50, %f51;
fma.rn.ftz.f32 %f168, %f143, %f167, %f166;
mov.f32 %f15, %f168;
.loc 16 269 0
mul.ftz.f32 %f169, %f49, %f51;
fma.rn.ftz.f32 %f16, %f143, %f169, %f16;
mov.f32 %f17, %f16;
$Lt_1_30722:
$Lt_1_27650:
.loc 16 192 0
mul.lo.u64 %rd47, %rd37, 4;
add.u64 %rd30, %rd30, %rd47;
setp.lt.u64 %p13, %rd30, %rd29;
@%p13 bra $Lt_1_27394;
bra.uni $Lt_1_26882;
$Lt_1_37122:
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
mov.f32 %f35, 0f00000000; // 0
mov.f32 %f36, 0f00000000; // 0
$Lt_1_26882:
mov.u32 %r65, 1;
setp.le.s32 %p14, %r5, %r65;
@%p14 bra $Lt_1_33538;
.loc 16 274 0
mov.u64 %rd48, __cuda___cuda_local_var_32775_55_non_const_red_acc4392;
mul.lo.u64 %rd49, %rd1, 4;
add.u64 %rd50, %rd48, %rd49;
mov.f32 %f170, %f34;
st.shared.f32 [%rd50+0], %f170;
mov.f32 %f171, %f33;
st.shared.f32 [%rd50+512], %f171;
mov.f32 %f172, %f32;
st.shared.f32 [%rd50+1024], %f172;
mov.f32 %f173, %f36;
st.shared.f32 [%rd50+1536], %f173;
mov.f32 %f174, %f35;
st.shared.f32 [%rd50+2048], %f174;
shr.s32 %r66, %r5, 31;
mov.s32 %r67, 1;
and.b32 %r68, %r66, %r67;
add.s32 %r69, %r68, %r5;
shr.s32 %r70, %r69, 1;
mov.s32 %r71, %r70;
mov.u32 %r72, 0;
setp.ne.u32 %p15, %r70, %r72;
@!%p15 bra $Lt_1_32002;
$Lt_1_32514:
setp.ge.u32 %p16, %r17, %r71;
@%p16 bra $Lt_1_32770;
add.u32 %r73, %r1, %r71;
cvt.u64.u32 %rd51, %r73;
mul.wide.u32 %rd52, %r73, 4;
add.u64 %rd53, %rd48, %rd52;
ld.shared.f32 %f175, [%rd53+0];
add.ftz.f32 %f170, %f175, %f170;
st.shared.f32 [%rd50+0], %f170;
ld.shared.f32 %f176, [%rd53+512];
add.ftz.f32 %f171, %f176, %f171;
st.shared.f32 [%rd50+512], %f171;
ld.shared.f32 %f177, [%rd53+1024];
add.ftz.f32 %f172, %f177, %f172;
st.shared.f32 [%rd50+1024], %f172;
ld.shared.f32 %f178, [%rd53+1536];
add.ftz.f32 %f173, %f178, %f173;
st.shared.f32 [%rd50+1536], %f173;
ld.shared.f32 %f179, [%rd53+2048];
add.ftz.f32 %f174, %f179, %f174;
st.shared.f32 [%rd50+2048], %f174;
$Lt_1_32770:
shr.u32 %r71, %r71, 1;
mov.u32 %r74, 0;
setp.ne.u32 %p17, %r71, %r74;
@%p17 bra $Lt_1_32514;
$Lt_1_32002:
mov.f32 %f34, %f170;
mov.f32 %f33, %f171;
mov.f32 %f32, %f172;
mov.f32 %f36, %f173;
mov.f32 %f35, %f174;
ld.param.s32 %r75, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r76, 0;
setp.le.s32 %p18, %r75, %r76;
@%p18 bra $Lt_1_33538;
mov.f32 %f170, %f7;
st.shared.f32 [%rd50+0], %f170;
mov.f32 %f171, %f9;
st.shared.f32 [%rd50+512], %f171;
mov.f32 %f172, %f11;
st.shared.f32 [%rd50+1024], %f172;
mov.f32 %f173, %f13;
st.shared.f32 [%rd50+1536], %f173;
mov.f32 %f174, %f15;
st.shared.f32 [%rd50+2048], %f174;
mov.f32 %f180, %f16;
st.shared.f32 [%rd50+2560], %f180;
mov.s32 %r77, %r70;
@!%p15 bra $Lt_1_34050;
$Lt_1_34562:
setp.ge.u32 %p19, %r17, %r77;
@%p19 bra $Lt_1_34818;
add.u32 %r78, %r1, %r77;
cvt.u64.u32 %rd54, %r78;
mul.wide.u32 %rd55, %r78, 4;
add.u64 %rd56, %rd48, %rd55;
ld.shared.f32 %f181, [%rd56+0];
add.ftz.f32 %f170, %f181, %f170;
st.shared.f32 [%rd50+0], %f170;
ld.shared.f32 %f182, [%rd56+512];
add.ftz.f32 %f171, %f182, %f171;
st.shared.f32 [%rd50+512], %f171;
ld.shared.f32 %f183, [%rd56+1024];
add.ftz.f32 %f172, %f183, %f172;
st.shared.f32 [%rd50+1024], %f172;
ld.shared.f32 %f184, [%rd56+1536];
add.ftz.f32 %f173, %f184, %f173;
st.shared.f32 [%rd50+1536], %f173;
ld.shared.f32 %f185, [%rd56+2048];
add.ftz.f32 %f174, %f185, %f174;
st.shared.f32 [%rd50+2048], %f174;
ld.shared.f32 %f186, [%rd56+2560];
add.ftz.f32 %f180, %f186, %f180;
st.shared.f32 [%rd50+2560], %f180;
$Lt_1_34818:
shr.u32 %r77, %r77, 1;
mov.u32 %r79, 0;
setp.ne.u32 %p20, %r77, %r79;
@%p20 bra $Lt_1_34562;
$Lt_1_34050:
mov.f32 %f7, %f170;
mov.f32 %f9, %f171;
mov.f32 %f11, %f172;
mov.f32 %f13, %f173;
mov.f32 %f15, %f174;
mov.f32 %f17, %f180;
$Lt_1_33538:
$Lt_1_31490:
mov.u32 %r80, 0;
setp.ne.s32 %p21, %r17, %r80;
@%p21 bra $Lt_1_35586;
ld.param.u64 %rd57, [__cudaparm_kernel_pair_fast___val_paramengv];
add.u64 %rd58, %rd57, %rd13;
ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r82, 0;
setp.le.s32 %p22, %r81, %r82;
@%p22 bra $Lt_1_36098;
st.global.f32 [%rd58+0], %f36;
cvt.s64.s32 %rd59, %r12;
mul.wide.s32 %rd60, %r12, 4;
add.u64 %rd61, %rd60, %rd58;
st.global.f32 [%rd61+0], %f35;
add.u64 %rd58, %rd60, %rd61;
$Lt_1_36098:
ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r84, 0;
setp.le.s32 %p23, %r83, %r84;
@%p23 bra $Lt_1_36610;
mov.f32 %f187, %f7;
st.global.f32 [%rd58+0], %f187;
cvt.s64.s32 %rd62, %r12;
mul.wide.s32 %rd63, %r12, 4;
add.u64 %rd64, %rd63, %rd58;
mov.f32 %f188, %f9;
st.global.f32 [%rd64+0], %f188;
add.u64 %rd65, %rd63, %rd64;
mov.f32 %f189, %f11;
st.global.f32 [%rd65+0], %f189;
add.u64 %rd66, %rd63, %rd65;
mov.f32 %f190, %f13;
st.global.f32 [%rd66+0], %f190;
add.u64 %rd58, %rd63, %rd66;
mov.f32 %f191, %f15;
st.global.f32 [%rd58+0], %f191;
mov.f32 %f192, %f17;
add.u64 %rd67, %rd63, %rd58;
st.global.f32 [%rd67+0], %f192;
$Lt_1_36610:
ld.param.u64 %rd68, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd69, %rd12, 16;
add.u64 %rd70, %rd68, %rd69;
mov.f32 %f193, %f194;
st.global.v4.f32 [%rd70+0], {%f34,%f33,%f32,%f193};
$Lt_1_35586:
$Lt_1_25858:
.loc 16 277 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast
Event Timeline
Log In to Comment