Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F96432454
coul_long.ptx
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Dec 26, 17:21
Size
29 KB
Mime Type
text/x-asm
Expires
Sat, Dec 28, 17:21 (2 d)
Engine
blob
Format
Raw Data
Handle
23183458
Attached To
rLAMMPS lammps
coul_long.ptx
View Options
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009de1_00000000-9_lal_coul_long.cpp3.i (/home/sjplimp/ccBI#.NrfuKV)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009de1_00000000-8_lal_coul_long.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_coul_long.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Texture references shared by kernel_pair and kernel_pair_fast. Both are
// fetched with tex.1d.v4.f32.s32 using an integer index that comes from the
// neighbor data (the atom index itself, or a neighbor entry masked to its
// low 30 bits).
// pos_tex: the first three components of a texel are used as the x, y, z
// coordinates in the pair-distance computation; the fourth is not used.
.global .texref pos_tex;
// q_tex: only the first component of a texel is used; it is multiplied with
// qqrd2e (presumably the per-atom charge -- confirm against the host code).
.global .texref q_tex;
//-----------------------------------------------------------------------
// kernel_pair  (nvopencc output for lal_coul_long.cu, target sm_20)
//
// For one atom slot ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// with ii < inum, the kernel walks that slot's neighbor entries, accumulates
// a pair force built from an erfc polynomial approximation, optionally an
// energy (eflag > 0) and six virial terms (vflag > 0), sums the partial
// results of the t_per_atom threads that share the slot through shared
// memory, and has the thread with offset 0 store the totals.
//
// Parameters that are read: sp_cl_in, dev_nbor, dev_packed, ans, engv,
//   eflag, vflag, inum, nbor_pitch, cut_coulsq, qqrd2e, g_ewald, t_per_atom.
// Parameters that are never read here: x_, lj1, lj3, lj_types, q_
//   (coordinates and q values are fetched from pos_tex / q_tex instead).
//
// Outputs:
//   engv : starting at element ii, values spaced inum floats apart:
//          [0, energy] when eflag > 0, followed by six virial terms when
//          vflag > 0.
//   ans  : 16-byte record ii = {fx, fy, fz, 0}.
//
// Fix relative to the generated code: the fourth component of the ans
// record used to be copied from %f130, a register that is never assigned,
// so an undefined value reached global memory. It is now an explicit 0.
//
// Assumptions visible in the arithmetic (TODO confirm with the host side):
//   * offset = tid.x & (t_per_atom - 1), so t_per_atom is presumably a
//     power of two.
//   * red_acc rows are 512 bytes apart and indexed by tid.x, so the block
//     is presumably launched with at most 128 threads.
//   * the shared-memory reduction has no bar.sync between steps; it
//     presumably relies on the t_per_atom threads of one atom executing in
//     lock step within one warp.
//-----------------------------------------------------------------------
.entry kernel_pair (
    .param .u64 __cudaparm_kernel_pair_x_,
    .param .u64 __cudaparm_kernel_pair_lj1,
    .param .u64 __cudaparm_kernel_pair_lj3,
    .param .s32 __cudaparm_kernel_pair_lj_types,
    .param .u64 __cudaparm_kernel_pair_sp_cl_in,
    .param .u64 __cudaparm_kernel_pair_dev_nbor,
    .param .u64 __cudaparm_kernel_pair_dev_packed,
    .param .u64 __cudaparm_kernel_pair_ans,
    .param .u64 __cudaparm_kernel_pair_engv,
    .param .s32 __cudaparm_kernel_pair_eflag,
    .param .s32 __cudaparm_kernel_pair_vflag,
    .param .s32 __cudaparm_kernel_pair_inum,
    .param .s32 __cudaparm_kernel_pair_nbor_pitch,
    .param .u64 __cudaparm_kernel_pair_q_,
    .param .f32 __cudaparm_kernel_pair_cut_coulsq,
    .param .f32 __cudaparm_kernel_pair_qqrd2e,
    .param .f32 __cudaparm_kernel_pair_g_ewald,
    .param .s32 __cudaparm_kernel_pair_t_per_atom)
{
    .reg .u32 %r<81>;
    .reg .u64 %rd<58>;
    .reg .f32 %f<132>;
    .reg .pred %p<19>;
    // sp_cl: four floats copied from sp_cl_in (16-byte aligned for the v4 store).
    .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_cl112[16];
    // red_acc: reduction scratch, used as 6 rows of 512 bytes indexed by tid.x.
    .shared .align 4 .b8 __cuda___cuda_local_var_32611_37_non_const_red_acc128[3072];
    // __cuda_local_var_32548_10_non_const_f = 48
    // __cuda_local_var_32550_9_non_const_virial = 16
    .loc 16 36 0
$LDWbegin_kernel_pair:
    // ---- Copy the four sp_cl_in values into shared memory. Every thread
    // ---- writes all four (identical) values itself, so no barrier is used.
    .loc 16 41 0
    ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in];
    ldu.global.f32 %f1, [%rd1+0];
    .loc 16 42 0
    ld.global.f32 %f2, [%rd1+4];
    .loc 16 43 0
    ld.global.f32 %f3, [%rd1+8];
    .loc 16 44 0
    ld.global.f32 %f4, [%rd1+12];
    st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4};
    // ---- Zero the six virial accumulators (%f6 %f8 %f10 %f12 %f14 %f15/%f16).
    .loc 16 51 0
    mov.f32 %f5, 0f00000000; // 0
    mov.f32 %f6, %f5;
    mov.f32 %f7, 0f00000000; // 0
    mov.f32 %f8, %f7;
    mov.f32 %f9, 0f00000000; // 0
    mov.f32 %f10, %f9;
    mov.f32 %f11, 0f00000000; // 0
    mov.f32 %f12, %f11;
    mov.f32 %f13, 0f00000000; // 0
    mov.f32 %f14, %f13;
    mov.f32 %f15, 0f00000000; // 0
    mov.f32 %f16, %f15;
    // ---- ii (%r8) = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom);
    // ---- threads with ii >= inum leave immediately.
    ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
    cvt.s32.u32 %r2, %tid.x;
    div.s32 %r3, %r2, %r1;
    cvt.s32.u32 %r4, %ntid.x;
    div.s32 %r5, %r4, %r1;
    cvt.s32.u32 %r6, %ctaid.x;
    mul.lo.s32 %r7, %r6, %r5;
    add.s32 %r8, %r3, %r7;
    ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
    setp.ge.s32 %p1, %r8, %r9;
    @%p1 bra $Lt_0_25858;
    // ---- Neighbor bookkeeping.
    // %r10 = dev_nbor[ii]               (index used for the pos_tex/q_tex fetch)
    // %r12 = dev_nbor[ii + nbor_pitch]  (number of neighbor entries)
    // %r14 = tid.x & (t_per_atom - 1)   (this thread's offset within its atom)
    // On exit: %rd20 = first entry for this thread, %rd19 = end pointer,
    //          %r17 = stride between consecutive entries, in elements.
    .loc 16 56 0
    cvt.s64.s32 %rd2, %r8;
    mul.wide.s32 %rd3, %r8, 4;
    ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];
    add.u64 %rd5, %rd3, %rd4;
    ld.global.s32 %r10, [%rd5+0];
    ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
    cvt.s64.s32 %rd6, %r11;
    mul.wide.s32 %rd7, %r11, 4;
    add.u64 %rd8, %rd7, %rd5;
    ld.global.s32 %r12, [%rd8+0];
    sub.s32 %r13, %r1, 1;
    and.b32 %r14, %r13, %r2;
    cvt.s64.s32 %rd9, %r14;
    mul.wide.s32 %rd10, %r14, 4;
    ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
    setp.ne.u64 %p2, %rd11, %rd4;
    @%p2 bra $Lt_0_19458;
    // dev_packed == dev_nbor: entries live in dev_nbor itself.
    // stride = nbor_pitch * t_per_atom; the list starts
    // nbor_pitch + ii * (t_per_atom - 1) elements past the count.
    cvt.s32.s64 %r15, %rd6;
    mul.lo.s32 %r16, %r15, %r1;
    mov.s32 %r17, %r16;
    mul.lo.s32 %r18, %r13, %r8;
    add.s32 %r19, %r15, %r18;
    cvt.s64.s32 %rd12, %r19;
    mul.wide.s32 %rd13, %r19, 4;
    add.u64 %rd14, %rd8, %rd13;
    and.b32 %r20, %r13, %r12;
    cvt.s64.s32 %rd15, %r20;
    div.s32 %r21, %r12, %r1;
    mul.lo.s32 %r22, %r16, %r21;
    cvt.s64.s32 %rd16, %r22;
    add.u64 %rd17, %rd15, %rd16;
    mul.lo.u64 %rd18, %rd17, 4;
    add.u64 %rd19, %rd14, %rd18;
    add.u64 %rd20, %rd10, %rd14;
    bra.uni $Lt_0_19202;
$Lt_0_19458:
    // dev_packed != dev_nbor: the next dev_nbor row holds a start offset into
    // dev_packed; entries are contiguous there and the stride is t_per_atom.
    add.u64 %rd21, %rd7, %rd8;
    ld.global.s32 %r23, [%rd21+0];
    cvt.s64.s32 %rd22, %r23;
    mul.wide.s32 %rd23, %r23, 4;
    add.u64 %rd24, %rd11, %rd23;
    cvt.s64.s32 %rd25, %r12;
    mul.wide.s32 %rd26, %r12, 4;
    add.u64 %rd19, %rd24, %rd26;
    mov.s32 %r17, %r1;
    add.u64 %rd20, %rd10, %rd24;
$Lt_0_19202:
    // ---- Fetch this atom's coordinates (%f21, %f22, %f23) from pos_tex.
    .loc 16 59 0
    mov.u32 %r24, %r10;
    mov.s32 %r25, 0;
    mov.u32 %r26, %r25;
    mov.s32 %r27, 0;
    mov.u32 %r28, %r27;
    mov.s32 %r29, 0;
    mov.u32 %r30, %r29;
    tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
    mov.f32 %f21, %f17;
    mov.f32 %f22, %f18;
    mov.f32 %f23, %f19;
    // ---- Fetch this atom's q value (%f28) from q_tex.
    .loc 16 60 0
    mov.u32 %r31, %r10;
    mov.s32 %r32, 0;
    mov.u32 %r33, %r32;
    mov.s32 %r34, 0;
    mov.u32 %r35, %r34;
    mov.s32 %r36, 0;
    mov.u32 %r37, %r36;
    tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r31,%r33,%r35,%r37}];
    mov.f32 %f28, %f24;
    // Skip the loop entirely when this thread has no entries.
    setp.ge.u64 %p3, %rd20, %rd19;
    @%p3 bra $Lt_0_27394;
    cvt.s64.s32 %rd27, %r17;
    ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq];
    // Accumulators: %f32 = fx, %f31 = fy, %f30 = fz, %f33 = energy.
    mov.f32 %f30, 0f00000000; // 0
    mov.f32 %f31, 0f00000000; // 0
    mov.f32 %f32, 0f00000000; // 0
    mov.f32 %f33, 0f00000000; // 0
    mov.u64 %rd28, __cuda___cuda_local_var_32541_33_non_const_sp_cl112;
$Lt_0_20226:
    //<loop> Loop body line 60, nesting depth: 1, estimated iterations: unknown
    .loc 16 63 0
    ld.global.s32 %r38, [%rd20+0];
    // The top two bits of the entry select one of the four sp_cl values;
    // %f36 = 1 - sp_cl[bits].
    .loc 16 66 0
    mov.f32 %f34, 0f3f800000; // 1
    shr.s32 %r39, %r38, 30;
    and.b32 %r40, %r39, 3;
    cvt.s64.s32 %rd29, %r40;
    mul.wide.s32 %rd30, %r40, 4;
    add.u64 %rd31, %rd28, %rd30;
    ld.shared.f32 %f35, [%rd31+0];
    sub.ftz.f32 %f36, %f34, %f35;
    // The low 30 bits (mask 0x3FFFFFFF) are the neighbor's texture index.
    .loc 16 69 0
    and.b32 %r41, %r38, 1073741823;
    mov.u32 %r42, %r41;
    mov.s32 %r43, 0;
    mov.u32 %r44, %r43;
    mov.s32 %r45, 0;
    mov.u32 %r46, %r45;
    mov.s32 %r47, 0;
    mov.u32 %r48, %r47;
    tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r42,%r44,%r46,%r48}];
    mov.f32 %f41, %f37;
    mov.f32 %f42, %f38;
    mov.f32 %f43, %f39;
    // Separation: %f45 = dx, %f44 = dy, %f46 = dz, %f49 = dx^2 + dy^2 + dz^2.
    sub.ftz.f32 %f44, %f22, %f42;
    sub.ftz.f32 %f45, %f21, %f41;
    sub.ftz.f32 %f46, %f23, %f43;
    mul.ftz.f32 %f47, %f44, %f44;
    fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
    fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
    // Only pairs with rsq < cut_coulsq contribute.
    setp.lt.ftz.f32 %p4, %f49, %f29;
    @!%p4 bra $Lt_0_20994;
    // %f50 = 1/rsq, %f51 = r, %f53 = g_ewald * r,
    // %f58 = exp(-(g_ewald*r)^2), computed as ex2(-x^2 * log2(e)).
    .loc 20 518 0
    rcp.approx.ftz.f32 %f50, %f49;
    rsqrt.approx.ftz.f32 %f51, %f50;
    ld.param.f32 %f52, [__cudaparm_kernel_pair_g_ewald];
    mul.ftz.f32 %f53, %f52, %f51;
    mul.ftz.f32 %f54, %f53, %f53;
    neg.ftz.f32 %f55, %f54;
    mov.f32 %f56, 0f3fb8aa3b; // 1.4427
    mul.ftz.f32 %f57, %f55, %f56;
    ex2.approx.ftz.f32 %f58, %f57;
    // erfc approximation: t = 1 / (1 + 0.327591 * g_ewald * r), then a
    // degree-5 polynomial in t (Horner form) times the exponential. The
    // constants are those of the Abramowitz & Stegun 7.1.26 formula.
    .loc 16 85 0
    mov.f32 %f59, 0f3f800000; // 1
    mov.f32 %f60, 0f3ea7ba05; // 0.327591
    fma.rn.ftz.f32 %f61, %f60, %f53, %f59;
    rcp.approx.ftz.f32 %f62, %f61;
    mov.f32 %f63, 0f3e827906; // 0.25483
    mov.f32 %f64, 0fbe91a98e; // -0.284497
    mov.f32 %f65, 0f3fb5f0e3; // 1.42141
    mov.f32 %f66, 0fbfba00e3; // -1.45315
    mov.f32 %f67, 0f3f87dc22; // 1.06141
    fma.rn.ftz.f32 %f68, %f67, %f62, %f66;
    fma.rn.ftz.f32 %f69, %f62, %f68, %f65;
    fma.rn.ftz.f32 %f70, %f62, %f69, %f64;
    fma.rn.ftz.f32 %f71, %f62, %f70, %f63;
    mul.ftz.f32 %f72, %f62, %f71;
    mul.ftz.f32 %f73, %f58, %f72;
    // Neighbor's q value (%f78) from q_tex.
    .loc 16 86 0
    mov.u32 %r49, %r41;
    mov.s32 %r50, 0;
    mov.u32 %r51, %r50;
    mov.s32 %r52, 0;
    mov.u32 %r53, %r52;
    mov.s32 %r54, 0;
    mov.u32 %r55, %r54;
    tex.1d.v4.f32.s32 {%f74,%f75,%f76,%f77},[q_tex,{%r49,%r51,%r53,%r55}];
    mov.f32 %f78, %f74;
    // %f82 = qqrd2e * q_i * q_j / r
    // %f88 = (1/rsq) * %f82 * (erfc + 1.12838 * g_ewald*r * exp(..) - (1 - sp_cl))
    //        where 1.12838 = 2/sqrt(pi).
    .loc 16 87 0
    ld.param.f32 %f79, [__cudaparm_kernel_pair_qqrd2e];
    mul.ftz.f32 %f80, %f79, %f28;
    mul.ftz.f32 %f81, %f80, %f78;
    div.approx.ftz.f32 %f82, %f81, %f51;
    mov.f32 %f83, 0f3f906ebb; // 1.12838
    mul.ftz.f32 %f84, %f53, %f83;
    fma.rn.ftz.f32 %f85, %f58, %f84, %f73;
    sub.ftz.f32 %f86, %f85, %f36;
    mul.ftz.f32 %f87, %f82, %f86;
    mul.ftz.f32 %f88, %f50, %f87;
    // Force accumulation: f += d * %f88.
    .loc 16 89 0
    fma.rn.ftz.f32 %f32, %f45, %f88, %f32;
    .loc 16 90 0
    fma.rn.ftz.f32 %f31, %f44, %f88, %f31;
    .loc 16 91 0
    fma.rn.ftz.f32 %f30, %f46, %f88, %f30;
    // Energy: add %f82 * (erfc - (1 - sp_cl)) only when eflag > 0.
    .loc 16 78 0
    sub.ftz.f32 %f89, %f73, %f36;
    fma.rn.ftz.f32 %f90, %f82, %f89, %f33;
    ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];
    mov.s32 %r57, 0;
    setp.gt.s32 %p5, %r56, %r57;
    selp.f32 %f33, %f90, %f33, %p5;
    ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag];
    mov.u32 %r59, 0;
    setp.le.s32 %p6, %r58, %r59;
    @%p6 bra $Lt_0_20994;
    // Virial (vflag > 0): xx, yy, zz, xy, xz, yz products times %f88.
    .loc 16 97 0
    mov.f32 %f91, %f6;
    mul.ftz.f32 %f92, %f45, %f45;
    fma.rn.ftz.f32 %f93, %f88, %f92, %f91;
    mov.f32 %f6, %f93;
    .loc 16 98 0
    mov.f32 %f94, %f8;
    fma.rn.ftz.f32 %f95, %f88, %f47, %f94;
    mov.f32 %f8, %f95;
    .loc 16 99 0
    mov.f32 %f96, %f10;
    mul.ftz.f32 %f97, %f46, %f46;
    fma.rn.ftz.f32 %f98, %f88, %f97, %f96;
    mov.f32 %f10, %f98;
    .loc 16 100 0
    mov.f32 %f99, %f12;
    mul.ftz.f32 %f100, %f44, %f45;
    fma.rn.ftz.f32 %f101, %f88, %f100, %f99;
    mov.f32 %f12, %f101;
    .loc 16 101 0
    mov.f32 %f102, %f14;
    mul.ftz.f32 %f103, %f45, %f46;
    fma.rn.ftz.f32 %f104, %f88, %f103, %f102;
    mov.f32 %f14, %f104;
    .loc 16 102 0
    mul.ftz.f32 %f105, %f44, %f46;
    fma.rn.ftz.f32 %f15, %f88, %f105, %f15;
    mov.f32 %f16, %f15;
$Lt_0_20994:
$Lt_0_20482:
    // Advance by the stride (elements * 4 bytes) and loop while below the end.
    .loc 16 62 0
    mul.lo.u64 %rd32, %rd27, 4;
    add.u64 %rd20, %rd20, %rd32;
    setp.lt.u64 %p7, %rd20, %rd19;
    @%p7 bra $Lt_0_20226;
    bra.uni $Lt_0_19714;
$Lt_0_27394:
    // No entries for this thread: force and energy sums are zero.
    mov.f32 %f30, 0f00000000; // 0
    mov.f32 %f31, 0f00000000; // 0
    mov.f32 %f32, 0f00000000; // 0
    mov.f32 %f33, 0f00000000; // 0
$Lt_0_19714:
    // ---- Reduction across the t_per_atom threads of one atom (skipped when
    // ---- t_per_atom <= 1).
    mov.u32 %r60, 1;
    setp.le.s32 %p8, %r1, %r60;
    @%p8 bra $Lt_0_23810;
    // Publish fx, fy, fz, energy in red_acc rows 0..3 at column tid.x.
    .loc 16 112 0
    mov.u64 %rd33, __cuda___cuda_local_var_32611_37_non_const_red_acc128;
    cvt.s64.s32 %rd34, %r2;
    mul.wide.s32 %rd35, %r2, 4;
    add.u64 %rd36, %rd33, %rd35;
    mov.f32 %f106, %f32;
    st.shared.f32 [%rd36+0], %f106;
    .loc 16 113 0
    mov.f32 %f107, %f31;
    st.shared.f32 [%rd36+512], %f107;
    .loc 16 114 0
    mov.f32 %f108, %f30;
    st.shared.f32 [%rd36+1024], %f108;
    .loc 16 115 0
    mov.f32 %f109, %f33;
    st.shared.f32 [%rd36+1536], %f109;
    // %r65 = t_per_atom / 2 (signed divide, rounding toward zero).
    .loc 16 117 0
    shr.s32 %r61, %r1, 31;
    mov.s32 %r62, 1;
    and.b32 %r63, %r61, %r62;
    add.s32 %r64, %r63, %r1;
    shr.s32 %r65, %r64, 1;
    mov.s32 %r66, %r65;
    mov.u32 %r67, 0;
    setp.ne.u32 %p9, %r65, %r67;
    @!%p9 bra $Lt_0_22274;
$Lt_0_22786:
    // Threads with offset < s add the values held by thread tid.x + s; s halves
    // each pass. NOTE(review): there is no barrier between passes.
    setp.ge.u32 %p10, %r14, %r66;
    @%p10 bra $Lt_0_23042;
    .loc 16 120 0
    add.u32 %r68, %r2, %r66;
    cvt.u64.u32 %rd37, %r68;
    mul.wide.u32 %rd38, %r68, 4;
    add.u64 %rd39, %rd33, %rd38;
    ld.shared.f32 %f110, [%rd39+0];
    add.ftz.f32 %f106, %f110, %f106;
    st.shared.f32 [%rd36+0], %f106;
    ld.shared.f32 %f111, [%rd39+512];
    add.ftz.f32 %f107, %f111, %f107;
    st.shared.f32 [%rd36+512], %f107;
    ld.shared.f32 %f112, [%rd39+1024];
    add.ftz.f32 %f108, %f112, %f108;
    st.shared.f32 [%rd36+1024], %f108;
    ld.shared.f32 %f113, [%rd39+1536];
    add.ftz.f32 %f109, %f113, %f109;
    st.shared.f32 [%rd36+1536], %f109;
$Lt_0_23042:
    .loc 16 117 0
    shr.u32 %r66, %r66, 1;
    mov.u32 %r69, 0;
    setp.ne.u32 %p11, %r66, %r69;
    @%p11 bra $Lt_0_22786;
$Lt_0_22274:
    // Take the reduced force and energy back into the accumulators.
    .loc 16 124 0
    mov.f32 %f32, %f106;
    .loc 16 125 0
    mov.f32 %f31, %f107;
    .loc 16 126 0
    mov.f32 %f30, %f108;
    .loc 16 127 0
    mov.f32 %f33, %f109;
    ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag];
    mov.u32 %r71, 0;
    setp.le.s32 %p12, %r70, %r71;
    @%p12 bra $Lt_0_23810;
    // Same reduction for the six virial terms (vflag > 0), rows 0..5.
    .loc 16 131 0
    mov.f32 %f106, %f6;
    st.shared.f32 [%rd36+0], %f106;
    mov.f32 %f107, %f8;
    st.shared.f32 [%rd36+512], %f107;
    mov.f32 %f108, %f10;
    st.shared.f32 [%rd36+1024], %f108;
    mov.f32 %f109, %f12;
    st.shared.f32 [%rd36+1536], %f109;
    mov.f32 %f114, %f14;
    st.shared.f32 [%rd36+2048], %f114;
    mov.f32 %f115, %f15;
    st.shared.f32 [%rd36+2560], %f115;
    .loc 16 133 0
    mov.s32 %r72, %r65;
    @!%p9 bra $Lt_0_24322;
$Lt_0_24834:
    setp.ge.u32 %p13, %r14, %r72;
    @%p13 bra $Lt_0_25090;
    .loc 16 136 0
    add.u32 %r73, %r2, %r72;
    cvt.u64.u32 %rd40, %r73;
    mul.wide.u32 %rd41, %r73, 4;
    add.u64 %rd42, %rd33, %rd41;
    ld.shared.f32 %f116, [%rd42+0];
    add.ftz.f32 %f106, %f116, %f106;
    st.shared.f32 [%rd36+0], %f106;
    ld.shared.f32 %f117, [%rd42+512];
    add.ftz.f32 %f107, %f117, %f107;
    st.shared.f32 [%rd36+512], %f107;
    ld.shared.f32 %f118, [%rd42+1024];
    add.ftz.f32 %f108, %f118, %f108;
    st.shared.f32 [%rd36+1024], %f108;
    ld.shared.f32 %f119, [%rd42+1536];
    add.ftz.f32 %f109, %f119, %f109;
    st.shared.f32 [%rd36+1536], %f109;
    ld.shared.f32 %f120, [%rd42+2048];
    add.ftz.f32 %f114, %f120, %f114;
    st.shared.f32 [%rd36+2048], %f114;
    ld.shared.f32 %f121, [%rd42+2560];
    add.ftz.f32 %f115, %f121, %f115;
    st.shared.f32 [%rd36+2560], %f115;
$Lt_0_25090:
    .loc 16 133 0
    shr.u32 %r72, %r72, 1;
    mov.u32 %r74, 0;
    setp.ne.u32 %p14, %r72, %r74;
    @%p14 bra $Lt_0_24834;
$Lt_0_24322:
    .loc 16 141 0
    mov.f32 %f6, %f106;
    mov.f32 %f8, %f107;
    mov.f32 %f10, %f108;
    mov.f32 %f12, %f109;
    mov.f32 %f14, %f114;
    mov.f32 %f16, %f115;
$Lt_0_23810:
$Lt_0_21762:
    // ---- Only the thread with offset 0 stores results.
    mov.u32 %r75, 0;
    setp.ne.s32 %p15, %r14, %r75;
    @%p15 bra $Lt_0_25858;
    // %rd44 = &engv[ii] (4-byte elements).
    .loc 16 147 0
    ld.param.u64 %rd43, [__cudaparm_kernel_pair_engv];
    add.u64 %rd44, %rd43, %rd3;
    ld.param.s32 %r76, [__cudaparm_kernel_pair_eflag];
    mov.u32 %r77, 0;
    setp.le.s32 %p16, %r76, %r77;
    @%p16 bra $Lt_0_26370;
    // eflag > 0: store 0, then the energy inum elements later, and advance
    // the pointer past both.
    .loc 16 149 0
    mov.f32 %f122, 0f00000000; // 0
    st.global.f32 [%rd44+0], %f122;
    .loc 16 150 0
    cvt.s64.s32 %rd45, %r9;
    mul.wide.s32 %rd46, %r9, 4;
    add.u64 %rd47, %rd46, %rd44;
    .loc 16 151 0
    st.global.f32 [%rd47+0], %f33;
    .loc 16 152 0
    add.u64 %rd44, %rd46, %rd47;
$Lt_0_26370:
    ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag];
    mov.u32 %r79, 0;
    setp.le.s32 %p17, %r78, %r79;
    @%p17 bra $Lt_0_26882;
    // vflag > 0: store the six virial terms, each inum elements apart.
    .loc 16 156 0
    mov.f32 %f123, %f6;
    st.global.f32 [%rd44+0], %f123;
    .loc 16 157 0
    cvt.s64.s32 %rd48, %r9;
    mul.wide.s32 %rd49, %r9, 4;
    add.u64 %rd50, %rd49, %rd44;
    .loc 16 156 0
    mov.f32 %f124, %f8;
    st.global.f32 [%rd50+0], %f124;
    .loc 16 157 0
    add.u64 %rd51, %rd49, %rd50;
    .loc 16 156 0
    mov.f32 %f125, %f10;
    st.global.f32 [%rd51+0], %f125;
    .loc 16 157 0
    add.u64 %rd52, %rd49, %rd51;
    .loc 16 156 0
    mov.f32 %f126, %f12;
    st.global.f32 [%rd52+0], %f126;
    .loc 16 157 0
    add.u64 %rd44, %rd49, %rd52;
    .loc 16 156 0
    mov.f32 %f127, %f14;
    st.global.f32 [%rd44+0], %f127;
    mov.f32 %f128, %f16;
    add.u64 %rd53, %rd49, %rd44;
    st.global.f32 [%rd53+0], %f128;
$Lt_0_26882:
    // ---- ans record ii (16 bytes) = {fx, fy, fz, 0}.
    .loc 16 160 0
    ld.param.u64 %rd54, [__cudaparm_kernel_pair_ans];
    mul.lo.u64 %rd55, %rd2, 16;
    add.u64 %rd56, %rd54, %rd55;
    // The generated code read the never-written %f130 here; store a defined
    // zero in the fourth component instead.
    mov.f32 %f129, 0f00000000; // 0
    st.global.v4.f32 [%rd56+0], {%f32,%f31,%f30,%f129};
$Lt_0_25858:
$Lt_0_18690:
    .loc 16 163 0
    exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------------------
// kernel_pair_fast  (nvopencc output for lal_coul_long.cu, target sm_20)
//
// Same computation as kernel_pair, with a different start-up: threads with
// tid.x <= 3 each copy one sp_cl_in value into shared memory and the whole
// block then synchronises with bar.sync 0 before any thread can exit.
//
// For one atom slot ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// with ii < inum, the kernel walks that slot's neighbor entries, accumulates
// a pair force built from an erfc polynomial approximation, optionally an
// energy (eflag > 0) and six virial terms (vflag > 0), sums the partial
// results of the t_per_atom threads that share the slot through shared
// memory, and has the thread with offset 0 store the totals.
//
// Parameters that are read: sp_cl_in, dev_nbor, dev_packed, ans, engv,
//   eflag, vflag, inum, nbor_pitch, cut_coulsq, qqrd2e, g_ewald, t_per_atom.
// Parameters that are never read here: x_, lj1_in, lj3_in, q_
//   (coordinates and q values are fetched from pos_tex / q_tex instead).
//
// Outputs:
//   engv : starting at element ii, values spaced inum floats apart:
//          [0, energy] when eflag > 0, followed by six virial terms when
//          vflag > 0.
//   ans  : 16-byte record ii = {fx, fy, fz, 0}.
//
// Fix relative to the generated code: the fourth component of the ans
// record used to be copied from %f127, a register that is never assigned,
// so an undefined value reached global memory. It is now an explicit 0.
//
// Assumptions visible in the arithmetic (TODO confirm with the host side):
//   * offset = tid.x & (t_per_atom - 1), so t_per_atom is presumably a
//     power of two.
//   * red_acc rows are 512 bytes apart and indexed by tid.x, so the block
//     is presumably launched with at most 128 threads.
//   * the shared-memory reduction has no bar.sync between steps; it
//     presumably relies on the t_per_atom threads of one atom executing in
//     lock step within one warp.
//-----------------------------------------------------------------------
.entry kernel_pair_fast (
    .param .u64 __cudaparm_kernel_pair_fast_x_,
    .param .u64 __cudaparm_kernel_pair_fast_lj1_in,
    .param .u64 __cudaparm_kernel_pair_fast_lj3_in,
    .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in,
    .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
    .param .u64 __cudaparm_kernel_pair_fast_dev_packed,
    .param .u64 __cudaparm_kernel_pair_fast_ans,
    .param .u64 __cudaparm_kernel_pair_fast_engv,
    .param .s32 __cudaparm_kernel_pair_fast_eflag,
    .param .s32 __cudaparm_kernel_pair_fast_vflag,
    .param .s32 __cudaparm_kernel_pair_fast_inum,
    .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
    .param .u64 __cudaparm_kernel_pair_fast_q_,
    .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,
    .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,
    .param .f32 __cudaparm_kernel_pair_fast_g_ewald,
    .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
    .reg .u32 %r<82>;
    .reg .u64 %rd<62>;
    .reg .f32 %f<129>;
    .reg .pred %p<20>;
    // sp_cl: four floats copied from sp_cl_in, one per thread for tid.x 0..3.
    .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_sp_cl3304[16];
    // red_acc: reduction scratch, used as 6 rows of 512 bytes indexed by tid.x.
    .shared .align 4 .b8 __cuda___cuda_local_var_32748_37_non_const_red_acc3320[3072];
    // __cuda_local_var_32683_10_non_const_f = 48
    // __cuda_local_var_32685_9_non_const_virial = 16
    .loc 16 173 0
$LDWbegin_kernel_pair_fast:
    // ---- Threads with tid.x <= 3 copy sp_cl_in[tid.x] into shared sp_cl.
    cvt.s32.u32 %r1, %tid.x;
    mov.u32 %r2, 3;
    setp.gt.s32 %p1, %r1, %r2;
    @%p1 bra $Lt_1_19458;
    .loc 16 179 0
    mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;
    cvt.s64.s32 %rd2, %r1;
    mul.wide.s32 %rd3, %r1, 4;
    ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in];
    add.u64 %rd5, %rd4, %rd3;
    ld.global.f32 %f1, [%rd5+0];
    add.u64 %rd6, %rd3, %rd1;
    st.shared.f32 [%rd6+0], %f1;
$Lt_1_19458:
    mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;
    // ---- Zero the six virial accumulators (%f3 %f5 %f7 %f9 %f11 %f12/%f13).
    .loc 16 186 0
    mov.f32 %f2, 0f00000000; // 0
    mov.f32 %f3, %f2;
    mov.f32 %f4, 0f00000000; // 0
    mov.f32 %f5, %f4;
    mov.f32 %f6, 0f00000000; // 0
    mov.f32 %f7, %f6;
    mov.f32 %f8, 0f00000000; // 0
    mov.f32 %f9, %f8;
    mov.f32 %f10, 0f00000000; // 0
    mov.f32 %f11, %f10;
    mov.f32 %f12, 0f00000000; // 0
    mov.f32 %f13, %f12;
    // Block-wide barrier: the sp_cl values are visible to all threads after it.
    .loc 16 188 0
    bar.sync 0;
    // ---- ii (%r9) = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom);
    // ---- threads with ii >= inum leave.
    ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom];
    div.s32 %r4, %r1, %r3;
    cvt.s32.u32 %r5, %ntid.x;
    div.s32 %r6, %r5, %r3;
    cvt.s32.u32 %r7, %ctaid.x;
    mul.lo.s32 %r8, %r7, %r6;
    add.s32 %r9, %r4, %r8;
    ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];
    setp.ge.s32 %p2, %r9, %r10;
    @%p2 bra $Lt_1_27138;
    // ---- Neighbor bookkeeping.
    // %r11 = dev_nbor[ii]               (index used for the pos_tex/q_tex fetch)
    // %r13 = dev_nbor[ii + nbor_pitch]  (number of neighbor entries)
    // %r15 = tid.x & (t_per_atom - 1)   (this thread's offset within its atom)
    // On exit: %rd25 = first entry for this thread, %rd24 = end pointer,
    //          %r18 = stride between consecutive entries, in elements.
    .loc 16 193 0
    cvt.s64.s32 %rd7, %r9;
    mul.wide.s32 %rd8, %r9, 4;
    ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor];
    add.u64 %rd10, %rd8, %rd9;
    ld.global.s32 %r11, [%rd10+0];
    ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];
    cvt.s64.s32 %rd11, %r12;
    mul.wide.s32 %rd12, %r12, 4;
    add.u64 %rd13, %rd12, %rd10;
    ld.global.s32 %r13, [%rd13+0];
    sub.s32 %r14, %r3, 1;
    and.b32 %r15, %r14, %r1;
    cvt.s64.s32 %rd14, %r15;
    mul.wide.s32 %rd15, %r15, 4;
    ld.param.u64 %rd16, [__cudaparm_kernel_pair_fast_dev_packed];
    setp.ne.u64 %p3, %rd16, %rd9;
    @%p3 bra $Lt_1_20738;
    // dev_packed == dev_nbor: entries live in dev_nbor itself.
    // stride = nbor_pitch * t_per_atom; the list starts
    // nbor_pitch + ii * (t_per_atom - 1) elements past the count.
    cvt.s32.s64 %r16, %rd11;
    mul.lo.s32 %r17, %r16, %r3;
    mov.s32 %r18, %r17;
    mul.lo.s32 %r19, %r14, %r9;
    add.s32 %r20, %r16, %r19;
    cvt.s64.s32 %rd17, %r20;
    mul.wide.s32 %rd18, %r20, 4;
    add.u64 %rd19, %rd13, %rd18;
    and.b32 %r21, %r14, %r13;
    cvt.s64.s32 %rd20, %r21;
    div.s32 %r22, %r13, %r3;
    mul.lo.s32 %r23, %r17, %r22;
    cvt.s64.s32 %rd21, %r23;
    add.u64 %rd22, %rd20, %rd21;
    mul.lo.u64 %rd23, %rd22, 4;
    add.u64 %rd24, %rd19, %rd23;
    add.u64 %rd25, %rd15, %rd19;
    bra.uni $Lt_1_20482;
$Lt_1_20738:
    // dev_packed != dev_nbor: the next dev_nbor row holds a start offset into
    // dev_packed; entries are contiguous there and the stride is t_per_atom.
    add.u64 %rd26, %rd12, %rd13;
    ld.global.s32 %r24, [%rd26+0];
    cvt.s64.s32 %rd27, %r24;
    mul.wide.s32 %rd28, %r24, 4;
    add.u64 %rd29, %rd16, %rd28;
    cvt.s64.s32 %rd30, %r13;
    mul.wide.s32 %rd31, %r13, 4;
    add.u64 %rd24, %rd29, %rd31;
    mov.s32 %r18, %r3;
    add.u64 %rd25, %rd15, %rd29;
$Lt_1_20482:
    // ---- Fetch this atom's coordinates (%f18, %f19, %f20) from pos_tex.
    .loc 16 196 0
    mov.u32 %r25, %r11;
    mov.s32 %r26, 0;
    mov.u32 %r27, %r26;
    mov.s32 %r28, 0;
    mov.u32 %r29, %r28;
    mov.s32 %r30, 0;
    mov.u32 %r31, %r30;
    tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r25,%r27,%r29,%r31}];
    mov.f32 %f18, %f14;
    mov.f32 %f19, %f15;
    mov.f32 %f20, %f16;
    // ---- Fetch this atom's q value (%f25) from q_tex.
    .loc 16 197 0
    mov.u32 %r32, %r11;
    mov.s32 %r33, 0;
    mov.u32 %r34, %r33;
    mov.s32 %r35, 0;
    mov.u32 %r36, %r35;
    mov.s32 %r37, 0;
    mov.u32 %r38, %r37;
    tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r32,%r34,%r36,%r38}];
    mov.f32 %f25, %f21;
    // Skip the loop entirely when this thread has no entries.
    setp.ge.u64 %p4, %rd25, %rd24;
    @%p4 bra $Lt_1_28674;
    cvt.s64.s32 %rd32, %r18;
    ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq];
    // Accumulators: %f29 = fx, %f28 = fy, %f27 = fz, %f30 = energy.
    mov.f32 %f27, 0f00000000; // 0
    mov.f32 %f28, 0f00000000; // 0
    mov.f32 %f29, 0f00000000; // 0
    mov.f32 %f30, 0f00000000; // 0
$Lt_1_21506:
    //<loop> Loop body line 197, nesting depth: 1, estimated iterations: unknown
    .loc 16 200 0
    ld.global.s32 %r39, [%rd25+0];
    // The top two bits of the entry select one of the four sp_cl values;
    // %f33 = 1 - sp_cl[bits].
    .loc 16 203 0
    mov.f32 %f31, 0f3f800000; // 1
    shr.s32 %r40, %r39, 30;
    and.b32 %r41, %r40, 3;
    cvt.s64.s32 %rd33, %r41;
    mul.wide.s32 %rd34, %r41, 4;
    add.u64 %rd35, %rd1, %rd34;
    ld.shared.f32 %f32, [%rd35+0];
    sub.ftz.f32 %f33, %f31, %f32;
    // The low 30 bits (mask 0x3FFFFFFF) are the neighbor's texture index.
    .loc 16 206 0
    and.b32 %r42, %r39, 1073741823;
    mov.u32 %r43, %r42;
    mov.s32 %r44, 0;
    mov.u32 %r45, %r44;
    mov.s32 %r46, 0;
    mov.u32 %r47, %r46;
    mov.s32 %r48, 0;
    mov.u32 %r49, %r48;
    tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r43,%r45,%r47,%r49}];
    mov.f32 %f38, %f34;
    mov.f32 %f39, %f35;
    mov.f32 %f40, %f36;
    // Separation: %f42 = dx, %f41 = dy, %f43 = dz, %f46 = dx^2 + dy^2 + dz^2.
    sub.ftz.f32 %f41, %f19, %f39;
    sub.ftz.f32 %f42, %f18, %f38;
    sub.ftz.f32 %f43, %f20, %f40;
    mul.ftz.f32 %f44, %f41, %f41;
    fma.rn.ftz.f32 %f45, %f42, %f42, %f44;
    fma.rn.ftz.f32 %f46, %f43, %f43, %f45;
    // Only pairs with rsq < cut_coulsq contribute.
    setp.lt.ftz.f32 %p5, %f46, %f26;
    @!%p5 bra $Lt_1_22274;
    // %f47 = 1/rsq, %f48 = r, %f50 = g_ewald * r,
    // %f55 = exp(-(g_ewald*r)^2), computed as ex2(-x^2 * log2(e)).
    .loc 20 518 0
    rcp.approx.ftz.f32 %f47, %f46;
    rsqrt.approx.ftz.f32 %f48, %f47;
    ld.param.f32 %f49, [__cudaparm_kernel_pair_fast_g_ewald];
    mul.ftz.f32 %f50, %f49, %f48;
    mul.ftz.f32 %f51, %f50, %f50;
    neg.ftz.f32 %f52, %f51;
    mov.f32 %f53, 0f3fb8aa3b; // 1.4427
    mul.ftz.f32 %f54, %f52, %f53;
    ex2.approx.ftz.f32 %f55, %f54;
    // erfc approximation: t = 1 / (1 + 0.327591 * g_ewald * r), then a
    // degree-5 polynomial in t (Horner form) times the exponential. The
    // constants are those of the Abramowitz & Stegun 7.1.26 formula.
    .loc 16 222 0
    mov.f32 %f56, 0f3f800000; // 1
    mov.f32 %f57, 0f3ea7ba05; // 0.327591
    fma.rn.ftz.f32 %f58, %f57, %f50, %f56;
    rcp.approx.ftz.f32 %f59, %f58;
    mov.f32 %f60, 0f3e827906; // 0.25483
    mov.f32 %f61, 0fbe91a98e; // -0.284497
    mov.f32 %f62, 0f3fb5f0e3; // 1.42141
    mov.f32 %f63, 0fbfba00e3; // -1.45315
    mov.f32 %f64, 0f3f87dc22; // 1.06141
    fma.rn.ftz.f32 %f65, %f64, %f59, %f63;
    fma.rn.ftz.f32 %f66, %f59, %f65, %f62;
    fma.rn.ftz.f32 %f67, %f59, %f66, %f61;
    fma.rn.ftz.f32 %f68, %f59, %f67, %f60;
    mul.ftz.f32 %f69, %f59, %f68;
    mul.ftz.f32 %f70, %f55, %f69;
    // Neighbor's q value (%f75) from q_tex.
    .loc 16 223 0
    mov.u32 %r50, %r42;
    mov.s32 %r51, 0;
    mov.u32 %r52, %r51;
    mov.s32 %r53, 0;
    mov.u32 %r54, %r53;
    mov.s32 %r55, 0;
    mov.u32 %r56, %r55;
    tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r50,%r52,%r54,%r56}];
    mov.f32 %f75, %f71;
    // %f79 = qqrd2e * q_i * q_j / r
    // %f85 = (1/rsq) * %f79 * (erfc + 1.12838 * g_ewald*r * exp(..) - (1 - sp_cl))
    //        where 1.12838 = 2/sqrt(pi).
    .loc 16 224 0
    ld.param.f32 %f76, [__cudaparm_kernel_pair_fast_qqrd2e];
    mul.ftz.f32 %f77, %f76, %f25;
    mul.ftz.f32 %f78, %f77, %f75;
    div.approx.ftz.f32 %f79, %f78, %f48;
    mov.f32 %f80, 0f3f906ebb; // 1.12838
    mul.ftz.f32 %f81, %f50, %f80;
    fma.rn.ftz.f32 %f82, %f55, %f81, %f70;
    sub.ftz.f32 %f83, %f82, %f33;
    mul.ftz.f32 %f84, %f79, %f83;
    mul.ftz.f32 %f85, %f47, %f84;
    // Force accumulation: f += d * %f85.
    .loc 16 226 0
    fma.rn.ftz.f32 %f29, %f42, %f85, %f29;
    .loc 16 227 0
    fma.rn.ftz.f32 %f28, %f41, %f85, %f28;
    .loc 16 228 0
    fma.rn.ftz.f32 %f27, %f43, %f85, %f27;
    // Energy: add %f79 * (erfc - (1 - sp_cl)) only when eflag > 0.
    .loc 16 215 0
    sub.ftz.f32 %f86, %f70, %f33;
    fma.rn.ftz.f32 %f87, %f79, %f86, %f30;
    ld.param.s32 %r57, [__cudaparm_kernel_pair_fast_eflag];
    mov.s32 %r58, 0;
    setp.gt.s32 %p6, %r57, %r58;
    selp.f32 %f30, %f87, %f30, %p6;
    ld.param.s32 %r59, [__cudaparm_kernel_pair_fast_vflag];
    mov.u32 %r60, 0;
    setp.le.s32 %p7, %r59, %r60;
    @%p7 bra $Lt_1_22274;
    // Virial (vflag > 0): xx, yy, zz, xy, xz, yz products times %f85.
    .loc 16 234 0
    mov.f32 %f88, %f3;
    mul.ftz.f32 %f89, %f42, %f42;
    fma.rn.ftz.f32 %f90, %f85, %f89, %f88;
    mov.f32 %f3, %f90;
    .loc 16 235 0
    mov.f32 %f91, %f5;
    fma.rn.ftz.f32 %f92, %f85, %f44, %f91;
    mov.f32 %f5, %f92;
    .loc 16 236 0
    mov.f32 %f93, %f7;
    mul.ftz.f32 %f94, %f43, %f43;
    fma.rn.ftz.f32 %f95, %f85, %f94, %f93;
    mov.f32 %f7, %f95;
    .loc 16 237 0
    mov.f32 %f96, %f9;
    mul.ftz.f32 %f97, %f41, %f42;
    fma.rn.ftz.f32 %f98, %f85, %f97, %f96;
    mov.f32 %f9, %f98;
    .loc 16 238 0
    mov.f32 %f99, %f11;
    mul.ftz.f32 %f100, %f42, %f43;
    fma.rn.ftz.f32 %f101, %f85, %f100, %f99;
    mov.f32 %f11, %f101;
    .loc 16 239 0
    mul.ftz.f32 %f102, %f41, %f43;
    fma.rn.ftz.f32 %f12, %f85, %f102, %f12;
    mov.f32 %f13, %f12;
$Lt_1_22274:
$Lt_1_21762:
    // Advance by the stride (elements * 4 bytes) and loop while below the end.
    .loc 16 199 0
    mul.lo.u64 %rd36, %rd32, 4;
    add.u64 %rd25, %rd25, %rd36;
    setp.lt.u64 %p8, %rd25, %rd24;
    @%p8 bra $Lt_1_21506;
    bra.uni $Lt_1_20994;
$Lt_1_28674:
    // No entries for this thread: force and energy sums are zero.
    mov.f32 %f27, 0f00000000; // 0
    mov.f32 %f28, 0f00000000; // 0
    mov.f32 %f29, 0f00000000; // 0
    mov.f32 %f30, 0f00000000; // 0
$Lt_1_20994:
    // ---- Reduction across the t_per_atom threads of one atom (skipped when
    // ---- t_per_atom <= 1).
    mov.u32 %r61, 1;
    setp.le.s32 %p9, %r3, %r61;
    @%p9 bra $Lt_1_25090;
    // Publish fx, fy, fz, energy in red_acc rows 0..3 at column tid.x.
    .loc 16 249 0
    mov.u64 %rd37, __cuda___cuda_local_var_32748_37_non_const_red_acc3320;
    cvt.s64.s32 %rd38, %r1;
    mul.wide.s32 %rd39, %r1, 4;
    add.u64 %rd40, %rd37, %rd39;
    mov.f32 %f103, %f29;
    st.shared.f32 [%rd40+0], %f103;
    .loc 16 250 0
    mov.f32 %f104, %f28;
    st.shared.f32 [%rd40+512], %f104;
    .loc 16 251 0
    mov.f32 %f105, %f27;
    st.shared.f32 [%rd40+1024], %f105;
    .loc 16 252 0
    mov.f32 %f106, %f30;
    st.shared.f32 [%rd40+1536], %f106;
    // %r66 = t_per_atom / 2 (signed divide, rounding toward zero).
    .loc 16 254 0
    shr.s32 %r62, %r3, 31;
    mov.s32 %r63, 1;
    and.b32 %r64, %r62, %r63;
    add.s32 %r65, %r64, %r3;
    shr.s32 %r66, %r65, 1;
    mov.s32 %r67, %r66;
    mov.u32 %r68, 0;
    setp.ne.u32 %p10, %r66, %r68;
    @!%p10 bra $Lt_1_23554;
$Lt_1_24066:
    // Threads with offset < s add the values held by thread tid.x + s; s halves
    // each pass. NOTE(review): there is no barrier between passes.
    setp.ge.u32 %p11, %r15, %r67;
    @%p11 bra $Lt_1_24322;
    .loc 16 257 0
    add.u32 %r69, %r1, %r67;
    cvt.u64.u32 %rd41, %r69;
    mul.wide.u32 %rd42, %r69, 4;
    add.u64 %rd43, %rd37, %rd42;
    ld.shared.f32 %f107, [%rd43+0];
    add.ftz.f32 %f103, %f107, %f103;
    st.shared.f32 [%rd40+0], %f103;
    ld.shared.f32 %f108, [%rd43+512];
    add.ftz.f32 %f104, %f108, %f104;
    st.shared.f32 [%rd40+512], %f104;
    ld.shared.f32 %f109, [%rd43+1024];
    add.ftz.f32 %f105, %f109, %f105;
    st.shared.f32 [%rd40+1024], %f105;
    ld.shared.f32 %f110, [%rd43+1536];
    add.ftz.f32 %f106, %f110, %f106;
    st.shared.f32 [%rd40+1536], %f106;
$Lt_1_24322:
    .loc 16 254 0
    shr.u32 %r67, %r67, 1;
    mov.u32 %r70, 0;
    setp.ne.u32 %p12, %r67, %r70;
    @%p12 bra $Lt_1_24066;
$Lt_1_23554:
    // Take the reduced force and energy back into the accumulators.
    .loc 16 261 0
    mov.f32 %f29, %f103;
    .loc 16 262 0
    mov.f32 %f28, %f104;
    .loc 16 263 0
    mov.f32 %f27, %f105;
    .loc 16 264 0
    mov.f32 %f30, %f106;
    ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
    mov.u32 %r72, 0;
    setp.le.s32 %p13, %r71, %r72;
    @%p13 bra $Lt_1_25090;
    // Same reduction for the six virial terms (vflag > 0), rows 0..5.
    .loc 16 268 0
    mov.f32 %f103, %f3;
    st.shared.f32 [%rd40+0], %f103;
    mov.f32 %f104, %f5;
    st.shared.f32 [%rd40+512], %f104;
    mov.f32 %f105, %f7;
    st.shared.f32 [%rd40+1024], %f105;
    mov.f32 %f106, %f9;
    st.shared.f32 [%rd40+1536], %f106;
    mov.f32 %f111, %f11;
    st.shared.f32 [%rd40+2048], %f111;
    mov.f32 %f112, %f12;
    st.shared.f32 [%rd40+2560], %f112;
    .loc 16 270 0
    mov.s32 %r73, %r66;
    @!%p10 bra $Lt_1_25602;
$Lt_1_26114:
    setp.ge.u32 %p14, %r15, %r73;
    @%p14 bra $Lt_1_26370;
    .loc 16 273 0
    add.u32 %r74, %r1, %r73;
    cvt.u64.u32 %rd44, %r74;
    mul.wide.u32 %rd45, %r74, 4;
    add.u64 %rd46, %rd37, %rd45;
    ld.shared.f32 %f113, [%rd46+0];
    add.ftz.f32 %f103, %f113, %f103;
    st.shared.f32 [%rd40+0], %f103;
    ld.shared.f32 %f114, [%rd46+512];
    add.ftz.f32 %f104, %f114, %f104;
    st.shared.f32 [%rd40+512], %f104;
    ld.shared.f32 %f115, [%rd46+1024];
    add.ftz.f32 %f105, %f115, %f105;
    st.shared.f32 [%rd40+1024], %f105;
    ld.shared.f32 %f116, [%rd46+1536];
    add.ftz.f32 %f106, %f116, %f106;
    st.shared.f32 [%rd40+1536], %f106;
    ld.shared.f32 %f117, [%rd46+2048];
    add.ftz.f32 %f111, %f117, %f111;
    st.shared.f32 [%rd40+2048], %f111;
    ld.shared.f32 %f118, [%rd46+2560];
    add.ftz.f32 %f112, %f118, %f112;
    st.shared.f32 [%rd40+2560], %f112;
$Lt_1_26370:
    .loc 16 270 0
    shr.u32 %r73, %r73, 1;
    mov.u32 %r75, 0;
    setp.ne.u32 %p15, %r73, %r75;
    @%p15 bra $Lt_1_26114;
$Lt_1_25602:
    .loc 16 278 0
    mov.f32 %f3, %f103;
    mov.f32 %f5, %f104;
    mov.f32 %f7, %f105;
    mov.f32 %f9, %f106;
    mov.f32 %f11, %f111;
    mov.f32 %f13, %f112;
$Lt_1_25090:
$Lt_1_23042:
    // ---- Only the thread with offset 0 stores results.
    mov.u32 %r76, 0;
    setp.ne.s32 %p16, %r15, %r76;
    @%p16 bra $Lt_1_27138;
    // %rd48 = &engv[ii] (4-byte elements).
    .loc 16 284 0
    ld.param.u64 %rd47, [__cudaparm_kernel_pair_fast_engv];
    add.u64 %rd48, %rd47, %rd8;
    ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag];
    mov.u32 %r78, 0;
    setp.le.s32 %p17, %r77, %r78;
    @%p17 bra $Lt_1_27650;
    // eflag > 0: store 0, then the energy inum elements later, and advance
    // the pointer past both.
    .loc 16 286 0
    mov.f32 %f119, 0f00000000; // 0
    st.global.f32 [%rd48+0], %f119;
    .loc 16 287 0
    cvt.s64.s32 %rd49, %r10;
    mul.wide.s32 %rd50, %r10, 4;
    add.u64 %rd51, %rd50, %rd48;
    .loc 16 288 0
    st.global.f32 [%rd51+0], %f30;
    .loc 16 289 0
    add.u64 %rd48, %rd50, %rd51;
$Lt_1_27650:
    ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag];
    mov.u32 %r80, 0;
    setp.le.s32 %p18, %r79, %r80;
    @%p18 bra $Lt_1_28162;
    // vflag > 0: store the six virial terms, each inum elements apart.
    .loc 16 293 0
    mov.f32 %f120, %f3;
    st.global.f32 [%rd48+0], %f120;
    .loc 16 294 0
    cvt.s64.s32 %rd52, %r10;
    mul.wide.s32 %rd53, %r10, 4;
    add.u64 %rd54, %rd53, %rd48;
    .loc 16 293 0
    mov.f32 %f121, %f5;
    st.global.f32 [%rd54+0], %f121;
    .loc 16 294 0
    add.u64 %rd55, %rd53, %rd54;
    .loc 16 293 0
    mov.f32 %f122, %f7;
    st.global.f32 [%rd55+0], %f122;
    .loc 16 294 0
    add.u64 %rd56, %rd53, %rd55;
    .loc 16 293 0
    mov.f32 %f123, %f9;
    st.global.f32 [%rd56+0], %f123;
    .loc 16 294 0
    add.u64 %rd48, %rd53, %rd56;
    .loc 16 293 0
    mov.f32 %f124, %f11;
    st.global.f32 [%rd48+0], %f124;
    mov.f32 %f125, %f13;
    add.u64 %rd57, %rd53, %rd48;
    st.global.f32 [%rd57+0], %f125;
$Lt_1_28162:
    // ---- ans record ii (16 bytes) = {fx, fy, fz, 0}.
    .loc 16 297 0
    ld.param.u64 %rd58, [__cudaparm_kernel_pair_fast_ans];
    mul.lo.u64 %rd59, %rd7, 16;
    add.u64 %rd60, %rd58, %rd59;
    // The generated code read the never-written %f127 here; store a defined
    // zero in the fourth component instead.
    mov.f32 %f126, 0f00000000; // 0
    st.global.v4.f32 [%rd60+0], {%f29,%f28,%f27,%f126};
$Lt_1_27138:
$Lt_1_19970:
    .loc 16 300 0
    exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast
Event Timeline
Log In to Comment