diff --git a/lib/gpu/cmm_cut_gpu_kernel.ptx b/lib/gpu/cmm_cut_gpu_kernel.ptx new file mode 100644 index 000000000..b39dbcc72 --- /dev/null +++ b/lib/gpu/cmm_cut_gpu_kernel.ptx @@ -0,0 +1,1036 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000c02c_00000000-9_cmm_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.cW08s4) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000c02c_00000000-8_cmm_cut_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "cmm_cut_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<72>; + .reg .u64 %rd<62>; + .reg .f32 %f<111>; + .reg .pred %p<21>; + .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16]; + .shared .align 4 .b8 __cuda___cuda_local_var_32590_35_non_const_red_acc108[3072]; + // __cuda_local_var_32504_10_non_const_f = 48 + // __cuda_local_var_32508_9_non_const_virial = 16 + .loc 16 88 0 +$LDWbegin_kernel_pair: + .loc 16 95 0 + ld.param.u64 %rd1, 
[__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 96 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 97 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 98 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; + .loc 16 107 0 + mov.f32 %f5, 0f00000000; // 0 + mov.f32 %f6, %f5; + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, %f7; + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_20738; + .loc 16 114 0 + ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd2, %r11; + mul.wide.s32 %rd3, %r11, 4; + cvt.s64.s32 %rd4, %r9; + mul.wide.s32 %rd5, %r9, 4; + ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd7, %rd5, %rd6; + add.u64 %rd8, %rd3, %rd7; + ld.global.s32 %r12, [%rd8+0]; + add.u64 %rd9, %rd3, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd6; + @%p2 bra $Lt_0_21250; + .loc 16 120 0 + cvt.s32.s64 %r13, %rd2; + mul.lo.s32 %r14, %r13, %r12; + cvt.s64.s32 %rd11, %r14; + mul.wide.s32 %rd12, %r14, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 121 0 + mul.lo.s32 %r15, %r6, %r13; + cvt.s64.s32 %rd14, %r15; + mul.wide.s32 %rd15, %r15, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 122 0 + mul.lo.s32 %r16, %r13, %r1; + bra.uni $Lt_0_20994; +$Lt_0_21250: + .loc 16 124 0 + ld.global.s32 %r17, [%rd9+0]; + cvt.s64.s32 %rd17, %r17; + mul.wide.s32 %rd18, %r17, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 125 0 + cvt.s64.s32 %rd20, %r12; + mul.wide.s32 %rd21, %r12, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 126 0 + mov.s32 %r16, %r1; + .loc 16 127 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_20994: + .loc 16 130 0 + ld.global.s32 %r18, [%rd7+0]; + mov.u32 %r19, %r18; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f21, %f17; + mov.f32 %f22, %f18; + mov.f32 %f23, %f19; + mov.f32 %f24, %f20; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_30722; + cvt.rzi.ftz.s32.f32 %r26, %f24; + cvt.s64.s32 %rd24, %r16; + ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r28, %r27, %r26; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1]; + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92; +$Lt_0_22018: + //<loop> Loop body line 130, nesting depth: 1, estimated iterations: unknown + .loc 16 136 0 + ld.global.s32 %r29, [%rd16+0]; + .loc 16 137 0 + shr.s32 %r30, %r29, 30; + and.b32 %r31, %r30, 3; + cvt.s64.s32 %rd27, %r31; + mul.wide.s32 %rd28, %r31, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f29, [%rd29+0]; + .loc 16 140 0 + and.b32 %r32, %r29, 1073741823; + mov.u32 %r33, %r32; + mov.s32 %r34, 0; + mov.u32 %r35, %r34; + mov.s32 %r36, 0; + mov.u32 %r37, 
%r36; + mov.s32 %r38, 0; + mov.u32 %r39, %r38; + tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}]; + mov.f32 %f34, %f30; + mov.f32 %f35, %f31; + mov.f32 %f36, %f32; + mov.f32 %f37, %f33; + cvt.rzi.ftz.s32.f32 %r40, %f37; + sub.ftz.f32 %f38, %f22, %f35; + sub.ftz.f32 %f39, %f21, %f34; + sub.ftz.f32 %f40, %f23, %f36; + mul.ftz.f32 %f41, %f38, %f38; + fma.rn.ftz.f32 %f42, %f39, %f39, %f41; + fma.rn.ftz.f32 %f43, %f40, %f40, %f42; + add.s32 %r41, %r40, %r28; + cvt.s64.s32 %rd30, %r41; + mul.wide.s32 %rd31, %r41, 16; + add.u64 %rd32, %rd31, %rd25; + ld.global.f32 %f44, [%rd32+0]; + setp.gt.ftz.f32 %p4, %f44, %f43; + @!%p4 bra $Lt_0_24322; + rcp.approx.ftz.f32 %f45, %f43; + ld.global.f32 %f46, [%rd32+4]; + mov.f32 %f47, 0f40000000; // 2 + setp.eq.ftz.f32 %p5, %f46, %f47; + @!%p5 bra $Lt_0_23042; + .loc 16 155 0 + mul.ftz.f32 %f48, %f45, %f45; + mov.f32 %f49, %f48; + .loc 16 156 0 + mul.ftz.f32 %f50, %f48, %f48; + bra.uni $Lt_0_23298; +$Lt_0_23042: + mov.f32 %f51, 0f3f800000; // 1 + setp.eq.ftz.f32 %p6, %f46, %f51; + @!%p6 bra $Lt_0_23554; + .loc 16 158 0 + sqrt.approx.ftz.f32 %f52, %f45; + mul.ftz.f32 %f53, %f45, %f52; + mov.f32 %f50, %f53; + .loc 16 159 0 + mul.ftz.f32 %f49, %f53, %f53; + bra.uni $Lt_0_23298; +$Lt_0_23554: + .loc 16 161 0 + mul.ftz.f32 %f54, %f45, %f45; + mul.ftz.f32 %f55, %f45, %f54; + mov.f32 %f49, %f55; + .loc 16 162 0 + mov.f32 %f50, %f55; +$Lt_0_23298: +$Lt_0_22786: + .loc 16 164 0 + mul.ftz.f32 %f56, %f45, %f29; + mul.ftz.f32 %f57, %f49, %f56; + ld.global.v2.f32 {%f58,%f59}, [%rd32+8]; + mul.ftz.f32 %f60, %f58, %f50; + sub.ftz.f32 %f61, %f60, %f59; + mul.ftz.f32 %f62, %f57, %f61; + .loc 16 166 0 + fma.rn.ftz.f32 %f27, %f39, %f62, %f27; + .loc 16 167 0 + fma.rn.ftz.f32 %f26, %f38, %f62, %f26; + .loc 16 168 0 + fma.rn.ftz.f32 %f25, %f40, %f62, %f25; + ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r43, 0; + setp.le.s32 %p7, %r42, %r43; + @%p7 bra $Lt_0_23810; + .loc 16 170 0 + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd34, %rd33, %rd31; + ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd34+0]; + mul.ftz.f32 %f66, %f29, %f49; + mul.ftz.f32 %f67, %f63, %f50; + sub.ftz.f32 %f68, %f67, %f64; + mul.ftz.f32 %f69, %f66, %f68; + sub.ftz.f32 %f70, %f69, %f65; + add.ftz.f32 %f28, %f28, %f70; +$Lt_0_23810: + ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r45, 0; + setp.le.s32 %p8, %r44, %r45; + @%p8 bra $Lt_0_24322; + .loc 16 173 0 + mov.f32 %f71, %f6; + mul.ftz.f32 %f72, %f39, %f39; + fma.rn.ftz.f32 %f73, %f62, %f72, %f71; + mov.f32 %f6, %f73; + .loc 16 174 0 + mov.f32 %f74, %f8; + fma.rn.ftz.f32 %f75, %f62, %f41, %f74; + mov.f32 %f8, %f75; + .loc 16 175 0 + mov.f32 %f76, %f10; + mul.ftz.f32 %f77, %f40, %f40; + fma.rn.ftz.f32 %f78, %f62, %f77, %f76; + mov.f32 %f10, %f78; + .loc 16 176 0 + mov.f32 %f79, %f12; + mul.ftz.f32 %f80, %f38, %f39; + fma.rn.ftz.f32 %f81, %f62, %f80, %f79; + mov.f32 %f12, %f81; + .loc 16 177 0 + mov.f32 %f82, %f14; + mul.ftz.f32 %f83, %f39, %f40; + fma.rn.ftz.f32 %f84, %f62, %f83, %f82; + mov.f32 %f14, %f84; + .loc 16 178 0 + mul.ftz.f32 %f85, %f38, %f40; + fma.rn.ftz.f32 %f15, %f62, %f85, %f15; + mov.f32 %f16, %f15; +$Lt_0_24322: +$Lt_0_22274: + .loc 16 134 0 + mul.lo.u64 %rd35, %rd24, 4; + add.u64 %rd16, %rd16, %rd35; + setp.lt.u64 %p9, %rd16, %rd13; + @%p9 bra $Lt_0_22018; + bra.uni $Lt_0_20482; +$Lt_0_30722: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + bra.uni $Lt_0_20482; +$Lt_0_20738: + mov.f32 %f25, 
0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 +$Lt_0_20482: + mov.u32 %r46, 1; + setp.le.s32 %p10, %r1, %r46; + @%p10 bra $Lt_0_27138; + .loc 16 189 0 + mov.u64 %rd36, __cuda___cuda_local_var_32590_35_non_const_red_acc108; + cvt.s64.s32 %rd37, %r2; + mul.wide.s32 %rd38, %r2, 4; + add.u64 %rd39, %rd36, %rd38; + mov.f32 %f86, %f27; + st.shared.f32 [%rd39+0], %f86; + .loc 16 190 0 + mov.f32 %f87, %f26; + st.shared.f32 [%rd39+512], %f87; + .loc 16 191 0 + mov.f32 %f88, %f25; + st.shared.f32 [%rd39+1024], %f88; + .loc 16 192 0 + mov.f32 %f89, %f28; + st.shared.f32 [%rd39+1536], %f89; + .loc 16 194 0 + shr.s32 %r47, %r1, 31; + mov.s32 %r48, 1; + and.b32 %r49, %r47, %r48; + add.s32 %r50, %r49, %r1; + shr.s32 %r51, %r50, 1; + mov.s32 %r52, %r51; + mov.u32 %r53, 0; + setp.ne.u32 %p11, %r51, %r53; + @!%p11 bra $Lt_0_25602; +$Lt_0_26114: + setp.ge.u32 %p12, %r6, %r52; + @%p12 bra $Lt_0_26370; + .loc 16 197 0 + add.u32 %r54, %r2, %r52; + cvt.u64.u32 %rd40, %r54; + mul.wide.u32 %rd41, %r54, 4; + add.u64 %rd42, %rd36, %rd41; + ld.shared.f32 %f90, [%rd42+0]; + add.ftz.f32 %f86, %f90, %f86; + st.shared.f32 [%rd39+0], %f86; + ld.shared.f32 %f91, [%rd42+512]; + add.ftz.f32 %f87, %f91, %f87; + st.shared.f32 [%rd39+512], %f87; + ld.shared.f32 %f92, [%rd42+1024]; + add.ftz.f32 %f88, %f92, %f88; + st.shared.f32 [%rd39+1024], %f88; + ld.shared.f32 %f93, [%rd42+1536]; + add.ftz.f32 %f89, %f93, %f89; + st.shared.f32 [%rd39+1536], %f89; +$Lt_0_26370: + .loc 16 194 0 + shr.u32 %r52, %r52, 1; + mov.u32 %r55, 0; + setp.ne.u32 %p13, %r52, %r55; + @%p13 bra $Lt_0_26114; +$Lt_0_25602: + .loc 16 201 0 + mov.f32 %f27, %f86; + .loc 16 202 0 + mov.f32 %f26, %f87; + .loc 16 203 0 + mov.f32 %f25, %f88; + .loc 16 204 0 + mov.f32 %f28, %f89; + ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r57, 0; + setp.le.s32 %p14, %r56, %r57; + @%p14 bra $Lt_0_27138; + .loc 16 208 0 + mov.f32 %f86, %f6; + st.shared.f32 [%rd39+0], %f86; + mov.f32 %f87, %f8; + st.shared.f32 [%rd39+512], %f87; + mov.f32 %f88, %f10; + st.shared.f32 [%rd39+1024], %f88; + mov.f32 %f89, %f12; + st.shared.f32 [%rd39+1536], %f89; + mov.f32 %f94, %f14; + st.shared.f32 [%rd39+2048], %f94; + mov.f32 %f95, %f16; + st.shared.f32 [%rd39+2560], %f95; + .loc 16 210 0 + mov.s32 %r58, %r51; + @!%p11 bra $Lt_0_27650; +$Lt_0_28162: + setp.ge.u32 %p15, %r6, %r58; + @%p15 bra $Lt_0_28418; + .loc 16 213 0 + add.u32 %r59, %r2, %r58; + cvt.u64.u32 %rd43, %r59; + mul.wide.u32 %rd44, %r59, 4; + add.u64 %rd45, %rd36, %rd44; + ld.shared.f32 %f96, [%rd45+0]; + add.ftz.f32 %f86, %f96, %f86; + st.shared.f32 [%rd39+0], %f86; + ld.shared.f32 %f97, [%rd45+512]; + add.ftz.f32 %f87, %f97, %f87; + st.shared.f32 [%rd39+512], %f87; + ld.shared.f32 %f98, [%rd45+1024]; + add.ftz.f32 %f88, %f98, %f88; + st.shared.f32 [%rd39+1024], %f88; + ld.shared.f32 %f99, [%rd45+1536]; + add.ftz.f32 %f89, %f99, %f89; + st.shared.f32 [%rd39+1536], %f89; + ld.shared.f32 %f100, [%rd45+2048]; + add.ftz.f32 %f94, %f100, %f94; + st.shared.f32 [%rd39+2048], %f94; + ld.shared.f32 %f101, [%rd45+2560]; + add.ftz.f32 %f95, %f101, %f95; + st.shared.f32 [%rd39+2560], %f95; +$Lt_0_28418: + .loc 16 210 0 + shr.u32 %r58, %r58, 1; + mov.u32 %r60, 0; + setp.ne.u32 %p16, %r58, %r60; + @%p16 bra $Lt_0_28162; +$Lt_0_27650: + .loc 16 218 0 + mov.f32 %f6, %f86; + mov.f32 %f8, %f87; + mov.f32 %f10, %f88; + mov.f32 %f12, %f89; + mov.f32 %f14, %f94; + mov.f32 %f16, %f95; +$Lt_0_27138: +$Lt_0_25090: + selp.s32 %r61, 1, 0, %p1; + mov.s32 %r62, 0; + 
set.eq.u32.s32 %r63, %r6, %r62; + neg.s32 %r64, %r63; + and.b32 %r65, %r61, %r64; + mov.u32 %r66, 0; + setp.eq.s32 %p17, %r65, %r66; + @%p17 bra $Lt_0_29186; + .loc 16 224 0 + cvt.s64.s32 %rd46, %r9; + ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd48, %r9, 4; + add.u64 %rd49, %rd47, %rd48; + ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r68, 0; + setp.le.s32 %p18, %r67, %r68; + @%p18 bra $Lt_0_29698; + .loc 16 226 0 + st.global.f32 [%rd49+0], %f28; + .loc 16 227 0 + cvt.s64.s32 %rd50, %r10; + mul.wide.s32 %rd51, %r10, 4; + add.u64 %rd49, %rd49, %rd51; +$Lt_0_29698: + ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r70, 0; + setp.le.s32 %p19, %r69, %r70; + @%p19 bra $Lt_0_30210; + .loc 16 231 0 + mov.f32 %f102, %f6; + st.global.f32 [%rd49+0], %f102; + .loc 16 232 0 + cvt.s64.s32 %rd52, %r10; + mul.wide.s32 %rd53, %r10, 4; + add.u64 %rd54, %rd53, %rd49; + .loc 16 231 0 + mov.f32 %f103, %f8; + st.global.f32 [%rd54+0], %f103; + .loc 16 232 0 + add.u64 %rd55, %rd53, %rd54; + .loc 16 231 0 + mov.f32 %f104, %f10; + st.global.f32 [%rd55+0], %f104; + .loc 16 232 0 + add.u64 %rd56, %rd53, %rd55; + .loc 16 231 0 + mov.f32 %f105, %f12; + st.global.f32 [%rd56+0], %f105; + .loc 16 232 0 + add.u64 %rd49, %rd53, %rd56; + .loc 16 231 0 + mov.f32 %f106, %f14; + st.global.f32 [%rd49+0], %f106; + mov.f32 %f107, %f16; + add.u64 %rd57, %rd53, %rd49; + st.global.f32 [%rd57+0], %f107; +$Lt_0_30210: + .loc 16 235 0 + ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd59, %rd46, 16; + add.u64 %rd60, %rd58, %rd59; + mov.f32 %f108, %f109; + st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f108}; +$Lt_0_29186: + .loc 16 237 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<74>; + .reg .u64 %rd<74>; + .reg .f32 %f<118>; + .reg .pred %p<24>; + .shared .align 4 .b8 __cuda___cuda_local_var_32656_33_non_const_sp_lj3268[16]; + .shared .align 16 .b8 __cuda___cuda_local_var_32654_34_non_const_lj13296[1936]; + .shared .align 16 .b8 __cuda___cuda_local_var_32655_34_non_const_lj35232[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32753_35_non_const_red_acc7168[3072]; + // __cuda_local_var_32666_10_non_const_f = 48 + // __cuda_local_var_32670_9_non_const_virial = 16 + .loc 16 245 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 3; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_22786; + .loc 16 255 0 + mov.u64 %rd1, __cuda___cuda_local_var_32656_33_non_const_sp_lj3268; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_22786: + mov.u64 %rd1, __cuda___cuda_local_var_32656_33_non_const_sp_lj3268; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra 
$Lt_1_23298; + .loc 16 257 0 + mov.u64 %rd7, __cuda___cuda_local_var_32654_34_non_const_lj13296; + cvt.s64.s32 %rd8, %r1; + mul.wide.s32 %rd9, %r1, 16; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd11, %rd10, %rd9; + add.u64 %rd12, %rd9, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; + st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r5, 0; + setp.le.s32 %p3, %r4, %r5; + @%p3 bra $Lt_1_23810; + .loc 16 259 0 + mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232; + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd15, %rd14, %rd9; + add.u64 %rd16, %rd9, %rd13; + ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; + st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; +$Lt_1_23810: + mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232; +$Lt_1_23298: + mov.u64 %rd7, __cuda___cuda_local_var_32654_34_non_const_lj13296; + mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232; + .loc 16 269 0 + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + mov.f32 %f20, 0f00000000; // 0 + mov.f32 %f21, %f20; + .loc 16 271 0 + bar.sync 0; + ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r7, %r1, %r6; + cvt.s32.u32 %r8, %ntid.x; + div.s32 %r9, %r8, %r6; + rem.s32 %r10, %r1, %r6; + cvt.s32.u32 %r11, %ctaid.x; + mul.lo.s32 %r12, %r11, %r9; + add.s32 %r13, %r7, %r12; + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p4, %r13, %r14; + @!%p4 bra $Lt_1_24578; + .loc 16 277 0 + ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd17, %r15; + mul.wide.s32 %rd18, %r15, 4; + cvt.s64.s32 %rd19, %r13; + mul.wide.s32 %rd20, %r13, 4; + ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd22, %rd20, %rd21; + add.u64 %rd23, %rd18, %rd22; + ld.global.s32 %r16, [%rd23+0]; + add.u64 %rd24, %rd18, %rd23; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p5, %rd25, %rd21; + @%p5 bra $Lt_1_25090; + .loc 16 283 0 + cvt.s32.s64 %r17, %rd17; + mul.lo.s32 %r18, %r17, %r16; + cvt.s64.s32 %rd26, %r18; + mul.wide.s32 %rd27, %r18, 4; + add.u64 %rd28, %rd24, %rd27; + .loc 16 284 0 + mul.lo.s32 %r19, %r10, %r17; + cvt.s64.s32 %rd29, %r19; + mul.wide.s32 %rd30, %r19, 4; + add.u64 %rd31, %rd24, %rd30; + .loc 16 285 0 + mul.lo.s32 %r20, %r17, %r6; + bra.uni $Lt_1_24834; +$Lt_1_25090: + .loc 16 287 0 + ld.global.s32 %r21, [%rd24+0]; + cvt.s64.s32 %rd32, %r21; + mul.wide.s32 %rd33, %r21, 4; + add.u64 %rd34, %rd25, %rd33; + .loc 16 288 0 + cvt.s64.s32 %rd35, %r16; + mul.wide.s32 %rd36, %r16, 4; + add.u64 %rd28, %rd34, %rd36; + .loc 16 289 0 + mov.s32 %r20, %r6; + .loc 16 290 0 + cvt.s64.s32 %rd37, %r10; + mul.wide.s32 %rd38, %r10, 4; + add.u64 %rd31, %rd34, %rd38; +$Lt_1_24834: + .loc 16 293 0 + ld.global.s32 %r22, [%rd22+0]; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + mov.s32 %r28, 0; + mov.u32 %r29, %r28; + tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}]; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + mov.f32 %f29, %f25; + setp.ge.u64 %p6, %rd31, %rd28; + @%p6 bra $Lt_1_34562; + cvt.rzi.ftz.s32.f32 %r30, %f29; + cvt.s64.s32 %rd39, %r20; + mul.lo.s32 %r31, 
%r30, 11; + cvt.rn.f32.s32 %f30, %r31; + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 +$Lt_1_25858: + //<loop> Loop body line 293, nesting depth: 1, estimated iterations: unknown + .loc 16 300 0 + ld.global.s32 %r32, [%rd31+0]; + .loc 16 301 0 + shr.s32 %r33, %r32, 30; + and.b32 %r34, %r33, 3; + cvt.s64.s32 %rd40, %r34; + mul.wide.s32 %rd41, %r34, 4; + add.u64 %rd42, %rd1, %rd41; + ld.shared.f32 %f35, [%rd42+0]; + .loc 16 304 0 + and.b32 %r35, %r32, 1073741823; + mov.u32 %r36, %r35; + mov.s32 %r37, 0; + mov.u32 %r38, %r37; + mov.s32 %r39, 0; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}]; + mov.f32 %f40, %f36; + mov.f32 %f41, %f37; + mov.f32 %f42, %f38; + mov.f32 %f43, %f39; + sub.ftz.f32 %f44, %f27, %f41; + sub.ftz.f32 %f45, %f26, %f40; + sub.ftz.f32 %f46, %f28, %f42; + mul.ftz.f32 %f47, %f44, %f44; + fma.rn.ftz.f32 %f48, %f45, %f45, %f47; + fma.rn.ftz.f32 %f49, %f46, %f46, %f48; + add.ftz.f32 %f50, %f30, %f43; + cvt.rzi.ftz.s32.f32 %r43, %f50; + cvt.s64.s32 %rd43, %r43; + mul.wide.s32 %rd44, %r43, 16; + add.u64 %rd45, %rd44, %rd7; + ld.shared.f32 %f51, [%rd45+0]; + setp.gt.ftz.f32 %p7, %f51, %f49; + @!%p7 bra $Lt_1_28162; + rcp.approx.ftz.f32 %f52, %f49; + ld.shared.f32 %f53, [%rd45+4]; + mov.f32 %f54, 0f40000000; // 2 + setp.eq.ftz.f32 %p8, %f53, %f54; + @!%p8 bra $Lt_1_26882; + .loc 16 318 0 + mul.ftz.f32 %f55, %f52, %f52; + mov.f32 %f56, %f55; + .loc 16 319 0 + mul.ftz.f32 %f57, %f55, %f55; + bra.uni $Lt_1_27138; +$Lt_1_26882: + mov.f32 %f58, 0f3f800000; // 1 + setp.eq.ftz.f32 %p9, %f53, %f58; + @!%p9 bra $Lt_1_27394; + .loc 16 321 0 + sqrt.approx.ftz.f32 %f59, %f52; + mul.ftz.f32 %f60, %f52, %f59; + mov.f32 %f57, %f60; + .loc 16 322 0 + mul.ftz.f32 %f56, %f60, %f60; + bra.uni $Lt_1_27138; +$Lt_1_27394: + .loc 16 324 0 + mul.ftz.f32 %f61, %f52, %f52; + mul.ftz.f32 %f62, %f52, %f61; + mov.f32 %f56, %f62; + .loc 16 325 0 + mov.f32 %f57, %f62; +$Lt_1_27138: +$Lt_1_26626: + .loc 16 327 0 + mul.ftz.f32 %f63, %f52, %f35; + mul.ftz.f32 %f64, %f56, %f63; + ld.shared.v2.f32 {%f65,%f66}, [%rd45+8]; + mul.ftz.f32 %f67, %f65, %f57; + sub.ftz.f32 %f68, %f67, %f66; + mul.ftz.f32 %f69, %f64, %f68; + .loc 16 329 0 + fma.rn.ftz.f32 %f33, %f45, %f69, %f33; + .loc 16 330 0 + fma.rn.ftz.f32 %f32, %f44, %f69, %f32; + .loc 16 331 0 + fma.rn.ftz.f32 %f31, %f46, %f69, %f31; + ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r45, 0; + setp.le.s32 %p10, %r44, %r45; + @%p10 bra $Lt_1_27650; + .loc 16 333 0 + add.u64 %rd46, %rd44, %rd13; + ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd46+0]; + mul.ftz.f32 %f73, %f35, %f56; + mul.ftz.f32 %f74, %f70, %f57; + sub.ftz.f32 %f75, %f74, %f71; + mul.ftz.f32 %f76, %f73, %f75; + sub.ftz.f32 %f77, %f76, %f72; + add.ftz.f32 %f34, %f34, %f77; +$Lt_1_27650: + ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r47, 0; + setp.le.s32 %p11, %r46, %r47; + @%p11 bra $Lt_1_28162; + .loc 16 336 0 + mov.f32 %f78, %f11; + mul.ftz.f32 %f79, %f45, %f45; + fma.rn.ftz.f32 %f80, %f69, %f79, %f78; + mov.f32 %f11, %f80; + .loc 16 337 0 + mov.f32 %f81, %f13; + fma.rn.ftz.f32 %f82, %f69, %f47, %f81; + mov.f32 %f13, %f82; + .loc 16 338 0 + mov.f32 %f83, %f15; + mul.ftz.f32 %f84, %f46, %f46; + fma.rn.ftz.f32 %f85, %f69, %f84, %f83; + mov.f32 %f15, %f85; + .loc 16 339 0 + mov.f32 %f86, %f17; + mul.ftz.f32 %f87, %f44, %f45; + fma.rn.ftz.f32 %f88, %f69, %f87, %f86; + mov.f32 %f17, %f88; + .loc 16 
340 0 + mov.f32 %f89, %f19; + mul.ftz.f32 %f90, %f45, %f46; + fma.rn.ftz.f32 %f91, %f69, %f90, %f89; + mov.f32 %f19, %f91; + .loc 16 341 0 + mul.ftz.f32 %f92, %f44, %f46; + fma.rn.ftz.f32 %f20, %f69, %f92, %f20; + mov.f32 %f21, %f20; +$Lt_1_28162: +$Lt_1_26114: + .loc 16 298 0 + mul.lo.u64 %rd47, %rd39, 4; + add.u64 %rd31, %rd31, %rd47; + setp.lt.u64 %p12, %rd31, %rd28; + @%p12 bra $Lt_1_25858; + bra.uni $Lt_1_24322; +$Lt_1_34562: + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 + bra.uni $Lt_1_24322; +$Lt_1_24578: + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 +$Lt_1_24322: + mov.u32 %r48, 1; + setp.le.s32 %p13, %r6, %r48; + @%p13 bra $Lt_1_30978; + .loc 16 352 0 + mov.u64 %rd48, __cuda___cuda_local_var_32753_35_non_const_red_acc7168; + cvt.s64.s32 %rd49, %r1; + mul.wide.s32 %rd50, %r1, 4; + add.u64 %rd51, %rd48, %rd50; + mov.f32 %f93, %f33; + st.shared.f32 [%rd51+0], %f93; + .loc 16 353 0 + mov.f32 %f94, %f32; + st.shared.f32 [%rd51+512], %f94; + .loc 16 354 0 + mov.f32 %f95, %f31; + st.shared.f32 [%rd51+1024], %f95; + .loc 16 355 0 + mov.f32 %f96, %f34; + st.shared.f32 [%rd51+1536], %f96; + .loc 16 357 0 + shr.s32 %r49, %r6, 31; + mov.s32 %r50, 1; + and.b32 %r51, %r49, %r50; + add.s32 %r52, %r51, %r6; + shr.s32 %r53, %r52, 1; + mov.s32 %r54, %r53; + mov.u32 %r55, 0; + setp.ne.u32 %p14, %r53, %r55; + @!%p14 bra $Lt_1_29442; +$Lt_1_29954: + setp.ge.u32 %p15, %r10, %r54; + @%p15 bra $Lt_1_30210; + .loc 16 360 0 + add.u32 %r56, %r1, %r54; + cvt.u64.u32 %rd52, %r56; + mul.wide.u32 %rd53, %r56, 4; + add.u64 %rd54, %rd48, %rd53; + ld.shared.f32 %f97, [%rd54+0]; + add.ftz.f32 %f93, %f97, %f93; + st.shared.f32 [%rd51+0], %f93; + ld.shared.f32 %f98, [%rd54+512]; + add.ftz.f32 %f94, %f98, %f94; + st.shared.f32 [%rd51+512], %f94; + ld.shared.f32 %f99, [%rd54+1024]; + add.ftz.f32 %f95, %f99, %f95; + st.shared.f32 [%rd51+1024], %f95; + ld.shared.f32 %f100, [%rd54+1536]; + add.ftz.f32 %f96, %f100, %f96; + st.shared.f32 [%rd51+1536], %f96; +$Lt_1_30210: + .loc 16 357 0 + shr.u32 %r54, %r54, 1; + mov.u32 %r57, 0; + setp.ne.u32 %p16, %r54, %r57; + @%p16 bra $Lt_1_29954; +$Lt_1_29442: + .loc 16 364 0 + mov.f32 %f33, %f93; + .loc 16 365 0 + mov.f32 %f32, %f94; + .loc 16 366 0 + mov.f32 %f31, %f95; + .loc 16 367 0 + mov.f32 %f34, %f96; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p17, %r58, %r59; + @%p17 bra $Lt_1_30978; + .loc 16 371 0 + mov.f32 %f93, %f11; + st.shared.f32 [%rd51+0], %f93; + mov.f32 %f94, %f13; + st.shared.f32 [%rd51+512], %f94; + mov.f32 %f95, %f15; + st.shared.f32 [%rd51+1024], %f95; + mov.f32 %f96, %f17; + st.shared.f32 [%rd51+1536], %f96; + mov.f32 %f101, %f19; + st.shared.f32 [%rd51+2048], %f101; + mov.f32 %f102, %f21; + st.shared.f32 [%rd51+2560], %f102; + .loc 16 373 0 + mov.s32 %r60, %r53; + @!%p14 bra $Lt_1_31490; +$Lt_1_32002: + setp.ge.u32 %p18, %r10, %r60; + @%p18 bra $Lt_1_32258; + .loc 16 376 0 + add.u32 %r61, %r1, %r60; + cvt.u64.u32 %rd55, %r61; + mul.wide.u32 %rd56, %r61, 4; + add.u64 %rd57, %rd48, %rd56; + ld.shared.f32 %f103, [%rd57+0]; + add.ftz.f32 %f93, %f103, %f93; + st.shared.f32 [%rd51+0], %f93; + ld.shared.f32 %f104, [%rd57+512]; + add.ftz.f32 %f94, %f104, %f94; + st.shared.f32 [%rd51+512], %f94; + ld.shared.f32 %f105, [%rd57+1024]; + add.ftz.f32 %f95, %f105, %f95; + st.shared.f32 [%rd51+1024], %f95; + ld.shared.f32 %f106, [%rd57+1536]; + add.ftz.f32 
%f96, %f106, %f96; + st.shared.f32 [%rd51+1536], %f96; + ld.shared.f32 %f107, [%rd57+2048]; + add.ftz.f32 %f101, %f107, %f101; + st.shared.f32 [%rd51+2048], %f101; + ld.shared.f32 %f108, [%rd57+2560]; + add.ftz.f32 %f102, %f108, %f102; + st.shared.f32 [%rd51+2560], %f102; +$Lt_1_32258: + .loc 16 373 0 + shr.u32 %r60, %r60, 1; + mov.u32 %r62, 0; + setp.ne.u32 %p19, %r60, %r62; + @%p19 bra $Lt_1_32002; +$Lt_1_31490: + .loc 16 381 0 + mov.f32 %f11, %f93; + mov.f32 %f13, %f94; + mov.f32 %f15, %f95; + mov.f32 %f17, %f96; + mov.f32 %f19, %f101; + mov.f32 %f21, %f102; +$Lt_1_30978: +$Lt_1_28930: + selp.s32 %r63, 1, 0, %p4; + mov.s32 %r64, 0; + set.eq.u32.s32 %r65, %r10, %r64; + neg.s32 %r66, %r65; + and.b32 %r67, %r63, %r66; + mov.u32 %r68, 0; + setp.eq.s32 %p20, %r67, %r68; + @%p20 bra $Lt_1_33026; + .loc 16 387 0 + cvt.s64.s32 %rd58, %r13; + ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd60, %r13, 4; + add.u64 %rd61, %rd59, %rd60; + ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r70, 0; + setp.le.s32 %p21, %r69, %r70; + @%p21 bra $Lt_1_33538; + .loc 16 389 0 + st.global.f32 [%rd61+0], %f34; + .loc 16 390 0 + cvt.s64.s32 %rd62, %r14; + mul.wide.s32 %rd63, %r14, 4; + add.u64 %rd61, %rd61, %rd63; +$Lt_1_33538: + ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r72, 0; + setp.le.s32 %p22, %r71, %r72; + @%p22 bra $Lt_1_34050; + .loc 16 394 0 + mov.f32 %f109, %f11; + st.global.f32 [%rd61+0], %f109; + .loc 16 395 0 + cvt.s64.s32 %rd64, %r14; + mul.wide.s32 %rd65, %r14, 4; + add.u64 %rd66, %rd65, %rd61; + .loc 16 394 0 + mov.f32 %f110, %f13; + st.global.f32 [%rd66+0], %f110; + .loc 16 395 0 + add.u64 %rd67, %rd65, %rd66; + .loc 16 394 0 + mov.f32 %f111, %f15; + st.global.f32 [%rd67+0], %f111; + .loc 16 395 0 + add.u64 %rd68, %rd65, %rd67; + .loc 16 394 0 + mov.f32 %f112, %f17; + st.global.f32 [%rd68+0], %f112; + .loc 16 395 0 + add.u64 %rd61, %rd65, %rd68; + .loc 16 394 0 + mov.f32 %f113, %f19; + st.global.f32 [%rd61+0], %f113; + mov.f32 %f114, %f21; + add.u64 %rd69, %rd65, %rd61; + st.global.f32 [%rd69+0], %f114; +$Lt_1_34050: + .loc 16 398 0 + ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd71, %rd58, 16; + add.u64 %rd72, %rd70, %rd71; + mov.f32 %f115, %f116; + st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f115}; +$Lt_1_33026: + .loc 16 400 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/cmm_cut_gpu_ptx.h b/lib/gpu/cmm_cut_gpu_ptx.h new file mode 100644 index 000000000..cb4ac63c4 --- /dev/null +++ b/lib/gpu/cmm_cut_gpu_ptx.h @@ -0,0 +1,984 @@ +const char * cmm_cut_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<72>;\n" +" .reg .u64 %rd<62>;\n" +" .reg .f32 %f<111>;\n" +" .reg .pred %p<21>;\n" +" 
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32590_35_non_const_red_acc108[3072];\n" +" .loc 16 88 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 95 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 96 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 97 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 98 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 107 0\n" +" mov.f32 %f5, 0f00000000; \n" +" mov.f32 %f6, %f5;\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, %f7;\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_20738;\n" +" .loc 16 114 0\n" +" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd2, %r11;\n" +" mul.wide.s32 %rd3, %r11, 4;\n" +" cvt.s64.s32 %rd4, %r9;\n" +" mul.wide.s32 %rd5, %r9, 4;\n" +" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd7, %rd5, %rd6;\n" +" add.u64 %rd8, %rd3, %rd7;\n" +" ld.global.s32 %r12, [%rd8+0];\n" +" add.u64 %rd9, %rd3, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd6;\n" +" @%p2 bra $Lt_0_21250;\n" +" .loc 16 120 0\n" +" cvt.s32.s64 %r13, %rd2;\n" +" mul.lo.s32 %r14, %r13, %r12;\n" +" cvt.s64.s32 %rd11, %r14;\n" +" mul.wide.s32 %rd12, %r14, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 121 0\n" +" mul.lo.s32 %r15, %r6, %r13;\n" +" cvt.s64.s32 %rd14, %r15;\n" +" mul.wide.s32 %rd15, %r15, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 122 0\n" +" mul.lo.s32 %r16, %r13, %r1;\n" +" bra.uni $Lt_0_20994;\n" +"$Lt_0_21250:\n" +" .loc 16 124 0\n" +" ld.global.s32 %r17, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r17;\n" +" mul.wide.s32 %rd18, %r17, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 125 0\n" +" cvt.s64.s32 %rd20, %r12;\n" +" mul.wide.s32 %rd21, %r12, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 126 0\n" +" mov.s32 %r16, %r1;\n" +" .loc 16 127 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_20994:\n" +" .loc 16 130 0\n" +" ld.global.s32 %r18, [%rd7+0];\n" +" mov.u32 %r19, %r18;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f21, %f17;\n" +" mov.f32 %f22, %f18;\n" +" mov.f32 %f23, %f19;\n" +" mov.f32 %f24, %f20;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_30722;\n" +" cvt.rzi.ftz.s32.f32 %r26, %f24;\n" +" cvt.s64.s32 %rd24, %r16;\n" +" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r28, %r27, %r26;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 
0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n" +"$Lt_0_22018:\n" +" .loc 16 136 0\n" +" ld.global.s32 %r29, [%rd16+0];\n" +" .loc 16 137 0\n" +" shr.s32 %r30, %r29, 30;\n" +" and.b32 %r31, %r30, 3;\n" +" cvt.s64.s32 %rd27, %r31;\n" +" mul.wide.s32 %rd28, %r31, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f29, [%rd29+0];\n" +" .loc 16 140 0\n" +" and.b32 %r32, %r29, 1073741823;\n" +" mov.u32 %r33, %r32;\n" +" mov.s32 %r34, 0;\n" +" mov.u32 %r35, %r34;\n" +" mov.s32 %r36, 0;\n" +" mov.u32 %r37, %r36;\n" +" mov.s32 %r38, 0;\n" +" mov.u32 %r39, %r38;\n" +" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n" +" mov.f32 %f34, %f30;\n" +" mov.f32 %f35, %f31;\n" +" mov.f32 %f36, %f32;\n" +" mov.f32 %f37, %f33;\n" +" cvt.rzi.ftz.s32.f32 %r40, %f37;\n" +" sub.ftz.f32 %f38, %f22, %f35;\n" +" sub.ftz.f32 %f39, %f21, %f34;\n" +" sub.ftz.f32 %f40, %f23, %f36;\n" +" mul.ftz.f32 %f41, %f38, %f38;\n" +" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" +" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" +" add.s32 %r41, %r40, %r28;\n" +" cvt.s64.s32 %rd30, %r41;\n" +" mul.wide.s32 %rd31, %r41, 16;\n" +" add.u64 %rd32, %rd31, %rd25;\n" +" ld.global.f32 %f44, [%rd32+0];\n" +" setp.gt.ftz.f32 %p4, %f44, %f43;\n" +" @!%p4 bra $Lt_0_24322;\n" +" rcp.approx.ftz.f32 %f45, %f43;\n" +" ld.global.f32 %f46, [%rd32+4];\n" +" mov.f32 %f47, 0f40000000; \n" +" setp.eq.ftz.f32 %p5, %f46, %f47;\n" +" @!%p5 bra $Lt_0_23042;\n" +" .loc 16 155 0\n" +" mul.ftz.f32 %f48, %f45, %f45;\n" +" mov.f32 %f49, %f48;\n" +" .loc 16 156 0\n" +" mul.ftz.f32 %f50, %f48, %f48;\n" +" bra.uni $Lt_0_23298;\n" +"$Lt_0_23042:\n" +" mov.f32 %f51, 0f3f800000; \n" +" setp.eq.ftz.f32 %p6, %f46, %f51;\n" +" @!%p6 bra $Lt_0_23554;\n" +" .loc 16 158 0\n" +" sqrt.approx.ftz.f32 %f52, %f45;\n" +" mul.ftz.f32 %f53, %f45, %f52;\n" +" mov.f32 %f50, %f53;\n" +" .loc 16 159 0\n" +" mul.ftz.f32 %f49, %f53, %f53;\n" +" bra.uni $Lt_0_23298;\n" +"$Lt_0_23554:\n" +" .loc 16 161 0\n" +" mul.ftz.f32 %f54, %f45, %f45;\n" +" mul.ftz.f32 %f55, %f45, %f54;\n" +" mov.f32 %f49, %f55;\n" +" .loc 16 162 0\n" +" mov.f32 %f50, %f55;\n" +"$Lt_0_23298:\n" +"$Lt_0_22786:\n" +" .loc 16 164 0\n" +" mul.ftz.f32 %f56, %f45, %f29;\n" +" mul.ftz.f32 %f57, %f49, %f56;\n" +" ld.global.v2.f32 {%f58,%f59}, [%rd32+8];\n" +" mul.ftz.f32 %f60, %f58, %f50;\n" +" sub.ftz.f32 %f61, %f60, %f59;\n" +" mul.ftz.f32 %f62, %f57, %f61;\n" +" .loc 16 166 0\n" +" fma.rn.ftz.f32 %f27, %f39, %f62, %f27;\n" +" .loc 16 167 0\n" +" fma.rn.ftz.f32 %f26, %f38, %f62, %f26;\n" +" .loc 16 168 0\n" +" fma.rn.ftz.f32 %f25, %f40, %f62, %f25;\n" +" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r43, 0;\n" +" setp.le.s32 %p7, %r42, %r43;\n" +" @%p7 bra $Lt_0_23810;\n" +" .loc 16 170 0\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd34, %rd33, %rd31;\n" +" ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd34+0];\n" +" mul.ftz.f32 %f66, %f29, %f49;\n" +" mul.ftz.f32 %f67, %f63, %f50;\n" +" sub.ftz.f32 %f68, %f67, %f64;\n" +" mul.ftz.f32 %f69, %f66, %f68;\n" +" sub.ftz.f32 %f70, %f69, %f65;\n" +" add.ftz.f32 %f28, %f28, %f70;\n" +"$Lt_0_23810:\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p8, %r44, %r45;\n" +" @%p8 bra $Lt_0_24322;\n" +" .loc 16 173 0\n" +" mov.f32 %f71, %f6;\n" +" mul.ftz.f32 %f72, %f39, %f39;\n" +" fma.rn.ftz.f32 %f73, %f62, %f72, %f71;\n" +" mov.f32 %f6, %f73;\n" +" .loc 16 174 0\n" +" 
mov.f32 %f74, %f8;\n" +" fma.rn.ftz.f32 %f75, %f62, %f41, %f74;\n" +" mov.f32 %f8, %f75;\n" +" .loc 16 175 0\n" +" mov.f32 %f76, %f10;\n" +" mul.ftz.f32 %f77, %f40, %f40;\n" +" fma.rn.ftz.f32 %f78, %f62, %f77, %f76;\n" +" mov.f32 %f10, %f78;\n" +" .loc 16 176 0\n" +" mov.f32 %f79, %f12;\n" +" mul.ftz.f32 %f80, %f38, %f39;\n" +" fma.rn.ftz.f32 %f81, %f62, %f80, %f79;\n" +" mov.f32 %f12, %f81;\n" +" .loc 16 177 0\n" +" mov.f32 %f82, %f14;\n" +" mul.ftz.f32 %f83, %f39, %f40;\n" +" fma.rn.ftz.f32 %f84, %f62, %f83, %f82;\n" +" mov.f32 %f14, %f84;\n" +" .loc 16 178 0\n" +" mul.ftz.f32 %f85, %f38, %f40;\n" +" fma.rn.ftz.f32 %f15, %f62, %f85, %f15;\n" +" mov.f32 %f16, %f15;\n" +"$Lt_0_24322:\n" +"$Lt_0_22274:\n" +" .loc 16 134 0\n" +" mul.lo.u64 %rd35, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd35;\n" +" setp.lt.u64 %p9, %rd16, %rd13;\n" +" @%p9 bra $Lt_0_22018;\n" +" bra.uni $Lt_0_20482;\n" +"$Lt_0_30722:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" bra.uni $Lt_0_20482;\n" +"$Lt_0_20738:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +"$Lt_0_20482:\n" +" mov.u32 %r46, 1;\n" +" setp.le.s32 %p10, %r1, %r46;\n" +" @%p10 bra $Lt_0_27138;\n" +" .loc 16 189 0\n" +" mov.u64 %rd36, __cuda___cuda_local_var_32590_35_non_const_red_acc108;\n" +" cvt.s64.s32 %rd37, %r2;\n" +" mul.wide.s32 %rd38, %r2, 4;\n" +" add.u64 %rd39, %rd36, %rd38;\n" +" mov.f32 %f86, %f27;\n" +" st.shared.f32 [%rd39+0], %f86;\n" +" .loc 16 190 0\n" +" mov.f32 %f87, %f26;\n" +" st.shared.f32 [%rd39+512], %f87;\n" +" .loc 16 191 0\n" +" mov.f32 %f88, %f25;\n" +" st.shared.f32 [%rd39+1024], %f88;\n" +" .loc 16 192 0\n" +" mov.f32 %f89, %f28;\n" +" st.shared.f32 [%rd39+1536], %f89;\n" +" .loc 16 194 0\n" +" shr.s32 %r47, %r1, 31;\n" +" mov.s32 %r48, 1;\n" +" and.b32 %r49, %r47, %r48;\n" +" add.s32 %r50, %r49, %r1;\n" +" shr.s32 %r51, %r50, 1;\n" +" mov.s32 %r52, %r51;\n" +" mov.u32 %r53, 0;\n" +" setp.ne.u32 %p11, %r51, %r53;\n" +" @!%p11 bra $Lt_0_25602;\n" +"$Lt_0_26114:\n" +" setp.ge.u32 %p12, %r6, %r52;\n" +" @%p12 bra $Lt_0_26370;\n" +" .loc 16 197 0\n" +" add.u32 %r54, %r2, %r52;\n" +" cvt.u64.u32 %rd40, %r54;\n" +" mul.wide.u32 %rd41, %r54, 4;\n" +" add.u64 %rd42, %rd36, %rd41;\n" +" ld.shared.f32 %f90, [%rd42+0];\n" +" add.ftz.f32 %f86, %f90, %f86;\n" +" st.shared.f32 [%rd39+0], %f86;\n" +" ld.shared.f32 %f91, [%rd42+512];\n" +" add.ftz.f32 %f87, %f91, %f87;\n" +" st.shared.f32 [%rd39+512], %f87;\n" +" ld.shared.f32 %f92, [%rd42+1024];\n" +" add.ftz.f32 %f88, %f92, %f88;\n" +" st.shared.f32 [%rd39+1024], %f88;\n" +" ld.shared.f32 %f93, [%rd42+1536];\n" +" add.ftz.f32 %f89, %f93, %f89;\n" +" st.shared.f32 [%rd39+1536], %f89;\n" +"$Lt_0_26370:\n" +" .loc 16 194 0\n" +" shr.u32 %r52, %r52, 1;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p13, %r52, %r55;\n" +" @%p13 bra $Lt_0_26114;\n" +"$Lt_0_25602:\n" +" .loc 16 201 0\n" +" mov.f32 %f27, %f86;\n" +" .loc 16 202 0\n" +" mov.f32 %f26, %f87;\n" +" .loc 16 203 0\n" +" mov.f32 %f25, %f88;\n" +" .loc 16 204 0\n" +" mov.f32 %f28, %f89;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p14, %r56, %r57;\n" +" @%p14 bra $Lt_0_27138;\n" +" .loc 16 208 0\n" +" mov.f32 %f86, %f6;\n" +" st.shared.f32 [%rd39+0], %f86;\n" +" mov.f32 %f87, %f8;\n" +" st.shared.f32 [%rd39+512], %f87;\n" +" mov.f32 %f88, %f10;\n" +" st.shared.f32 [%rd39+1024], %f88;\n" +" mov.f32 %f89, %f12;\n" +" 
st.shared.f32 [%rd39+1536], %f89;\n" +" mov.f32 %f94, %f14;\n" +" st.shared.f32 [%rd39+2048], %f94;\n" +" mov.f32 %f95, %f16;\n" +" st.shared.f32 [%rd39+2560], %f95;\n" +" .loc 16 210 0\n" +" mov.s32 %r58, %r51;\n" +" @!%p11 bra $Lt_0_27650;\n" +"$Lt_0_28162:\n" +" setp.ge.u32 %p15, %r6, %r58;\n" +" @%p15 bra $Lt_0_28418;\n" +" .loc 16 213 0\n" +" add.u32 %r59, %r2, %r58;\n" +" cvt.u64.u32 %rd43, %r59;\n" +" mul.wide.u32 %rd44, %r59, 4;\n" +" add.u64 %rd45, %rd36, %rd44;\n" +" ld.shared.f32 %f96, [%rd45+0];\n" +" add.ftz.f32 %f86, %f96, %f86;\n" +" st.shared.f32 [%rd39+0], %f86;\n" +" ld.shared.f32 %f97, [%rd45+512];\n" +" add.ftz.f32 %f87, %f97, %f87;\n" +" st.shared.f32 [%rd39+512], %f87;\n" +" ld.shared.f32 %f98, [%rd45+1024];\n" +" add.ftz.f32 %f88, %f98, %f88;\n" +" st.shared.f32 [%rd39+1024], %f88;\n" +" ld.shared.f32 %f99, [%rd45+1536];\n" +" add.ftz.f32 %f89, %f99, %f89;\n" +" st.shared.f32 [%rd39+1536], %f89;\n" +" ld.shared.f32 %f100, [%rd45+2048];\n" +" add.ftz.f32 %f94, %f100, %f94;\n" +" st.shared.f32 [%rd39+2048], %f94;\n" +" ld.shared.f32 %f101, [%rd45+2560];\n" +" add.ftz.f32 %f95, %f101, %f95;\n" +" st.shared.f32 [%rd39+2560], %f95;\n" +"$Lt_0_28418:\n" +" .loc 16 210 0\n" +" shr.u32 %r58, %r58, 1;\n" +" mov.u32 %r60, 0;\n" +" setp.ne.u32 %p16, %r58, %r60;\n" +" @%p16 bra $Lt_0_28162;\n" +"$Lt_0_27650:\n" +" .loc 16 218 0\n" +" mov.f32 %f6, %f86;\n" +" mov.f32 %f8, %f87;\n" +" mov.f32 %f10, %f88;\n" +" mov.f32 %f12, %f89;\n" +" mov.f32 %f14, %f94;\n" +" mov.f32 %f16, %f95;\n" +"$Lt_0_27138:\n" +"$Lt_0_25090:\n" +" selp.s32 %r61, 1, 0, %p1;\n" +" mov.s32 %r62, 0;\n" +" set.eq.u32.s32 %r63, %r6, %r62;\n" +" neg.s32 %r64, %r63;\n" +" and.b32 %r65, %r61, %r64;\n" +" mov.u32 %r66, 0;\n" +" setp.eq.s32 %p17, %r65, %r66;\n" +" @%p17 bra $Lt_0_29186;\n" +" .loc 16 224 0\n" +" cvt.s64.s32 %rd46, %r9;\n" +" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd48, %r9, 4;\n" +" add.u64 %rd49, %rd47, %rd48;\n" +" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r68, 0;\n" +" setp.le.s32 %p18, %r67, %r68;\n" +" @%p18 bra $Lt_0_29698;\n" +" .loc 16 226 0\n" +" st.global.f32 [%rd49+0], %f28;\n" +" .loc 16 227 0\n" +" cvt.s64.s32 %rd50, %r10;\n" +" mul.wide.s32 %rd51, %r10, 4;\n" +" add.u64 %rd49, %rd49, %rd51;\n" +"$Lt_0_29698:\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p19, %r69, %r70;\n" +" @%p19 bra $Lt_0_30210;\n" +" .loc 16 231 0\n" +" mov.f32 %f102, %f6;\n" +" st.global.f32 [%rd49+0], %f102;\n" +" .loc 16 232 0\n" +" cvt.s64.s32 %rd52, %r10;\n" +" mul.wide.s32 %rd53, %r10, 4;\n" +" add.u64 %rd54, %rd53, %rd49;\n" +" .loc 16 231 0\n" +" mov.f32 %f103, %f8;\n" +" st.global.f32 [%rd54+0], %f103;\n" +" .loc 16 232 0\n" +" add.u64 %rd55, %rd53, %rd54;\n" +" .loc 16 231 0\n" +" mov.f32 %f104, %f10;\n" +" st.global.f32 [%rd55+0], %f104;\n" +" .loc 16 232 0\n" +" add.u64 %rd56, %rd53, %rd55;\n" +" .loc 16 231 0\n" +" mov.f32 %f105, %f12;\n" +" st.global.f32 [%rd56+0], %f105;\n" +" .loc 16 232 0\n" +" add.u64 %rd49, %rd53, %rd56;\n" +" .loc 16 231 0\n" +" mov.f32 %f106, %f14;\n" +" st.global.f32 [%rd49+0], %f106;\n" +" mov.f32 %f107, %f16;\n" +" add.u64 %rd57, %rd53, %rd49;\n" +" st.global.f32 [%rd57+0], %f107;\n" +"$Lt_0_30210:\n" +" .loc 16 235 0\n" +" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd59, %rd46, 16;\n" +" add.u64 %rd60, %rd58, %rd59;\n" +" mov.f32 %f108, %f109;\n" +" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f108};\n" +"$Lt_0_29186:\n" +" .loc 16 237 0\n" +" 
exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<74>;\n" +" .reg .u64 %rd<74>;\n" +" .reg .f32 %f<118>;\n" +" .reg .pred %p<24>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32656_33_non_const_sp_lj3268[16];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32654_34_non_const_lj13296[1936];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32655_34_non_const_lj35232[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32753_35_non_const_red_acc7168[3072];\n" +" .loc 16 245 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 3;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_22786;\n" +" .loc 16 255 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32656_33_non_const_sp_lj3268;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_22786:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32656_33_non_const_sp_lj3268;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_23298;\n" +" .loc 16 257 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32654_34_non_const_lj13296;\n" +" cvt.s64.s32 %rd8, %r1;\n" +" mul.wide.s32 %rd9, %r1, 16;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" add.u64 %rd11, %rd10, %rd9;\n" +" add.u64 %rd12, %rd9, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" +" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r5, 0;\n" +" setp.le.s32 %p3, %r4, %r5;\n" +" @%p3 bra $Lt_1_23810;\n" +" .loc 16 259 0\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232;\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd15, %rd14, %rd9;\n" +" add.u64 %rd16, %rd9, %rd13;\n" +" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" +" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" +"$Lt_1_23810:\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232;\n" +"$Lt_1_23298:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32654_34_non_const_lj13296;\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232;\n" +" .loc 16 269 0\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" mov.f32 %f20, 0f00000000; \n" +" mov.f32 %f21, %f20;\n" +" .loc 16 271 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r7, %r1, 
%r6;\n" +" cvt.s32.u32 %r8, %ntid.x;\n" +" div.s32 %r9, %r8, %r6;\n" +" rem.s32 %r10, %r1, %r6;\n" +" cvt.s32.u32 %r11, %ctaid.x;\n" +" mul.lo.s32 %r12, %r11, %r9;\n" +" add.s32 %r13, %r7, %r12;\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p4, %r13, %r14;\n" +" @!%p4 bra $Lt_1_24578;\n" +" .loc 16 277 0\n" +" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd17, %r15;\n" +" mul.wide.s32 %rd18, %r15, 4;\n" +" cvt.s64.s32 %rd19, %r13;\n" +" mul.wide.s32 %rd20, %r13, 4;\n" +" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd22, %rd20, %rd21;\n" +" add.u64 %rd23, %rd18, %rd22;\n" +" ld.global.s32 %r16, [%rd23+0];\n" +" add.u64 %rd24, %rd18, %rd23;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p5, %rd25, %rd21;\n" +" @%p5 bra $Lt_1_25090;\n" +" .loc 16 283 0\n" +" cvt.s32.s64 %r17, %rd17;\n" +" mul.lo.s32 %r18, %r17, %r16;\n" +" cvt.s64.s32 %rd26, %r18;\n" +" mul.wide.s32 %rd27, %r18, 4;\n" +" add.u64 %rd28, %rd24, %rd27;\n" +" .loc 16 284 0\n" +" mul.lo.s32 %r19, %r10, %r17;\n" +" cvt.s64.s32 %rd29, %r19;\n" +" mul.wide.s32 %rd30, %r19, 4;\n" +" add.u64 %rd31, %rd24, %rd30;\n" +" .loc 16 285 0\n" +" mul.lo.s32 %r20, %r17, %r6;\n" +" bra.uni $Lt_1_24834;\n" +"$Lt_1_25090:\n" +" .loc 16 287 0\n" +" ld.global.s32 %r21, [%rd24+0];\n" +" cvt.s64.s32 %rd32, %r21;\n" +" mul.wide.s32 %rd33, %r21, 4;\n" +" add.u64 %rd34, %rd25, %rd33;\n" +" .loc 16 288 0\n" +" cvt.s64.s32 %rd35, %r16;\n" +" mul.wide.s32 %rd36, %r16, 4;\n" +" add.u64 %rd28, %rd34, %rd36;\n" +" .loc 16 289 0\n" +" mov.s32 %r20, %r6;\n" +" .loc 16 290 0\n" +" cvt.s64.s32 %rd37, %r10;\n" +" mul.wide.s32 %rd38, %r10, 4;\n" +" add.u64 %rd31, %rd34, %rd38;\n" +"$Lt_1_24834:\n" +" .loc 16 293 0\n" +" ld.global.s32 %r22, [%rd22+0];\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" mov.f32 %f29, %f25;\n" +" setp.ge.u64 %p6, %rd31, %rd28;\n" +" @%p6 bra $Lt_1_34562;\n" +" cvt.rzi.ftz.s32.f32 %r30, %f29;\n" +" cvt.s64.s32 %rd39, %r20;\n" +" mul.lo.s32 %r31, %r30, 11;\n" +" cvt.rn.f32.s32 %f30, %r31;\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +"$Lt_1_25858:\n" +" .loc 16 300 0\n" +" ld.global.s32 %r32, [%rd31+0];\n" +" .loc 16 301 0\n" +" shr.s32 %r33, %r32, 30;\n" +" and.b32 %r34, %r33, 3;\n" +" cvt.s64.s32 %rd40, %r34;\n" +" mul.wide.s32 %rd41, %r34, 4;\n" +" add.u64 %rd42, %rd1, %rd41;\n" +" ld.shared.f32 %f35, [%rd42+0];\n" +" .loc 16 304 0\n" +" and.b32 %r35, %r32, 1073741823;\n" +" mov.u32 %r36, %r35;\n" +" mov.s32 %r37, 0;\n" +" mov.u32 %r38, %r37;\n" +" mov.s32 %r39, 0;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];\n" +" mov.f32 %f40, %f36;\n" +" mov.f32 %f41, %f37;\n" +" mov.f32 %f42, %f38;\n" +" mov.f32 %f43, %f39;\n" +" sub.ftz.f32 %f44, %f27, %f41;\n" +" sub.ftz.f32 %f45, %f26, %f40;\n" +" sub.ftz.f32 %f46, %f28, %f42;\n" +" mul.ftz.f32 %f47, %f44, %f44;\n" +" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" +" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" +" add.ftz.f32 %f50, %f30, %f43;\n" +" cvt.rzi.ftz.s32.f32 %r43, %f50;\n" +" 
cvt.s64.s32 %rd43, %r43;\n" +" mul.wide.s32 %rd44, %r43, 16;\n" +" add.u64 %rd45, %rd44, %rd7;\n" +" ld.shared.f32 %f51, [%rd45+0];\n" +" setp.gt.ftz.f32 %p7, %f51, %f49;\n" +" @!%p7 bra $Lt_1_28162;\n" +" rcp.approx.ftz.f32 %f52, %f49;\n" +" ld.shared.f32 %f53, [%rd45+4];\n" +" mov.f32 %f54, 0f40000000; \n" +" setp.eq.ftz.f32 %p8, %f53, %f54;\n" +" @!%p8 bra $Lt_1_26882;\n" +" .loc 16 318 0\n" +" mul.ftz.f32 %f55, %f52, %f52;\n" +" mov.f32 %f56, %f55;\n" +" .loc 16 319 0\n" +" mul.ftz.f32 %f57, %f55, %f55;\n" +" bra.uni $Lt_1_27138;\n" +"$Lt_1_26882:\n" +" mov.f32 %f58, 0f3f800000; \n" +" setp.eq.ftz.f32 %p9, %f53, %f58;\n" +" @!%p9 bra $Lt_1_27394;\n" +" .loc 16 321 0\n" +" sqrt.approx.ftz.f32 %f59, %f52;\n" +" mul.ftz.f32 %f60, %f52, %f59;\n" +" mov.f32 %f57, %f60;\n" +" .loc 16 322 0\n" +" mul.ftz.f32 %f56, %f60, %f60;\n" +" bra.uni $Lt_1_27138;\n" +"$Lt_1_27394:\n" +" .loc 16 324 0\n" +" mul.ftz.f32 %f61, %f52, %f52;\n" +" mul.ftz.f32 %f62, %f52, %f61;\n" +" mov.f32 %f56, %f62;\n" +" .loc 16 325 0\n" +" mov.f32 %f57, %f62;\n" +"$Lt_1_27138:\n" +"$Lt_1_26626:\n" +" .loc 16 327 0\n" +" mul.ftz.f32 %f63, %f52, %f35;\n" +" mul.ftz.f32 %f64, %f56, %f63;\n" +" ld.shared.v2.f32 {%f65,%f66}, [%rd45+8];\n" +" mul.ftz.f32 %f67, %f65, %f57;\n" +" sub.ftz.f32 %f68, %f67, %f66;\n" +" mul.ftz.f32 %f69, %f64, %f68;\n" +" .loc 16 329 0\n" +" fma.rn.ftz.f32 %f33, %f45, %f69, %f33;\n" +" .loc 16 330 0\n" +" fma.rn.ftz.f32 %f32, %f44, %f69, %f32;\n" +" .loc 16 331 0\n" +" fma.rn.ftz.f32 %f31, %f46, %f69, %f31;\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p10, %r44, %r45;\n" +" @%p10 bra $Lt_1_27650;\n" +" .loc 16 333 0\n" +" add.u64 %rd46, %rd44, %rd13;\n" +" ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd46+0];\n" +" mul.ftz.f32 %f73, %f35, %f56;\n" +" mul.ftz.f32 %f74, %f70, %f57;\n" +" sub.ftz.f32 %f75, %f74, %f71;\n" +" mul.ftz.f32 %f76, %f73, %f75;\n" +" sub.ftz.f32 %f77, %f76, %f72;\n" +" add.ftz.f32 %f34, %f34, %f77;\n" +"$Lt_1_27650:\n" +" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r47, 0;\n" +" setp.le.s32 %p11, %r46, %r47;\n" +" @%p11 bra $Lt_1_28162;\n" +" .loc 16 336 0\n" +" mov.f32 %f78, %f11;\n" +" mul.ftz.f32 %f79, %f45, %f45;\n" +" fma.rn.ftz.f32 %f80, %f69, %f79, %f78;\n" +" mov.f32 %f11, %f80;\n" +" .loc 16 337 0\n" +" mov.f32 %f81, %f13;\n" +" fma.rn.ftz.f32 %f82, %f69, %f47, %f81;\n" +" mov.f32 %f13, %f82;\n" +" .loc 16 338 0\n" +" mov.f32 %f83, %f15;\n" +" mul.ftz.f32 %f84, %f46, %f46;\n" +" fma.rn.ftz.f32 %f85, %f69, %f84, %f83;\n" +" mov.f32 %f15, %f85;\n" +" .loc 16 339 0\n" +" mov.f32 %f86, %f17;\n" +" mul.ftz.f32 %f87, %f44, %f45;\n" +" fma.rn.ftz.f32 %f88, %f69, %f87, %f86;\n" +" mov.f32 %f17, %f88;\n" +" .loc 16 340 0\n" +" mov.f32 %f89, %f19;\n" +" mul.ftz.f32 %f90, %f45, %f46;\n" +" fma.rn.ftz.f32 %f91, %f69, %f90, %f89;\n" +" mov.f32 %f19, %f91;\n" +" .loc 16 341 0\n" +" mul.ftz.f32 %f92, %f44, %f46;\n" +" fma.rn.ftz.f32 %f20, %f69, %f92, %f20;\n" +" mov.f32 %f21, %f20;\n" +"$Lt_1_28162:\n" +"$Lt_1_26114:\n" +" .loc 16 298 0\n" +" mul.lo.u64 %rd47, %rd39, 4;\n" +" add.u64 %rd31, %rd31, %rd47;\n" +" setp.lt.u64 %p12, %rd31, %rd28;\n" +" @%p12 bra $Lt_1_25858;\n" +" bra.uni $Lt_1_24322;\n" +"$Lt_1_34562:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" bra.uni $Lt_1_24322;\n" +"$Lt_1_24578:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 
0f00000000; \n" +"$Lt_1_24322:\n" +" mov.u32 %r48, 1;\n" +" setp.le.s32 %p13, %r6, %r48;\n" +" @%p13 bra $Lt_1_30978;\n" +" .loc 16 352 0\n" +" mov.u64 %rd48, __cuda___cuda_local_var_32753_35_non_const_red_acc7168;\n" +" cvt.s64.s32 %rd49, %r1;\n" +" mul.wide.s32 %rd50, %r1, 4;\n" +" add.u64 %rd51, %rd48, %rd50;\n" +" mov.f32 %f93, %f33;\n" +" st.shared.f32 [%rd51+0], %f93;\n" +" .loc 16 353 0\n" +" mov.f32 %f94, %f32;\n" +" st.shared.f32 [%rd51+512], %f94;\n" +" .loc 16 354 0\n" +" mov.f32 %f95, %f31;\n" +" st.shared.f32 [%rd51+1024], %f95;\n" +" .loc 16 355 0\n" +" mov.f32 %f96, %f34;\n" +" st.shared.f32 [%rd51+1536], %f96;\n" +" .loc 16 357 0\n" +" shr.s32 %r49, %r6, 31;\n" +" mov.s32 %r50, 1;\n" +" and.b32 %r51, %r49, %r50;\n" +" add.s32 %r52, %r51, %r6;\n" +" shr.s32 %r53, %r52, 1;\n" +" mov.s32 %r54, %r53;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p14, %r53, %r55;\n" +" @!%p14 bra $Lt_1_29442;\n" +"$Lt_1_29954:\n" +" setp.ge.u32 %p15, %r10, %r54;\n" +" @%p15 bra $Lt_1_30210;\n" +" .loc 16 360 0\n" +" add.u32 %r56, %r1, %r54;\n" +" cvt.u64.u32 %rd52, %r56;\n" +" mul.wide.u32 %rd53, %r56, 4;\n" +" add.u64 %rd54, %rd48, %rd53;\n" +" ld.shared.f32 %f97, [%rd54+0];\n" +" add.ftz.f32 %f93, %f97, %f93;\n" +" st.shared.f32 [%rd51+0], %f93;\n" +" ld.shared.f32 %f98, [%rd54+512];\n" +" add.ftz.f32 %f94, %f98, %f94;\n" +" st.shared.f32 [%rd51+512], %f94;\n" +" ld.shared.f32 %f99, [%rd54+1024];\n" +" add.ftz.f32 %f95, %f99, %f95;\n" +" st.shared.f32 [%rd51+1024], %f95;\n" +" ld.shared.f32 %f100, [%rd54+1536];\n" +" add.ftz.f32 %f96, %f100, %f96;\n" +" st.shared.f32 [%rd51+1536], %f96;\n" +"$Lt_1_30210:\n" +" .loc 16 357 0\n" +" shr.u32 %r54, %r54, 1;\n" +" mov.u32 %r57, 0;\n" +" setp.ne.u32 %p16, %r54, %r57;\n" +" @%p16 bra $Lt_1_29954;\n" +"$Lt_1_29442:\n" +" .loc 16 364 0\n" +" mov.f32 %f33, %f93;\n" +" .loc 16 365 0\n" +" mov.f32 %f32, %f94;\n" +" .loc 16 366 0\n" +" mov.f32 %f31, %f95;\n" +" .loc 16 367 0\n" +" mov.f32 %f34, %f96;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p17, %r58, %r59;\n" +" @%p17 bra $Lt_1_30978;\n" +" .loc 16 371 0\n" +" mov.f32 %f93, %f11;\n" +" st.shared.f32 [%rd51+0], %f93;\n" +" mov.f32 %f94, %f13;\n" +" st.shared.f32 [%rd51+512], %f94;\n" +" mov.f32 %f95, %f15;\n" +" st.shared.f32 [%rd51+1024], %f95;\n" +" mov.f32 %f96, %f17;\n" +" st.shared.f32 [%rd51+1536], %f96;\n" +" mov.f32 %f101, %f19;\n" +" st.shared.f32 [%rd51+2048], %f101;\n" +" mov.f32 %f102, %f21;\n" +" st.shared.f32 [%rd51+2560], %f102;\n" +" .loc 16 373 0\n" +" mov.s32 %r60, %r53;\n" +" @!%p14 bra $Lt_1_31490;\n" +"$Lt_1_32002:\n" +" setp.ge.u32 %p18, %r10, %r60;\n" +" @%p18 bra $Lt_1_32258;\n" +" .loc 16 376 0\n" +" add.u32 %r61, %r1, %r60;\n" +" cvt.u64.u32 %rd55, %r61;\n" +" mul.wide.u32 %rd56, %r61, 4;\n" +" add.u64 %rd57, %rd48, %rd56;\n" +" ld.shared.f32 %f103, [%rd57+0];\n" +" add.ftz.f32 %f93, %f103, %f93;\n" +" st.shared.f32 [%rd51+0], %f93;\n" +" ld.shared.f32 %f104, [%rd57+512];\n" +" add.ftz.f32 %f94, %f104, %f94;\n" +" st.shared.f32 [%rd51+512], %f94;\n" +" ld.shared.f32 %f105, [%rd57+1024];\n" +" add.ftz.f32 %f95, %f105, %f95;\n" +" st.shared.f32 [%rd51+1024], %f95;\n" +" ld.shared.f32 %f106, [%rd57+1536];\n" +" add.ftz.f32 %f96, %f106, %f96;\n" +" st.shared.f32 [%rd51+1536], %f96;\n" +" ld.shared.f32 %f107, [%rd57+2048];\n" +" add.ftz.f32 %f101, %f107, %f101;\n" +" st.shared.f32 [%rd51+2048], %f101;\n" +" ld.shared.f32 %f108, [%rd57+2560];\n" +" add.ftz.f32 %f102, %f108, %f102;\n" +" st.shared.f32 [%rd51+2560], %f102;\n" 
+"$Lt_1_32258:\n" +" .loc 16 373 0\n" +" shr.u32 %r60, %r60, 1;\n" +" mov.u32 %r62, 0;\n" +" setp.ne.u32 %p19, %r60, %r62;\n" +" @%p19 bra $Lt_1_32002;\n" +"$Lt_1_31490:\n" +" .loc 16 381 0\n" +" mov.f32 %f11, %f93;\n" +" mov.f32 %f13, %f94;\n" +" mov.f32 %f15, %f95;\n" +" mov.f32 %f17, %f96;\n" +" mov.f32 %f19, %f101;\n" +" mov.f32 %f21, %f102;\n" +"$Lt_1_30978:\n" +"$Lt_1_28930:\n" +" selp.s32 %r63, 1, 0, %p4;\n" +" mov.s32 %r64, 0;\n" +" set.eq.u32.s32 %r65, %r10, %r64;\n" +" neg.s32 %r66, %r65;\n" +" and.b32 %r67, %r63, %r66;\n" +" mov.u32 %r68, 0;\n" +" setp.eq.s32 %p20, %r67, %r68;\n" +" @%p20 bra $Lt_1_33026;\n" +" .loc 16 387 0\n" +" cvt.s64.s32 %rd58, %r13;\n" +" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd60, %r13, 4;\n" +" add.u64 %rd61, %rd59, %rd60;\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p21, %r69, %r70;\n" +" @%p21 bra $Lt_1_33538;\n" +" .loc 16 389 0\n" +" st.global.f32 [%rd61+0], %f34;\n" +" .loc 16 390 0\n" +" cvt.s64.s32 %rd62, %r14;\n" +" mul.wide.s32 %rd63, %r14, 4;\n" +" add.u64 %rd61, %rd61, %rd63;\n" +"$Lt_1_33538:\n" +" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r72, 0;\n" +" setp.le.s32 %p22, %r71, %r72;\n" +" @%p22 bra $Lt_1_34050;\n" +" .loc 16 394 0\n" +" mov.f32 %f109, %f11;\n" +" st.global.f32 [%rd61+0], %f109;\n" +" .loc 16 395 0\n" +" cvt.s64.s32 %rd64, %r14;\n" +" mul.wide.s32 %rd65, %r14, 4;\n" +" add.u64 %rd66, %rd65, %rd61;\n" +" .loc 16 394 0\n" +" mov.f32 %f110, %f13;\n" +" st.global.f32 [%rd66+0], %f110;\n" +" .loc 16 395 0\n" +" add.u64 %rd67, %rd65, %rd66;\n" +" .loc 16 394 0\n" +" mov.f32 %f111, %f15;\n" +" st.global.f32 [%rd67+0], %f111;\n" +" .loc 16 395 0\n" +" add.u64 %rd68, %rd65, %rd67;\n" +" .loc 16 394 0\n" +" mov.f32 %f112, %f17;\n" +" st.global.f32 [%rd68+0], %f112;\n" +" .loc 16 395 0\n" +" add.u64 %rd61, %rd65, %rd68;\n" +" .loc 16 394 0\n" +" mov.f32 %f113, %f19;\n" +" st.global.f32 [%rd61+0], %f113;\n" +" mov.f32 %f114, %f21;\n" +" add.u64 %rd69, %rd65, %rd61;\n" +" st.global.f32 [%rd69+0], %f114;\n" +"$Lt_1_34050:\n" +" .loc 16 398 0\n" +" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd71, %rd58, 16;\n" +" add.u64 %rd72, %rd70, %rd71;\n" +" mov.f32 %f115, %f116;\n" +" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f115};\n" +"$Lt_1_33026:\n" +" .loc 16 400 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/cmmc_long_gpu_kernel.ptx b/lib/gpu/cmmc_long_gpu_kernel.ptx new file mode 100644 index 000000000..001141e8d --- /dev/null +++ b/lib/gpu/cmmc_long_gpu_kernel.ptx @@ -0,0 +1,1277 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000c070_00000000-9_cmmc_long_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.PEOSwN) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000c070_00000000-8_cmmc_long_gpu_kernel.cudafe2.gpu" + .file 3 
"/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "cmmc_long_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_q_, + .param .f32 __cudaparm_kernel_pair_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_qqrd2e, + .param .f32 __cudaparm_kernel_pair_g_ewald, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<86>; + .reg .u64 %rd<65>; + .reg .f32 %f<175>; + .reg .pred %p<23>; + .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32]; + .shared .align 4 .b8 __cuda___cuda_local_var_32619_35_non_const_red_acc144[3072]; + // __cuda_local_var_32510_10_non_const_f = 64 + // __cuda_local_var_32514_9_non_const_virial = 16 + // __cuda_local_var_32561_43_non_const_inv1 = 40 + // __cuda_local_var_32561_49_non_const_inv2 = 44 + // __cuda_local_var_32561_55_non_const_prefactor = 52 + // __cuda_local_var_32561_66_non_const__erfc = 48 + .loc 16 108 0 +$LDWbegin_kernel_pair: + .loc 16 115 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 116 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 117 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 118 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; + .loc 16 119 0 + ld.global.f32 %f5, [%rd1+16]; + .loc 16 120 0 + ld.global.f32 %f6, [%rd1+20]; + .loc 16 121 0 + ld.global.f32 %f7, [%rd1+24]; + .loc 16 122 0 + ld.global.f32 %f8, [%rd1+28]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; + .loc 16 132 0 + 
mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + mov.f32 %f17, 0f00000000; // 0 + mov.f32 %f18, %f17; + mov.f32 %f19, 0f00000000; // 0 + mov.f32 %f20, %f19; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_23810; + .loc 16 136 0 + cvt.s64.s32 %rd2, %r9; + mul.wide.s32 %rd3, %r9, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd5, %rd3, %rd4; + ld.global.s32 %r11, [%rd5+0]; + .loc 16 138 0 + ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd6, %r12; + mul.wide.s32 %rd7, %r12, 4; + add.u64 %rd8, %rd7, %rd5; + ld.global.s32 %r13, [%rd8+0]; + add.u64 %rd9, %rd7, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd4; + @%p2 bra $Lt_0_24322; + .loc 16 144 0 + cvt.s32.s64 %r14, %rd6; + mul.lo.s32 %r15, %r14, %r13; + cvt.s64.s32 %rd11, %r15; + mul.wide.s32 %rd12, %r15, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 145 0 + mul.lo.s32 %r16, %r6, %r14; + cvt.s64.s32 %rd14, %r16; + mul.wide.s32 %rd15, %r16, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 146 0 + mul.lo.s32 %r17, %r14, %r1; + bra.uni $Lt_0_24066; +$Lt_0_24322: + .loc 16 148 0 + ld.global.s32 %r18, [%rd9+0]; + cvt.s64.s32 %rd17, %r18; + mul.wide.s32 %rd18, %r18, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 149 0 + cvt.s64.s32 %rd20, %r13; + mul.wide.s32 %rd21, %r13, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 150 0 + mov.s32 %r17, %r1; + .loc 16 151 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_24066: + .loc 16 154 0 + mov.u32 %r19, %r11; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f25, %f21; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + .loc 16 155 0 + mov.u32 %r26, %r11; + mov.s32 %r27, 0; + mov.u32 %r28, %r27; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}]; + mov.f32 %f33, %f29; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_35330; + cvt.rzi.ftz.s32.f32 %r33, %f28; + cvt.s64.s32 %rd24, %r17; + ld.param.s32 %r34, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r35, %r34, %r33; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1]; + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112; +$Lt_0_25090: + //<loop> Loop body line 155, nesting depth: 1, estimated iterations: unknown + .loc 16 159 0 + ld.global.s32 %r36, [%rd16+0]; + .loc 16 162 0 + shr.s32 %r37, %r36, 30; + and.b32 %r38, %r37, 3; + cvt.s64.s32 %rd27, %r38; + mul.wide.s32 %rd28, %r38, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f39, [%rd29+0]; + .loc 16 163 0 + mov.f32 %f40, 0f3f800000; // 1 + ld.shared.f32 %f41, [%rd29+16]; + sub.ftz.f32 %f42, %f40, %f41; + .loc 16 166 0 + and.b32 %r39, %r36, 
1073741823; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + mov.s32 %r43, 0; + mov.u32 %r44, %r43; + mov.s32 %r45, 0; + mov.u32 %r46, %r45; + tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r40,%r42,%r44,%r46}]; + mov.f32 %f47, %f43; + mov.f32 %f48, %f44; + mov.f32 %f49, %f45; + mov.f32 %f50, %f46; + cvt.rzi.ftz.s32.f32 %r47, %f50; + sub.ftz.f32 %f51, %f26, %f48; + sub.ftz.f32 %f52, %f25, %f47; + sub.ftz.f32 %f53, %f27, %f49; + mul.ftz.f32 %f54, %f51, %f51; + fma.rn.ftz.f32 %f55, %f52, %f52, %f54; + fma.rn.ftz.f32 %f56, %f53, %f53, %f55; + add.s32 %r48, %r47, %r35; + cvt.s64.s32 %rd30, %r48; + mul.wide.s32 %rd31, %r48, 16; + add.u64 %rd32, %rd31, %rd25; + ld.global.f32 %f57, [%rd32+0]; + setp.gt.ftz.f32 %p4, %f57, %f56; + @!%p4 bra $Lt_0_28930; + rcp.approx.ftz.f32 %f58, %f56; + ld.global.f32 %f59, [%rd32+4]; + setp.lt.ftz.f32 %p5, %f56, %f59; + @!%p5 bra $Lt_0_26114; + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd34, %rd33, %rd31; + ld.global.f32 %f60, [%rd34+0]; + mov.f32 %f61, 0f40000000; // 2 + setp.eq.ftz.f32 %p6, %f60, %f61; + @!%p6 bra $Lt_0_26626; + .loc 16 182 0 + mul.ftz.f32 %f62, %f58, %f58; + mov.f32 %f63, %f62; + mov.f32 %f64, %f63; + .loc 16 183 0 + mul.ftz.f32 %f65, %f62, %f62; + mov.f32 %f66, %f65; + bra.uni $Lt_0_26882; +$Lt_0_26626: + mov.f32 %f67, 0f3f800000; // 1 + setp.eq.ftz.f32 %p7, %f60, %f67; + @!%p7 bra $Lt_0_27138; + .loc 16 185 0 + sqrt.approx.ftz.f32 %f68, %f58; + mul.ftz.f32 %f69, %f58, %f68; + mov.f32 %f65, %f69; + mov.f32 %f66, %f65; + .loc 16 186 0 + mul.ftz.f32 %f63, %f69, %f69; + mov.f32 %f64, %f63; + bra.uni $Lt_0_26882; +$Lt_0_27138: + .loc 16 188 0 + mul.ftz.f32 %f70, %f58, %f58; + mul.ftz.f32 %f71, %f58, %f70; + mov.f32 %f63, %f71; + mov.f32 %f64, %f63; + .loc 16 189 0 + mov.f32 %f65, %f71; + mov.f32 %f66, %f65; +$Lt_0_26882: +$Lt_0_26370: + .loc 16 191 0 + mul.ftz.f32 %f72, %f39, %f63; + ld.global.v2.f32 {%f73,%f74}, [%rd32+8]; + mul.ftz.f32 %f75, %f73, %f65; + sub.ftz.f32 %f76, %f75, %f74; + mul.ftz.f32 %f77, %f72, %f76; + bra.uni $Lt_0_25858; +$Lt_0_26114: + .loc 16 193 0 + mov.f32 %f77, 0f00000000; // 0 +$Lt_0_25858: + ld.param.f32 %f78, [__cudaparm_kernel_pair_cut_coulsq]; + setp.gt.ftz.f32 %p8, %f78, %f56; + @!%p8 bra $Lt_0_27650; + .loc 16 200 0 + sqrt.approx.ftz.f32 %f79, %f56; + ld.param.f32 %f80, [__cudaparm_kernel_pair_g_ewald]; + mul.ftz.f32 %f81, %f80, %f79; + mul.ftz.f32 %f82, %f81, %f81; + mov.f32 %f83, 0f3f800000; // 1 + mov.f32 %f84, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f85, %f84, %f81, %f83; + neg.ftz.f32 %f86, %f82; + rcp.approx.ftz.f32 %f87, %f85; + mov.f32 %f88, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f89, %f86, %f88; + ex2.approx.ftz.f32 %f90, %f89; + mov.f32 %f91, 0f3e827906; // 0.25483 + mov.f32 %f92, 0fbe91a98e; // -0.284497 + mov.f32 %f93, 0f3fb5f0e3; // 1.42141 + mov.f32 %f94, 0fbfba00e3; // -1.45315 + mov.f32 %f95, 0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f96, %f95, %f87, %f94; + fma.rn.ftz.f32 %f97, %f87, %f96, %f93; + fma.rn.ftz.f32 %f98, %f87, %f97, %f92; + fma.rn.ftz.f32 %f99, %f87, %f98, %f91; + mul.ftz.f32 %f100, %f87, %f99; + mul.ftz.f32 %f101, %f90, %f100; + mov.f32 %f102, %f101; + .loc 16 201 0 + mov.u32 %r49, %r39; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f103,%f104,%f105,%f106},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f107, %f103; + ld.param.f32 %f108, [__cudaparm_kernel_pair_qqrd2e]; + mul.ftz.f32 %f109, %f108, %f33; + mul.ftz.f32 %f110, %f109, %f107; + 
div.approx.ftz.f32 %f111, %f110, %f79; + mov.f32 %f112, %f111; + .loc 16 202 0 + mov.f32 %f113, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f114, %f81, %f113; + fma.rn.ftz.f32 %f115, %f90, %f114, %f101; + sub.ftz.f32 %f116, %f115, %f42; + mul.ftz.f32 %f117, %f111, %f116; + bra.uni $Lt_0_27394; +$Lt_0_27650: + .loc 16 204 0 + mov.f32 %f117, 0f00000000; // 0 +$Lt_0_27394: + .loc 16 208 0 + add.ftz.f32 %f118, %f117, %f77; + mul.ftz.f32 %f119, %f118, %f58; + fma.rn.ftz.f32 %f36, %f52, %f119, %f36; + .loc 16 209 0 + fma.rn.ftz.f32 %f35, %f51, %f119, %f35; + .loc 16 210 0 + fma.rn.ftz.f32 %f34, %f53, %f119, %f34; + ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r57, 0; + setp.le.s32 %p9, %r56, %r57; + @%p9 bra $Lt_0_28418; + .loc 16 213 0 + mov.f32 %f120, %f112; + mov.f32 %f121, %f102; + sub.ftz.f32 %f122, %f121, %f42; + fma.rn.ftz.f32 %f123, %f120, %f122, %f37; + selp.f32 %f37, %f123, %f37, %p8; + @!%p5 bra $Lt_0_28418; + .loc 16 216 0 + ld.param.u64 %rd35, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd36, %rd35, %rd31; + ld.global.v4.f32 {_,%f124,%f125,%f126}, [%rd36+0]; + mov.f32 %f127, %f64; + mul.ftz.f32 %f128, %f127, %f39; + mov.f32 %f129, %f66; + mul.ftz.f32 %f130, %f124, %f129; + sub.ftz.f32 %f131, %f130, %f125; + mul.ftz.f32 %f132, %f128, %f131; + sub.ftz.f32 %f133, %f132, %f126; + add.ftz.f32 %f38, %f38, %f133; +$Lt_0_28418: +$Lt_0_27906: + ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p10, %r58, %r59; + @%p10 bra $Lt_0_28930; + .loc 16 221 0 + mov.f32 %f134, %f10; + mul.ftz.f32 %f135, %f52, %f52; + fma.rn.ftz.f32 %f136, %f119, %f135, %f134; + mov.f32 %f10, %f136; + .loc 16 222 0 + mov.f32 %f137, %f12; + fma.rn.ftz.f32 %f138, %f119, %f54, %f137; + mov.f32 %f12, %f138; + .loc 16 223 0 + mov.f32 %f139, %f14; + mul.ftz.f32 %f140, %f53, %f53; + fma.rn.ftz.f32 %f141, %f119, %f140, %f139; + mov.f32 %f14, %f141; + .loc 16 224 0 + mov.f32 %f142, %f16; + mul.ftz.f32 %f143, %f51, %f52; + fma.rn.ftz.f32 %f144, %f119, %f143, %f142; + mov.f32 %f16, %f144; + .loc 16 225 0 + mov.f32 %f145, %f18; + mul.ftz.f32 %f146, %f52, %f53; + fma.rn.ftz.f32 %f147, %f119, %f146, %f145; + mov.f32 %f18, %f147; + .loc 16 226 0 + mul.ftz.f32 %f148, %f51, %f53; + fma.rn.ftz.f32 %f19, %f119, %f148, %f19; + mov.f32 %f20, %f19; +$Lt_0_28930: +$Lt_0_25346: + .loc 16 158 0 + mul.lo.u64 %rd37, %rd24, 4; + add.u64 %rd16, %rd16, %rd37; + setp.lt.u64 %p11, %rd16, %rd13; + @%p11 bra $Lt_0_25090; + bra.uni $Lt_0_23554; +$Lt_0_35330: + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + bra.uni $Lt_0_23554; +$Lt_0_23810: + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 +$Lt_0_23554: + mov.u32 %r60, 1; + setp.le.s32 %p12, %r1, %r60; + @%p12 bra $Lt_0_31746; + .loc 16 237 0 + mov.u64 %rd38, __cuda___cuda_local_var_32619_35_non_const_red_acc144; + cvt.s64.s32 %rd39, %r2; + mul.wide.s32 %rd40, %r2, 4; + add.u64 %rd41, %rd38, %rd40; + mov.f32 %f149, %f36; + st.shared.f32 [%rd41+0], %f149; + .loc 16 238 0 + mov.f32 %f150, %f35; + st.shared.f32 [%rd41+512], %f150; + .loc 16 239 0 + mov.f32 %f151, %f34; + st.shared.f32 [%rd41+1024], %f151; + .loc 16 240 0 + mov.f32 %f152, %f38; + st.shared.f32 [%rd41+1536], %f152; + .loc 16 241 0 + mov.f32 %f153, %f37; + st.shared.f32 [%rd41+2048], %f153; + .loc 16 243 0 + shr.s32 %r61, %r1, 31; + mov.s32 %r62, 1; + and.b32 %r63, %r61, 
%r62; + add.s32 %r64, %r63, %r1; + shr.s32 %r65, %r64, 1; + mov.s32 %r66, %r65; + mov.u32 %r67, 0; + setp.ne.u32 %p13, %r65, %r67; + @!%p13 bra $Lt_0_30210; +$Lt_0_30722: + setp.ge.u32 %p14, %r6, %r66; + @%p14 bra $Lt_0_30978; + .loc 16 246 0 + add.u32 %r68, %r2, %r66; + cvt.u64.u32 %rd42, %r68; + mul.wide.u32 %rd43, %r68, 4; + add.u64 %rd44, %rd38, %rd43; + ld.shared.f32 %f154, [%rd44+0]; + add.ftz.f32 %f149, %f154, %f149; + st.shared.f32 [%rd41+0], %f149; + ld.shared.f32 %f155, [%rd44+512]; + add.ftz.f32 %f150, %f155, %f150; + st.shared.f32 [%rd41+512], %f150; + ld.shared.f32 %f156, [%rd44+1024]; + add.ftz.f32 %f151, %f156, %f151; + st.shared.f32 [%rd41+1024], %f151; + ld.shared.f32 %f157, [%rd44+1536]; + add.ftz.f32 %f152, %f157, %f152; + st.shared.f32 [%rd41+1536], %f152; + ld.shared.f32 %f158, [%rd44+2048]; + add.ftz.f32 %f153, %f158, %f153; + st.shared.f32 [%rd41+2048], %f153; +$Lt_0_30978: + .loc 16 243 0 + shr.u32 %r66, %r66, 1; + mov.u32 %r69, 0; + setp.ne.u32 %p15, %r66, %r69; + @%p15 bra $Lt_0_30722; +$Lt_0_30210: + .loc 16 250 0 + mov.f32 %f36, %f149; + .loc 16 251 0 + mov.f32 %f35, %f150; + .loc 16 252 0 + mov.f32 %f34, %f151; + .loc 16 253 0 + mov.f32 %f38, %f152; + .loc 16 254 0 + mov.f32 %f37, %f153; + ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r71, 0; + setp.le.s32 %p16, %r70, %r71; + @%p16 bra $Lt_0_31746; + .loc 16 258 0 + mov.f32 %f149, %f10; + st.shared.f32 [%rd41+0], %f149; + mov.f32 %f150, %f12; + st.shared.f32 [%rd41+512], %f150; + mov.f32 %f151, %f14; + st.shared.f32 [%rd41+1024], %f151; + mov.f32 %f152, %f16; + st.shared.f32 [%rd41+1536], %f152; + mov.f32 %f153, %f18; + st.shared.f32 [%rd41+2048], %f153; + mov.f32 %f159, %f20; + st.shared.f32 [%rd41+2560], %f159; + .loc 16 260 0 + mov.s32 %r72, %r65; + @!%p13 bra $Lt_0_32258; +$Lt_0_32770: + setp.ge.u32 %p17, %r6, %r72; + @%p17 bra $Lt_0_33026; + .loc 16 263 0 + add.u32 %r73, %r2, %r72; + cvt.u64.u32 %rd45, %r73; + mul.wide.u32 %rd46, %r73, 4; + add.u64 %rd47, %rd38, %rd46; + ld.shared.f32 %f160, [%rd47+0]; + add.ftz.f32 %f149, %f160, %f149; + st.shared.f32 [%rd41+0], %f149; + ld.shared.f32 %f161, [%rd47+512]; + add.ftz.f32 %f150, %f161, %f150; + st.shared.f32 [%rd41+512], %f150; + ld.shared.f32 %f162, [%rd47+1024]; + add.ftz.f32 %f151, %f162, %f151; + st.shared.f32 [%rd41+1024], %f151; + ld.shared.f32 %f163, [%rd47+1536]; + add.ftz.f32 %f152, %f163, %f152; + st.shared.f32 [%rd41+1536], %f152; + ld.shared.f32 %f164, [%rd47+2048]; + add.ftz.f32 %f153, %f164, %f153; + st.shared.f32 [%rd41+2048], %f153; + ld.shared.f32 %f165, [%rd47+2560]; + add.ftz.f32 %f159, %f165, %f159; + st.shared.f32 [%rd41+2560], %f159; +$Lt_0_33026: + .loc 16 260 0 + shr.u32 %r72, %r72, 1; + mov.u32 %r74, 0; + setp.ne.u32 %p18, %r72, %r74; + @%p18 bra $Lt_0_32770; +$Lt_0_32258: + .loc 16 268 0 + mov.f32 %f10, %f149; + mov.f32 %f12, %f150; + mov.f32 %f14, %f151; + mov.f32 %f16, %f152; + mov.f32 %f18, %f153; + mov.f32 %f20, %f159; +$Lt_0_31746: +$Lt_0_29698: + selp.s32 %r75, 1, 0, %p1; + mov.s32 %r76, 0; + set.eq.u32.s32 %r77, %r6, %r76; + neg.s32 %r78, %r77; + and.b32 %r79, %r75, %r78; + mov.u32 %r80, 0; + setp.eq.s32 %p19, %r79, %r80; + @%p19 bra $Lt_0_33794; + .loc 16 274 0 + cvt.s64.s32 %rd48, %r9; + ld.param.u64 %rd49, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd50, %r9, 4; + add.u64 %rd51, %rd49, %rd50; + ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r82, 0; + setp.le.s32 %p20, %r81, %r82; + @%p20 bra $Lt_0_34306; + .loc 16 276 0 + st.global.f32 [%rd51+0], %f38; + .loc 16 277 0 + cvt.s64.s32 %rd52, 
%r10; + mul.wide.s32 %rd53, %r10, 4; + add.u64 %rd54, %rd53, %rd51; + .loc 16 278 0 + st.global.f32 [%rd54+0], %f37; + .loc 16 279 0 + add.u64 %rd51, %rd53, %rd54; +$Lt_0_34306: + ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r84, 0; + setp.le.s32 %p21, %r83, %r84; + @%p21 bra $Lt_0_34818; + .loc 16 283 0 + mov.f32 %f166, %f10; + st.global.f32 [%rd51+0], %f166; + .loc 16 284 0 + cvt.s64.s32 %rd55, %r10; + mul.wide.s32 %rd56, %r10, 4; + add.u64 %rd57, %rd56, %rd51; + .loc 16 283 0 + mov.f32 %f167, %f12; + st.global.f32 [%rd57+0], %f167; + .loc 16 284 0 + add.u64 %rd58, %rd56, %rd57; + .loc 16 283 0 + mov.f32 %f168, %f14; + st.global.f32 [%rd58+0], %f168; + .loc 16 284 0 + add.u64 %rd59, %rd56, %rd58; + .loc 16 283 0 + mov.f32 %f169, %f16; + st.global.f32 [%rd59+0], %f169; + .loc 16 284 0 + add.u64 %rd51, %rd56, %rd59; + .loc 16 283 0 + mov.f32 %f170, %f18; + st.global.f32 [%rd51+0], %f170; + mov.f32 %f171, %f20; + add.u64 %rd60, %rd56, %rd51; + st.global.f32 [%rd60+0], %f171; +$Lt_0_34818: + .loc 16 287 0 + ld.param.u64 %rd61, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd62, %rd48, 16; + add.u64 %rd63, %rd61, %rd62; + mov.f32 %f172, %f173; + st.global.v4.f32 [%rd63+0], {%f36,%f35,%f34,%f172}; +$Lt_0_33794: + .loc 16 289 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_fast_q_, + .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, + .param .f32 __cudaparm_kernel_pair_fast_g_ewald, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<86>; + .reg .u64 %rd<76>; + .reg .f32 %f<178>; + .reg .pred %p<25>; + .shared .align 4 .b8 __cuda___cuda_local_var_32691_33_non_const_sp_lj3320[32]; + .shared .align 16 .b8 __cuda___cuda_local_var_32689_34_non_const_lj13360[1936]; + .shared .align 16 .b8 __cuda___cuda_local_var_32690_34_non_const_lj35296[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32812_35_non_const_red_acc7232[3072]; + // __cuda_local_var_32701_10_non_const_f = 64 + // __cuda_local_var_32705_9_non_const_virial = 16 + // __cuda_local_var_32754_43_non_const_inv1 = 40 + // __cuda_local_var_32754_49_non_const_inv2 = 44 + // __cuda_local_var_32754_55_non_const_prefactor = 52 + // __cuda_local_var_32754_66_non_const__erfc = 48 + .loc 16 299 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 7; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_25090; + .loc 16 309 0 + mov.u64 %rd1, __cuda___cuda_local_var_32691_33_non_const_sp_lj3320; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_25090: + mov.u64 %rd1, __cuda___cuda_local_var_32691_33_non_const_sp_lj3320; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_25602; + .loc 16 311 0 + 
mov.u64 %rd7, __cuda___cuda_local_var_32689_34_non_const_lj13360; + mov.u64 %rd8, __cuda___cuda_local_var_32690_34_non_const_lj35296; + cvt.s64.s32 %rd9, %r1; + mul.wide.s32 %rd10, %r1, 16; + ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd12, %rd11, %rd10; + add.u64 %rd13, %rd10, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0]; + st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5}; + .loc 16 312 0 + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd15, %rd14, %rd10; + add.u64 %rd16, %rd10, %rd8; + ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; + st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; +$Lt_1_25602: + mov.u64 %rd7, __cuda___cuda_local_var_32689_34_non_const_lj13360; + mov.u64 %rd8, __cuda___cuda_local_var_32690_34_non_const_lj35296; + .loc 16 323 0 + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + mov.f32 %f20, 0f00000000; // 0 + mov.f32 %f21, %f20; + .loc 16 325 0 + bar.sync 0; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r5, %r1, %r4; + cvt.s32.u32 %r6, %ntid.x; + div.s32 %r7, %r6, %r4; + rem.s32 %r8, %r1, %r4; + cvt.s32.u32 %r9, %ctaid.x; + mul.lo.s32 %r10, %r9, %r7; + add.s32 %r11, %r5, %r10; + ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p3, %r11, %r12; + @!%p3 bra $Lt_1_26370; + .loc 16 329 0 + cvt.s64.s32 %rd17, %r11; + mul.wide.s32 %rd18, %r11, 4; + ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd20, %rd18, %rd19; + ld.global.s32 %r13, [%rd20+0]; + .loc 16 331 0 + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd21, %r14; + mul.wide.s32 %rd22, %r14, 4; + add.u64 %rd23, %rd22, %rd20; + ld.global.s32 %r15, [%rd23+0]; + add.u64 %rd24, %rd22, %rd23; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p4, %rd25, %rd19; + @%p4 bra $Lt_1_26882; + .loc 16 337 0 + cvt.s32.s64 %r16, %rd21; + mul.lo.s32 %r17, %r16, %r15; + cvt.s64.s32 %rd26, %r17; + mul.wide.s32 %rd27, %r17, 4; + add.u64 %rd28, %rd24, %rd27; + .loc 16 338 0 + mul.lo.s32 %r18, %r8, %r16; + cvt.s64.s32 %rd29, %r18; + mul.wide.s32 %rd30, %r18, 4; + add.u64 %rd31, %rd24, %rd30; + .loc 16 339 0 + mul.lo.s32 %r19, %r16, %r4; + bra.uni $Lt_1_26626; +$Lt_1_26882: + .loc 16 341 0 + ld.global.s32 %r20, [%rd24+0]; + cvt.s64.s32 %rd32, %r20; + mul.wide.s32 %rd33, %r20, 4; + add.u64 %rd34, %rd25, %rd33; + .loc 16 342 0 + cvt.s64.s32 %rd35, %r15; + mul.wide.s32 %rd36, %r15, 4; + add.u64 %rd28, %rd34, %rd36; + .loc 16 343 0 + mov.s32 %r19, %r4; + .loc 16 344 0 + cvt.s64.s32 %rd37, %r8; + mul.wide.s32 %rd38, %r8, 4; + add.u64 %rd31, %rd34, %rd38; +$Lt_1_26626: + .loc 16 347 0 + mov.u32 %r21, %r13; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r21,%r23,%r25,%r27}]; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + mov.f32 %f29, %f25; + .loc 16 348 0 + mov.u32 %r28, %r13; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + mov.s32 %r33, 0; + mov.u32 %r34, %r33; + tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r28,%r30,%r32,%r34}]; + mov.f32 %f34, %f30; + setp.ge.u64 %p5, %rd31, %rd28; + @%p5 bra $Lt_1_37890; + cvt.rzi.ftz.s32.f32 %r35, %f29; + cvt.s64.s32 
%rd39, %r19; + mul.lo.s32 %r36, %r35, 11; + cvt.rn.f32.s32 %f35, %r36; + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 +$Lt_1_27650: + //<loop> Loop body line 348, nesting depth: 1, estimated iterations: unknown + .loc 16 353 0 + ld.global.s32 %r37, [%rd31+0]; + .loc 16 356 0 + shr.s32 %r38, %r37, 30; + and.b32 %r39, %r38, 3; + cvt.s64.s32 %rd40, %r39; + mul.wide.s32 %rd41, %r39, 4; + add.u64 %rd42, %rd1, %rd41; + ld.shared.f32 %f41, [%rd42+0]; + .loc 16 357 0 + mov.f32 %f42, 0f3f800000; // 1 + ld.shared.f32 %f43, [%rd42+16]; + sub.ftz.f32 %f44, %f42, %f43; + .loc 16 360 0 + and.b32 %r40, %r37, 1073741823; + mov.u32 %r41, %r40; + mov.s32 %r42, 0; + mov.u32 %r43, %r42; + mov.s32 %r44, 0; + mov.u32 %r45, %r44; + mov.s32 %r46, 0; + mov.u32 %r47, %r46; + tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r41,%r43,%r45,%r47}]; + mov.f32 %f49, %f45; + mov.f32 %f50, %f46; + mov.f32 %f51, %f47; + mov.f32 %f52, %f48; + sub.ftz.f32 %f53, %f27, %f50; + sub.ftz.f32 %f54, %f26, %f49; + sub.ftz.f32 %f55, %f28, %f51; + mul.ftz.f32 %f56, %f53, %f53; + fma.rn.ftz.f32 %f57, %f54, %f54, %f56; + fma.rn.ftz.f32 %f58, %f55, %f55, %f57; + add.ftz.f32 %f59, %f35, %f52; + cvt.rzi.ftz.s32.f32 %r48, %f59; + cvt.s64.s32 %rd43, %r48; + mul.wide.s32 %rd44, %r48, 16; + add.u64 %rd45, %rd44, %rd7; + ld.shared.f32 %f60, [%rd45+0]; + setp.gt.ftz.f32 %p6, %f60, %f58; + @!%p6 bra $Lt_1_31490; + rcp.approx.ftz.f32 %f61, %f58; + ld.shared.f32 %f62, [%rd45+4]; + setp.lt.ftz.f32 %p7, %f58, %f62; + @!%p7 bra $Lt_1_28674; + add.u64 %rd46, %rd44, %rd8; + ld.shared.f32 %f63, [%rd46+0]; + mov.f32 %f64, 0f40000000; // 2 + setp.eq.ftz.f32 %p8, %f63, %f64; + @!%p8 bra $Lt_1_29186; + .loc 16 375 0 + mul.ftz.f32 %f65, %f61, %f61; + mov.f32 %f66, %f65; + mov.f32 %f67, %f66; + .loc 16 376 0 + mul.ftz.f32 %f68, %f65, %f65; + mov.f32 %f69, %f68; + bra.uni $Lt_1_29442; +$Lt_1_29186: + mov.f32 %f70, 0f3f800000; // 1 + setp.eq.ftz.f32 %p9, %f63, %f70; + @!%p9 bra $Lt_1_29698; + .loc 16 378 0 + sqrt.approx.ftz.f32 %f71, %f61; + mul.ftz.f32 %f72, %f61, %f71; + mov.f32 %f68, %f72; + mov.f32 %f69, %f68; + .loc 16 379 0 + mul.ftz.f32 %f66, %f72, %f72; + mov.f32 %f67, %f66; + bra.uni $Lt_1_29442; +$Lt_1_29698: + .loc 16 381 0 + mul.ftz.f32 %f73, %f61, %f61; + mul.ftz.f32 %f74, %f61, %f73; + mov.f32 %f66, %f74; + mov.f32 %f67, %f66; + .loc 16 382 0 + mov.f32 %f68, %f74; + mov.f32 %f69, %f68; +$Lt_1_29442: +$Lt_1_28930: + .loc 16 384 0 + mul.ftz.f32 %f75, %f41, %f66; + ld.shared.v2.f32 {%f76,%f77}, [%rd45+8]; + mul.ftz.f32 %f78, %f76, %f68; + sub.ftz.f32 %f79, %f78, %f77; + mul.ftz.f32 %f80, %f75, %f79; + bra.uni $Lt_1_28418; +$Lt_1_28674: + .loc 16 386 0 + mov.f32 %f80, 0f00000000; // 0 +$Lt_1_28418: + ld.param.f32 %f81, [__cudaparm_kernel_pair_fast_cut_coulsq]; + setp.gt.ftz.f32 %p10, %f81, %f58; + @!%p10 bra $Lt_1_30210; + .loc 16 393 0 + sqrt.approx.ftz.f32 %f82, %f58; + ld.param.f32 %f83, [__cudaparm_kernel_pair_fast_g_ewald]; + mul.ftz.f32 %f84, %f83, %f82; + mul.ftz.f32 %f85, %f84, %f84; + mov.f32 %f86, 0f3f800000; // 1 + mov.f32 %f87, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f88, %f87, %f84, %f86; + neg.ftz.f32 %f89, %f85; + rcp.approx.ftz.f32 %f90, %f88; + mov.f32 %f91, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f92, %f89, %f91; + ex2.approx.ftz.f32 %f93, %f92; + mov.f32 %f94, 0f3e827906; // 0.25483 + mov.f32 %f95, 0fbe91a98e; // -0.284497 + mov.f32 %f96, 0f3fb5f0e3; // 1.42141 + mov.f32 %f97, 0fbfba00e3; // -1.45315 + mov.f32 %f98, 
0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f99, %f98, %f90, %f97; + fma.rn.ftz.f32 %f100, %f90, %f99, %f96; + fma.rn.ftz.f32 %f101, %f90, %f100, %f95; + fma.rn.ftz.f32 %f102, %f90, %f101, %f94; + mul.ftz.f32 %f103, %f90, %f102; + mul.ftz.f32 %f104, %f93, %f103; + mov.f32 %f105, %f104; + .loc 16 394 0 + mov.u32 %r49, %r40; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f106,%f107,%f108,%f109},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f110, %f106; + ld.param.f32 %f111, [__cudaparm_kernel_pair_fast_qqrd2e]; + mul.ftz.f32 %f112, %f111, %f34; + mul.ftz.f32 %f113, %f112, %f110; + div.approx.ftz.f32 %f114, %f113, %f82; + mov.f32 %f115, %f114; + .loc 16 395 0 + mov.f32 %f116, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f117, %f84, %f116; + fma.rn.ftz.f32 %f118, %f93, %f117, %f104; + sub.ftz.f32 %f119, %f118, %f44; + mul.ftz.f32 %f120, %f114, %f119; + bra.uni $Lt_1_29954; +$Lt_1_30210: + .loc 16 397 0 + mov.f32 %f120, 0f00000000; // 0 +$Lt_1_29954: + .loc 16 401 0 + add.ftz.f32 %f121, %f120, %f80; + mul.ftz.f32 %f122, %f121, %f61; + fma.rn.ftz.f32 %f38, %f54, %f122, %f38; + .loc 16 402 0 + fma.rn.ftz.f32 %f37, %f53, %f122, %f37; + .loc 16 403 0 + fma.rn.ftz.f32 %f36, %f55, %f122, %f36; + ld.param.s32 %r56, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r57, 0; + setp.le.s32 %p11, %r56, %r57; + @%p11 bra $Lt_1_30978; + .loc 16 406 0 + mov.f32 %f123, %f115; + mov.f32 %f124, %f105; + sub.ftz.f32 %f125, %f124, %f44; + fma.rn.ftz.f32 %f126, %f123, %f125, %f39; + selp.f32 %f39, %f126, %f39, %p10; + @!%p7 bra $Lt_1_30978; + .loc 16 409 0 + add.u64 %rd47, %rd44, %rd8; + ld.shared.v4.f32 {_,%f127,%f128,%f129}, [%rd47+0]; + mov.f32 %f130, %f67; + mul.ftz.f32 %f131, %f130, %f41; + mov.f32 %f132, %f69; + mul.ftz.f32 %f133, %f127, %f132; + sub.ftz.f32 %f134, %f133, %f128; + mul.ftz.f32 %f135, %f131, %f134; + sub.ftz.f32 %f136, %f135, %f129; + add.ftz.f32 %f40, %f40, %f136; +$Lt_1_30978: +$Lt_1_30466: + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p12, %r58, %r59; + @%p12 bra $Lt_1_31490; + .loc 16 414 0 + mov.f32 %f137, %f11; + mul.ftz.f32 %f138, %f54, %f54; + fma.rn.ftz.f32 %f139, %f122, %f138, %f137; + mov.f32 %f11, %f139; + .loc 16 415 0 + mov.f32 %f140, %f13; + fma.rn.ftz.f32 %f141, %f122, %f56, %f140; + mov.f32 %f13, %f141; + .loc 16 416 0 + mov.f32 %f142, %f15; + mul.ftz.f32 %f143, %f55, %f55; + fma.rn.ftz.f32 %f144, %f122, %f143, %f142; + mov.f32 %f15, %f144; + .loc 16 417 0 + mov.f32 %f145, %f17; + mul.ftz.f32 %f146, %f53, %f54; + fma.rn.ftz.f32 %f147, %f122, %f146, %f145; + mov.f32 %f17, %f147; + .loc 16 418 0 + mov.f32 %f148, %f19; + mul.ftz.f32 %f149, %f54, %f55; + fma.rn.ftz.f32 %f150, %f122, %f149, %f148; + mov.f32 %f19, %f150; + .loc 16 419 0 + mul.ftz.f32 %f151, %f53, %f55; + fma.rn.ftz.f32 %f20, %f122, %f151, %f20; + mov.f32 %f21, %f20; +$Lt_1_31490: +$Lt_1_27906: + .loc 16 352 0 + mul.lo.u64 %rd48, %rd39, 4; + add.u64 %rd31, %rd31, %rd48; + setp.lt.u64 %p13, %rd31, %rd28; + @%p13 bra $Lt_1_27650; + bra.uni $Lt_1_26114; +$Lt_1_37890: + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 + bra.uni $Lt_1_26114; +$Lt_1_26370: + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 +$Lt_1_26114: + mov.u32 %r60, 1; + setp.le.s32 %p14, %r4, %r60; + 
@%p14 bra $Lt_1_34306; + .loc 16 430 0 + mov.u64 %rd49, __cuda___cuda_local_var_32812_35_non_const_red_acc7232; + cvt.s64.s32 %rd50, %r1; + mul.wide.s32 %rd51, %r1, 4; + add.u64 %rd52, %rd49, %rd51; + mov.f32 %f152, %f38; + st.shared.f32 [%rd52+0], %f152; + .loc 16 431 0 + mov.f32 %f153, %f37; + st.shared.f32 [%rd52+512], %f153; + .loc 16 432 0 + mov.f32 %f154, %f36; + st.shared.f32 [%rd52+1024], %f154; + .loc 16 433 0 + mov.f32 %f155, %f40; + st.shared.f32 [%rd52+1536], %f155; + .loc 16 434 0 + mov.f32 %f156, %f39; + st.shared.f32 [%rd52+2048], %f156; + .loc 16 436 0 + shr.s32 %r61, %r4, 31; + mov.s32 %r62, 1; + and.b32 %r63, %r61, %r62; + add.s32 %r64, %r63, %r4; + shr.s32 %r65, %r64, 1; + mov.s32 %r66, %r65; + mov.u32 %r67, 0; + setp.ne.u32 %p15, %r65, %r67; + @!%p15 bra $Lt_1_32770; +$Lt_1_33282: + setp.ge.u32 %p16, %r8, %r66; + @%p16 bra $Lt_1_33538; + .loc 16 439 0 + add.u32 %r68, %r1, %r66; + cvt.u64.u32 %rd53, %r68; + mul.wide.u32 %rd54, %r68, 4; + add.u64 %rd55, %rd49, %rd54; + ld.shared.f32 %f157, [%rd55+0]; + add.ftz.f32 %f152, %f157, %f152; + st.shared.f32 [%rd52+0], %f152; + ld.shared.f32 %f158, [%rd55+512]; + add.ftz.f32 %f153, %f158, %f153; + st.shared.f32 [%rd52+512], %f153; + ld.shared.f32 %f159, [%rd55+1024]; + add.ftz.f32 %f154, %f159, %f154; + st.shared.f32 [%rd52+1024], %f154; + ld.shared.f32 %f160, [%rd55+1536]; + add.ftz.f32 %f155, %f160, %f155; + st.shared.f32 [%rd52+1536], %f155; + ld.shared.f32 %f161, [%rd55+2048]; + add.ftz.f32 %f156, %f161, %f156; + st.shared.f32 [%rd52+2048], %f156; +$Lt_1_33538: + .loc 16 436 0 + shr.u32 %r66, %r66, 1; + mov.u32 %r69, 0; + setp.ne.u32 %p17, %r66, %r69; + @%p17 bra $Lt_1_33282; +$Lt_1_32770: + .loc 16 443 0 + mov.f32 %f38, %f152; + .loc 16 444 0 + mov.f32 %f37, %f153; + .loc 16 445 0 + mov.f32 %f36, %f154; + .loc 16 446 0 + mov.f32 %f40, %f155; + .loc 16 447 0 + mov.f32 %f39, %f156; + ld.param.s32 %r70, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r71, 0; + setp.le.s32 %p18, %r70, %r71; + @%p18 bra $Lt_1_34306; + .loc 16 451 0 + mov.f32 %f152, %f11; + st.shared.f32 [%rd52+0], %f152; + mov.f32 %f153, %f13; + st.shared.f32 [%rd52+512], %f153; + mov.f32 %f154, %f15; + st.shared.f32 [%rd52+1024], %f154; + mov.f32 %f155, %f17; + st.shared.f32 [%rd52+1536], %f155; + mov.f32 %f156, %f19; + st.shared.f32 [%rd52+2048], %f156; + mov.f32 %f162, %f21; + st.shared.f32 [%rd52+2560], %f162; + .loc 16 453 0 + mov.s32 %r72, %r65; + @!%p15 bra $Lt_1_34818; +$Lt_1_35330: + setp.ge.u32 %p19, %r8, %r72; + @%p19 bra $Lt_1_35586; + .loc 16 456 0 + add.u32 %r73, %r1, %r72; + cvt.u64.u32 %rd56, %r73; + mul.wide.u32 %rd57, %r73, 4; + add.u64 %rd58, %rd49, %rd57; + ld.shared.f32 %f163, [%rd58+0]; + add.ftz.f32 %f152, %f163, %f152; + st.shared.f32 [%rd52+0], %f152; + ld.shared.f32 %f164, [%rd58+512]; + add.ftz.f32 %f153, %f164, %f153; + st.shared.f32 [%rd52+512], %f153; + ld.shared.f32 %f165, [%rd58+1024]; + add.ftz.f32 %f154, %f165, %f154; + st.shared.f32 [%rd52+1024], %f154; + ld.shared.f32 %f166, [%rd58+1536]; + add.ftz.f32 %f155, %f166, %f155; + st.shared.f32 [%rd52+1536], %f155; + ld.shared.f32 %f167, [%rd58+2048]; + add.ftz.f32 %f156, %f167, %f156; + st.shared.f32 [%rd52+2048], %f156; + ld.shared.f32 %f168, [%rd58+2560]; + add.ftz.f32 %f162, %f168, %f162; + st.shared.f32 [%rd52+2560], %f162; +$Lt_1_35586: + .loc 16 453 0 + shr.u32 %r72, %r72, 1; + mov.u32 %r74, 0; + setp.ne.u32 %p20, %r72, %r74; + @%p20 bra $Lt_1_35330; +$Lt_1_34818: + .loc 16 461 0 + mov.f32 %f11, %f152; + mov.f32 %f13, %f153; + mov.f32 %f15, %f154; + mov.f32 %f17, %f155; + 
mov.f32 %f19, %f156; + mov.f32 %f21, %f162; +$Lt_1_34306: +$Lt_1_32258: + selp.s32 %r75, 1, 0, %p3; + mov.s32 %r76, 0; + set.eq.u32.s32 %r77, %r8, %r76; + neg.s32 %r78, %r77; + and.b32 %r79, %r75, %r78; + mov.u32 %r80, 0; + setp.eq.s32 %p21, %r79, %r80; + @%p21 bra $Lt_1_36354; + .loc 16 467 0 + cvt.s64.s32 %rd59, %r11; + ld.param.u64 %rd60, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd61, %r11, 4; + add.u64 %rd62, %rd60, %rd61; + ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r82, 0; + setp.le.s32 %p22, %r81, %r82; + @%p22 bra $Lt_1_36866; + .loc 16 469 0 + st.global.f32 [%rd62+0], %f40; + .loc 16 470 0 + cvt.s64.s32 %rd63, %r12; + mul.wide.s32 %rd64, %r12, 4; + add.u64 %rd65, %rd64, %rd62; + .loc 16 471 0 + st.global.f32 [%rd65+0], %f39; + .loc 16 472 0 + add.u64 %rd62, %rd64, %rd65; +$Lt_1_36866: + ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r84, 0; + setp.le.s32 %p23, %r83, %r84; + @%p23 bra $Lt_1_37378; + .loc 16 476 0 + mov.f32 %f169, %f11; + st.global.f32 [%rd62+0], %f169; + .loc 16 477 0 + cvt.s64.s32 %rd66, %r12; + mul.wide.s32 %rd67, %r12, 4; + add.u64 %rd68, %rd67, %rd62; + .loc 16 476 0 + mov.f32 %f170, %f13; + st.global.f32 [%rd68+0], %f170; + .loc 16 477 0 + add.u64 %rd69, %rd67, %rd68; + .loc 16 476 0 + mov.f32 %f171, %f15; + st.global.f32 [%rd69+0], %f171; + .loc 16 477 0 + add.u64 %rd70, %rd67, %rd69; + .loc 16 476 0 + mov.f32 %f172, %f17; + st.global.f32 [%rd70+0], %f172; + .loc 16 477 0 + add.u64 %rd62, %rd67, %rd70; + .loc 16 476 0 + mov.f32 %f173, %f19; + st.global.f32 [%rd62+0], %f173; + mov.f32 %f174, %f21; + add.u64 %rd71, %rd67, %rd62; + st.global.f32 [%rd71+0], %f174; +$Lt_1_37378: + .loc 16 480 0 + ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd73, %rd59, 16; + add.u64 %rd74, %rd72, %rd73; + mov.f32 %f175, %f176; + st.global.v4.f32 [%rd74+0], {%f38,%f37,%f36,%f175}; +$Lt_1_36354: + .loc 16 482 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/cmmc_long_gpu_ptx.h b/lib/gpu/cmmc_long_gpu_ptx.h new file mode 100644 index 000000000..253b4aa05 --- /dev/null +++ b/lib/gpu/cmmc_long_gpu_ptx.h @@ -0,0 +1,1217 @@ +const char * cmmc_long_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_q_,\n" +" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" +" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<86>;\n" +" .reg .u64 %rd<65>;\n" +" .reg .f32 %f<175>;\n" +" .reg .pred %p<23>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32619_35_non_const_red_acc144[3072];\n" +" .loc 16 108 0\n" 
+"$LDWbegin_kernel_pair:\n" +" .loc 16 115 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 116 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 117 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 118 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 119 0\n" +" ld.global.f32 %f5, [%rd1+16];\n" +" .loc 16 120 0\n" +" ld.global.f32 %f6, [%rd1+20];\n" +" .loc 16 121 0\n" +" ld.global.f32 %f7, [%rd1+24];\n" +" .loc 16 122 0\n" +" ld.global.f32 %f8, [%rd1+28];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" +" .loc 16 132 0\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" mov.f32 %f17, 0f00000000; \n" +" mov.f32 %f18, %f17;\n" +" mov.f32 %f19, 0f00000000; \n" +" mov.f32 %f20, %f19;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_23810;\n" +" .loc 16 136 0\n" +" cvt.s64.s32 %rd2, %r9;\n" +" mul.wide.s32 %rd3, %r9, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd5, %rd3, %rd4;\n" +" ld.global.s32 %r11, [%rd5+0];\n" +" .loc 16 138 0\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd6, %r12;\n" +" mul.wide.s32 %rd7, %r12, 4;\n" +" add.u64 %rd8, %rd7, %rd5;\n" +" ld.global.s32 %r13, [%rd8+0];\n" +" add.u64 %rd9, %rd7, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd4;\n" +" @%p2 bra $Lt_0_24322;\n" +" .loc 16 144 0\n" +" cvt.s32.s64 %r14, %rd6;\n" +" mul.lo.s32 %r15, %r14, %r13;\n" +" cvt.s64.s32 %rd11, %r15;\n" +" mul.wide.s32 %rd12, %r15, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 145 0\n" +" mul.lo.s32 %r16, %r6, %r14;\n" +" cvt.s64.s32 %rd14, %r16;\n" +" mul.wide.s32 %rd15, %r16, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 146 0\n" +" mul.lo.s32 %r17, %r14, %r1;\n" +" bra.uni $Lt_0_24066;\n" +"$Lt_0_24322:\n" +" .loc 16 148 0\n" +" ld.global.s32 %r18, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r18;\n" +" mul.wide.s32 %rd18, %r18, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 149 0\n" +" cvt.s64.s32 %rd20, %r13;\n" +" mul.wide.s32 %rd21, %r13, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 150 0\n" +" mov.s32 %r17, %r1;\n" +" .loc 16 151 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_24066:\n" +" .loc 16 154 0\n" +" mov.u32 %r19, %r11;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f25, %f21;\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" .loc 16 155 0\n" +" mov.u32 %r26, %r11;\n" +" mov.s32 %r27, 0;\n" +" mov.u32 %r28, %r27;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" 
+" mov.u32 %r32, %r31;\n" +" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}];\n" +" mov.f32 %f33, %f29;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_35330;\n" +" cvt.rzi.ftz.s32.f32 %r33, %f28;\n" +" cvt.s64.s32 %rd24, %r17;\n" +" ld.param.s32 %r34, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r35, %r34, %r33;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112;\n" +"$Lt_0_25090:\n" +" .loc 16 159 0\n" +" ld.global.s32 %r36, [%rd16+0];\n" +" .loc 16 162 0\n" +" shr.s32 %r37, %r36, 30;\n" +" and.b32 %r38, %r37, 3;\n" +" cvt.s64.s32 %rd27, %r38;\n" +" mul.wide.s32 %rd28, %r38, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f39, [%rd29+0];\n" +" .loc 16 163 0\n" +" mov.f32 %f40, 0f3f800000; \n" +" ld.shared.f32 %f41, [%rd29+16];\n" +" sub.ftz.f32 %f42, %f40, %f41;\n" +" .loc 16 166 0\n" +" and.b32 %r39, %r36, 1073741823;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" mov.s32 %r43, 0;\n" +" mov.u32 %r44, %r43;\n" +" mov.s32 %r45, 0;\n" +" mov.u32 %r46, %r45;\n" +" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r40,%r42,%r44,%r46}];\n" +" mov.f32 %f47, %f43;\n" +" mov.f32 %f48, %f44;\n" +" mov.f32 %f49, %f45;\n" +" mov.f32 %f50, %f46;\n" +" cvt.rzi.ftz.s32.f32 %r47, %f50;\n" +" sub.ftz.f32 %f51, %f26, %f48;\n" +" sub.ftz.f32 %f52, %f25, %f47;\n" +" sub.ftz.f32 %f53, %f27, %f49;\n" +" mul.ftz.f32 %f54, %f51, %f51;\n" +" fma.rn.ftz.f32 %f55, %f52, %f52, %f54;\n" +" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" +" add.s32 %r48, %r47, %r35;\n" +" cvt.s64.s32 %rd30, %r48;\n" +" mul.wide.s32 %rd31, %r48, 16;\n" +" add.u64 %rd32, %rd31, %rd25;\n" +" ld.global.f32 %f57, [%rd32+0];\n" +" setp.gt.ftz.f32 %p4, %f57, %f56;\n" +" @!%p4 bra $Lt_0_28930;\n" +" rcp.approx.ftz.f32 %f58, %f56;\n" +" ld.global.f32 %f59, [%rd32+4];\n" +" setp.lt.ftz.f32 %p5, %f56, %f59;\n" +" @!%p5 bra $Lt_0_26114;\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd34, %rd33, %rd31;\n" +" ld.global.f32 %f60, [%rd34+0];\n" +" mov.f32 %f61, 0f40000000; \n" +" setp.eq.ftz.f32 %p6, %f60, %f61;\n" +" @!%p6 bra $Lt_0_26626;\n" +" .loc 16 182 0\n" +" mul.ftz.f32 %f62, %f58, %f58;\n" +" mov.f32 %f63, %f62;\n" +" mov.f32 %f64, %f63;\n" +" .loc 16 183 0\n" +" mul.ftz.f32 %f65, %f62, %f62;\n" +" mov.f32 %f66, %f65;\n" +" bra.uni $Lt_0_26882;\n" +"$Lt_0_26626:\n" +" mov.f32 %f67, 0f3f800000; \n" +" setp.eq.ftz.f32 %p7, %f60, %f67;\n" +" @!%p7 bra $Lt_0_27138;\n" +" .loc 16 185 0\n" +" sqrt.approx.ftz.f32 %f68, %f58;\n" +" mul.ftz.f32 %f69, %f58, %f68;\n" +" mov.f32 %f65, %f69;\n" +" mov.f32 %f66, %f65;\n" +" .loc 16 186 0\n" +" mul.ftz.f32 %f63, %f69, %f69;\n" +" mov.f32 %f64, %f63;\n" +" bra.uni $Lt_0_26882;\n" +"$Lt_0_27138:\n" +" .loc 16 188 0\n" +" mul.ftz.f32 %f70, %f58, %f58;\n" +" mul.ftz.f32 %f71, %f58, %f70;\n" +" mov.f32 %f63, %f71;\n" +" mov.f32 %f64, %f63;\n" +" .loc 16 189 0\n" +" mov.f32 %f65, %f71;\n" +" mov.f32 %f66, %f65;\n" +"$Lt_0_26882:\n" +"$Lt_0_26370:\n" +" .loc 16 191 0\n" +" mul.ftz.f32 %f72, %f39, %f63;\n" +" ld.global.v2.f32 {%f73,%f74}, [%rd32+8];\n" +" mul.ftz.f32 %f75, %f73, %f65;\n" +" sub.ftz.f32 %f76, %f75, %f74;\n" +" mul.ftz.f32 %f77, %f72, %f76;\n" +" bra.uni $Lt_0_25858;\n" +"$Lt_0_26114:\n" +" .loc 16 193 0\n" +" mov.f32 %f77, 0f00000000; \n" 
+"$Lt_0_25858:\n" +" ld.param.f32 %f78, [__cudaparm_kernel_pair_cut_coulsq];\n" +" setp.gt.ftz.f32 %p8, %f78, %f56;\n" +" @!%p8 bra $Lt_0_27650;\n" +" .loc 16 200 0\n" +" sqrt.approx.ftz.f32 %f79, %f56;\n" +" ld.param.f32 %f80, [__cudaparm_kernel_pair_g_ewald];\n" +" mul.ftz.f32 %f81, %f80, %f79;\n" +" mul.ftz.f32 %f82, %f81, %f81;\n" +" mov.f32 %f83, 0f3f800000; \n" +" mov.f32 %f84, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f85, %f84, %f81, %f83;\n" +" neg.ftz.f32 %f86, %f82;\n" +" rcp.approx.ftz.f32 %f87, %f85;\n" +" mov.f32 %f88, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f89, %f86, %f88;\n" +" ex2.approx.ftz.f32 %f90, %f89;\n" +" mov.f32 %f91, 0f3e827906; \n" +" mov.f32 %f92, 0fbe91a98e; \n" +" mov.f32 %f93, 0f3fb5f0e3; \n" +" mov.f32 %f94, 0fbfba00e3; \n" +" mov.f32 %f95, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f96, %f95, %f87, %f94;\n" +" fma.rn.ftz.f32 %f97, %f87, %f96, %f93;\n" +" fma.rn.ftz.f32 %f98, %f87, %f97, %f92;\n" +" fma.rn.ftz.f32 %f99, %f87, %f98, %f91;\n" +" mul.ftz.f32 %f100, %f87, %f99;\n" +" mul.ftz.f32 %f101, %f90, %f100;\n" +" mov.f32 %f102, %f101;\n" +" .loc 16 201 0\n" +" mov.u32 %r49, %r39;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" tex.1d.v4.f32.s32 {%f103,%f104,%f105,%f106},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f107, %f103;\n" +" ld.param.f32 %f108, [__cudaparm_kernel_pair_qqrd2e];\n" +" mul.ftz.f32 %f109, %f108, %f33;\n" +" mul.ftz.f32 %f110, %f109, %f107;\n" +" div.approx.ftz.f32 %f111, %f110, %f79;\n" +" mov.f32 %f112, %f111;\n" +" .loc 16 202 0\n" +" mov.f32 %f113, 0f3f906ebb; \n" +" mul.ftz.f32 %f114, %f81, %f113;\n" +" fma.rn.ftz.f32 %f115, %f90, %f114, %f101;\n" +" sub.ftz.f32 %f116, %f115, %f42;\n" +" mul.ftz.f32 %f117, %f111, %f116;\n" +" bra.uni $Lt_0_27394;\n" +"$Lt_0_27650:\n" +" .loc 16 204 0\n" +" mov.f32 %f117, 0f00000000; \n" +"$Lt_0_27394:\n" +" .loc 16 208 0\n" +" add.ftz.f32 %f118, %f117, %f77;\n" +" mul.ftz.f32 %f119, %f118, %f58;\n" +" fma.rn.ftz.f32 %f36, %f52, %f119, %f36;\n" +" .loc 16 209 0\n" +" fma.rn.ftz.f32 %f35, %f51, %f119, %f35;\n" +" .loc 16 210 0\n" +" fma.rn.ftz.f32 %f34, %f53, %f119, %f34;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p9, %r56, %r57;\n" +" @%p9 bra $Lt_0_28418;\n" +" .loc 16 213 0\n" +" mov.f32 %f120, %f112;\n" +" mov.f32 %f121, %f102;\n" +" sub.ftz.f32 %f122, %f121, %f42;\n" +" fma.rn.ftz.f32 %f123, %f120, %f122, %f37;\n" +" selp.f32 %f37, %f123, %f37, %p8;\n" +" @!%p5 bra $Lt_0_28418;\n" +" .loc 16 216 0\n" +" ld.param.u64 %rd35, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd36, %rd35, %rd31;\n" +" ld.global.v4.f32 {_,%f124,%f125,%f126}, [%rd36+0];\n" +" mov.f32 %f127, %f64;\n" +" mul.ftz.f32 %f128, %f127, %f39;\n" +" mov.f32 %f129, %f66;\n" +" mul.ftz.f32 %f130, %f124, %f129;\n" +" sub.ftz.f32 %f131, %f130, %f125;\n" +" mul.ftz.f32 %f132, %f128, %f131;\n" +" sub.ftz.f32 %f133, %f132, %f126;\n" +" add.ftz.f32 %f38, %f38, %f133;\n" +"$Lt_0_28418:\n" +"$Lt_0_27906:\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p10, %r58, %r59;\n" +" @%p10 bra $Lt_0_28930;\n" +" .loc 16 221 0\n" +" mov.f32 %f134, %f10;\n" +" mul.ftz.f32 %f135, %f52, %f52;\n" +" fma.rn.ftz.f32 %f136, %f119, %f135, %f134;\n" +" mov.f32 %f10, %f136;\n" +" .loc 16 222 0\n" +" mov.f32 %f137, %f12;\n" +" fma.rn.ftz.f32 %f138, %f119, %f54, %f137;\n" +" mov.f32 %f12, %f138;\n" +" .loc 16 223 0\n" +" mov.f32 %f139, %f14;\n" +" mul.ftz.f32 %f140, %f53, 
%f53;\n" +" fma.rn.ftz.f32 %f141, %f119, %f140, %f139;\n" +" mov.f32 %f14, %f141;\n" +" .loc 16 224 0\n" +" mov.f32 %f142, %f16;\n" +" mul.ftz.f32 %f143, %f51, %f52;\n" +" fma.rn.ftz.f32 %f144, %f119, %f143, %f142;\n" +" mov.f32 %f16, %f144;\n" +" .loc 16 225 0\n" +" mov.f32 %f145, %f18;\n" +" mul.ftz.f32 %f146, %f52, %f53;\n" +" fma.rn.ftz.f32 %f147, %f119, %f146, %f145;\n" +" mov.f32 %f18, %f147;\n" +" .loc 16 226 0\n" +" mul.ftz.f32 %f148, %f51, %f53;\n" +" fma.rn.ftz.f32 %f19, %f119, %f148, %f19;\n" +" mov.f32 %f20, %f19;\n" +"$Lt_0_28930:\n" +"$Lt_0_25346:\n" +" .loc 16 158 0\n" +" mul.lo.u64 %rd37, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd37;\n" +" setp.lt.u64 %p11, %rd16, %rd13;\n" +" @%p11 bra $Lt_0_25090;\n" +" bra.uni $Lt_0_23554;\n" +"$Lt_0_35330:\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" bra.uni $Lt_0_23554;\n" +"$Lt_0_23810:\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +"$Lt_0_23554:\n" +" mov.u32 %r60, 1;\n" +" setp.le.s32 %p12, %r1, %r60;\n" +" @%p12 bra $Lt_0_31746;\n" +" .loc 16 237 0\n" +" mov.u64 %rd38, __cuda___cuda_local_var_32619_35_non_const_red_acc144;\n" +" cvt.s64.s32 %rd39, %r2;\n" +" mul.wide.s32 %rd40, %r2, 4;\n" +" add.u64 %rd41, %rd38, %rd40;\n" +" mov.f32 %f149, %f36;\n" +" st.shared.f32 [%rd41+0], %f149;\n" +" .loc 16 238 0\n" +" mov.f32 %f150, %f35;\n" +" st.shared.f32 [%rd41+512], %f150;\n" +" .loc 16 239 0\n" +" mov.f32 %f151, %f34;\n" +" st.shared.f32 [%rd41+1024], %f151;\n" +" .loc 16 240 0\n" +" mov.f32 %f152, %f38;\n" +" st.shared.f32 [%rd41+1536], %f152;\n" +" .loc 16 241 0\n" +" mov.f32 %f153, %f37;\n" +" st.shared.f32 [%rd41+2048], %f153;\n" +" .loc 16 243 0\n" +" shr.s32 %r61, %r1, 31;\n" +" mov.s32 %r62, 1;\n" +" and.b32 %r63, %r61, %r62;\n" +" add.s32 %r64, %r63, %r1;\n" +" shr.s32 %r65, %r64, 1;\n" +" mov.s32 %r66, %r65;\n" +" mov.u32 %r67, 0;\n" +" setp.ne.u32 %p13, %r65, %r67;\n" +" @!%p13 bra $Lt_0_30210;\n" +"$Lt_0_30722:\n" +" setp.ge.u32 %p14, %r6, %r66;\n" +" @%p14 bra $Lt_0_30978;\n" +" .loc 16 246 0\n" +" add.u32 %r68, %r2, %r66;\n" +" cvt.u64.u32 %rd42, %r68;\n" +" mul.wide.u32 %rd43, %r68, 4;\n" +" add.u64 %rd44, %rd38, %rd43;\n" +" ld.shared.f32 %f154, [%rd44+0];\n" +" add.ftz.f32 %f149, %f154, %f149;\n" +" st.shared.f32 [%rd41+0], %f149;\n" +" ld.shared.f32 %f155, [%rd44+512];\n" +" add.ftz.f32 %f150, %f155, %f150;\n" +" st.shared.f32 [%rd41+512], %f150;\n" +" ld.shared.f32 %f156, [%rd44+1024];\n" +" add.ftz.f32 %f151, %f156, %f151;\n" +" st.shared.f32 [%rd41+1024], %f151;\n" +" ld.shared.f32 %f157, [%rd44+1536];\n" +" add.ftz.f32 %f152, %f157, %f152;\n" +" st.shared.f32 [%rd41+1536], %f152;\n" +" ld.shared.f32 %f158, [%rd44+2048];\n" +" add.ftz.f32 %f153, %f158, %f153;\n" +" st.shared.f32 [%rd41+2048], %f153;\n" +"$Lt_0_30978:\n" +" .loc 16 243 0\n" +" shr.u32 %r66, %r66, 1;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p15, %r66, %r69;\n" +" @%p15 bra $Lt_0_30722;\n" +"$Lt_0_30210:\n" +" .loc 16 250 0\n" +" mov.f32 %f36, %f149;\n" +" .loc 16 251 0\n" +" mov.f32 %f35, %f150;\n" +" .loc 16 252 0\n" +" mov.f32 %f34, %f151;\n" +" .loc 16 253 0\n" +" mov.f32 %f38, %f152;\n" +" .loc 16 254 0\n" +" mov.f32 %f37, %f153;\n" +" ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r71, 0;\n" +" setp.le.s32 %p16, %r70, %r71;\n" +" @%p16 bra $Lt_0_31746;\n" +" .loc 16 258 0\n" +" 
mov.f32 %f149, %f10;\n" +" st.shared.f32 [%rd41+0], %f149;\n" +" mov.f32 %f150, %f12;\n" +" st.shared.f32 [%rd41+512], %f150;\n" +" mov.f32 %f151, %f14;\n" +" st.shared.f32 [%rd41+1024], %f151;\n" +" mov.f32 %f152, %f16;\n" +" st.shared.f32 [%rd41+1536], %f152;\n" +" mov.f32 %f153, %f18;\n" +" st.shared.f32 [%rd41+2048], %f153;\n" +" mov.f32 %f159, %f20;\n" +" st.shared.f32 [%rd41+2560], %f159;\n" +" .loc 16 260 0\n" +" mov.s32 %r72, %r65;\n" +" @!%p13 bra $Lt_0_32258;\n" +"$Lt_0_32770:\n" +" setp.ge.u32 %p17, %r6, %r72;\n" +" @%p17 bra $Lt_0_33026;\n" +" .loc 16 263 0\n" +" add.u32 %r73, %r2, %r72;\n" +" cvt.u64.u32 %rd45, %r73;\n" +" mul.wide.u32 %rd46, %r73, 4;\n" +" add.u64 %rd47, %rd38, %rd46;\n" +" ld.shared.f32 %f160, [%rd47+0];\n" +" add.ftz.f32 %f149, %f160, %f149;\n" +" st.shared.f32 [%rd41+0], %f149;\n" +" ld.shared.f32 %f161, [%rd47+512];\n" +" add.ftz.f32 %f150, %f161, %f150;\n" +" st.shared.f32 [%rd41+512], %f150;\n" +" ld.shared.f32 %f162, [%rd47+1024];\n" +" add.ftz.f32 %f151, %f162, %f151;\n" +" st.shared.f32 [%rd41+1024], %f151;\n" +" ld.shared.f32 %f163, [%rd47+1536];\n" +" add.ftz.f32 %f152, %f163, %f152;\n" +" st.shared.f32 [%rd41+1536], %f152;\n" +" ld.shared.f32 %f164, [%rd47+2048];\n" +" add.ftz.f32 %f153, %f164, %f153;\n" +" st.shared.f32 [%rd41+2048], %f153;\n" +" ld.shared.f32 %f165, [%rd47+2560];\n" +" add.ftz.f32 %f159, %f165, %f159;\n" +" st.shared.f32 [%rd41+2560], %f159;\n" +"$Lt_0_33026:\n" +" .loc 16 260 0\n" +" shr.u32 %r72, %r72, 1;\n" +" mov.u32 %r74, 0;\n" +" setp.ne.u32 %p18, %r72, %r74;\n" +" @%p18 bra $Lt_0_32770;\n" +"$Lt_0_32258:\n" +" .loc 16 268 0\n" +" mov.f32 %f10, %f149;\n" +" mov.f32 %f12, %f150;\n" +" mov.f32 %f14, %f151;\n" +" mov.f32 %f16, %f152;\n" +" mov.f32 %f18, %f153;\n" +" mov.f32 %f20, %f159;\n" +"$Lt_0_31746:\n" +"$Lt_0_29698:\n" +" selp.s32 %r75, 1, 0, %p1;\n" +" mov.s32 %r76, 0;\n" +" set.eq.u32.s32 %r77, %r6, %r76;\n" +" neg.s32 %r78, %r77;\n" +" and.b32 %r79, %r75, %r78;\n" +" mov.u32 %r80, 0;\n" +" setp.eq.s32 %p19, %r79, %r80;\n" +" @%p19 bra $Lt_0_33794;\n" +" .loc 16 274 0\n" +" cvt.s64.s32 %rd48, %r9;\n" +" ld.param.u64 %rd49, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd50, %r9, 4;\n" +" add.u64 %rd51, %rd49, %rd50;\n" +" ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r82, 0;\n" +" setp.le.s32 %p20, %r81, %r82;\n" +" @%p20 bra $Lt_0_34306;\n" +" .loc 16 276 0\n" +" st.global.f32 [%rd51+0], %f38;\n" +" .loc 16 277 0\n" +" cvt.s64.s32 %rd52, %r10;\n" +" mul.wide.s32 %rd53, %r10, 4;\n" +" add.u64 %rd54, %rd53, %rd51;\n" +" .loc 16 278 0\n" +" st.global.f32 [%rd54+0], %f37;\n" +" .loc 16 279 0\n" +" add.u64 %rd51, %rd53, %rd54;\n" +"$Lt_0_34306:\n" +" ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r84, 0;\n" +" setp.le.s32 %p21, %r83, %r84;\n" +" @%p21 bra $Lt_0_34818;\n" +" .loc 16 283 0\n" +" mov.f32 %f166, %f10;\n" +" st.global.f32 [%rd51+0], %f166;\n" +" .loc 16 284 0\n" +" cvt.s64.s32 %rd55, %r10;\n" +" mul.wide.s32 %rd56, %r10, 4;\n" +" add.u64 %rd57, %rd56, %rd51;\n" +" .loc 16 283 0\n" +" mov.f32 %f167, %f12;\n" +" st.global.f32 [%rd57+0], %f167;\n" +" .loc 16 284 0\n" +" add.u64 %rd58, %rd56, %rd57;\n" +" .loc 16 283 0\n" +" mov.f32 %f168, %f14;\n" +" st.global.f32 [%rd58+0], %f168;\n" +" .loc 16 284 0\n" +" add.u64 %rd59, %rd56, %rd58;\n" +" .loc 16 283 0\n" +" mov.f32 %f169, %f16;\n" +" st.global.f32 [%rd59+0], %f169;\n" +" .loc 16 284 0\n" +" add.u64 %rd51, %rd56, %rd59;\n" +" .loc 16 283 0\n" +" mov.f32 %f170, %f18;\n" +" st.global.f32 [%rd51+0], %f170;\n" +" mov.f32 
%f171, %f20;\n" +" add.u64 %rd60, %rd56, %rd51;\n" +" st.global.f32 [%rd60+0], %f171;\n" +"$Lt_0_34818:\n" +" .loc 16 287 0\n" +" ld.param.u64 %rd61, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd62, %rd48, 16;\n" +" add.u64 %rd63, %rd61, %rd62;\n" +" mov.f32 %f172, %f173;\n" +" st.global.v4.f32 [%rd63+0], {%f36,%f35,%f34,%f172};\n" +"$Lt_0_33794:\n" +" .loc 16 289 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" +" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<86>;\n" +" .reg .u64 %rd<76>;\n" +" .reg .f32 %f<178>;\n" +" .reg .pred %p<25>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32691_33_non_const_sp_lj3320[32];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32689_34_non_const_lj13360[1936];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32690_34_non_const_lj35296[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32812_35_non_const_red_acc7232[3072];\n" +" .loc 16 299 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 7;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_25090;\n" +" .loc 16 309 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32691_33_non_const_sp_lj3320;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_25090:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32691_33_non_const_sp_lj3320;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_25602;\n" +" .loc 16 311 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32689_34_non_const_lj13360;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32690_34_non_const_lj35296;\n" +" cvt.s64.s32 %rd9, %r1;\n" +" mul.wide.s32 %rd10, %r1, 16;\n" +" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" add.u64 %rd12, %rd11, %rd10;\n" +" add.u64 %rd13, %rd10, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n" +" st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5};\n" +" .loc 16 312 0\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd15, %rd14, %rd10;\n" +" add.u64 %rd16, %rd10, %rd8;\n" +" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" +" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" +"$Lt_1_25602:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32689_34_non_const_lj13360;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32690_34_non_const_lj35296;\n" +" .loc 16 323 0\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" 
+" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" mov.f32 %f20, 0f00000000; \n" +" mov.f32 %f21, %f20;\n" +" .loc 16 325 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r5, %r1, %r4;\n" +" cvt.s32.u32 %r6, %ntid.x;\n" +" div.s32 %r7, %r6, %r4;\n" +" rem.s32 %r8, %r1, %r4;\n" +" cvt.s32.u32 %r9, %ctaid.x;\n" +" mul.lo.s32 %r10, %r9, %r7;\n" +" add.s32 %r11, %r5, %r10;\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p3, %r11, %r12;\n" +" @!%p3 bra $Lt_1_26370;\n" +" .loc 16 329 0\n" +" cvt.s64.s32 %rd17, %r11;\n" +" mul.wide.s32 %rd18, %r11, 4;\n" +" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd20, %rd18, %rd19;\n" +" ld.global.s32 %r13, [%rd20+0];\n" +" .loc 16 331 0\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd21, %r14;\n" +" mul.wide.s32 %rd22, %r14, 4;\n" +" add.u64 %rd23, %rd22, %rd20;\n" +" ld.global.s32 %r15, [%rd23+0];\n" +" add.u64 %rd24, %rd22, %rd23;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p4, %rd25, %rd19;\n" +" @%p4 bra $Lt_1_26882;\n" +" .loc 16 337 0\n" +" cvt.s32.s64 %r16, %rd21;\n" +" mul.lo.s32 %r17, %r16, %r15;\n" +" cvt.s64.s32 %rd26, %r17;\n" +" mul.wide.s32 %rd27, %r17, 4;\n" +" add.u64 %rd28, %rd24, %rd27;\n" +" .loc 16 338 0\n" +" mul.lo.s32 %r18, %r8, %r16;\n" +" cvt.s64.s32 %rd29, %r18;\n" +" mul.wide.s32 %rd30, %r18, 4;\n" +" add.u64 %rd31, %rd24, %rd30;\n" +" .loc 16 339 0\n" +" mul.lo.s32 %r19, %r16, %r4;\n" +" bra.uni $Lt_1_26626;\n" +"$Lt_1_26882:\n" +" .loc 16 341 0\n" +" ld.global.s32 %r20, [%rd24+0];\n" +" cvt.s64.s32 %rd32, %r20;\n" +" mul.wide.s32 %rd33, %r20, 4;\n" +" add.u64 %rd34, %rd25, %rd33;\n" +" .loc 16 342 0\n" +" cvt.s64.s32 %rd35, %r15;\n" +" mul.wide.s32 %rd36, %r15, 4;\n" +" add.u64 %rd28, %rd34, %rd36;\n" +" .loc 16 343 0\n" +" mov.s32 %r19, %r4;\n" +" .loc 16 344 0\n" +" cvt.s64.s32 %rd37, %r8;\n" +" mul.wide.s32 %rd38, %r8, 4;\n" +" add.u64 %rd31, %rd34, %rd38;\n" +"$Lt_1_26626:\n" +" .loc 16 347 0\n" +" mov.u32 %r21, %r13;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r21,%r23,%r25,%r27}];\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" mov.f32 %f29, %f25;\n" +" .loc 16 348 0\n" +" mov.u32 %r28, %r13;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" mov.s32 %r33, 0;\n" +" mov.u32 %r34, %r33;\n" +" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r28,%r30,%r32,%r34}];\n" +" mov.f32 %f34, %f30;\n" +" setp.ge.u64 %p5, %rd31, %rd28;\n" +" @%p5 bra $Lt_1_37890;\n" +" cvt.rzi.ftz.s32.f32 %r35, %f29;\n" +" cvt.s64.s32 %rd39, %r19;\n" +" mul.lo.s32 %r36, %r35, 11;\n" +" cvt.rn.f32.s32 %f35, %r36;\n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +"$Lt_1_27650:\n" +" .loc 16 353 0\n" +" ld.global.s32 %r37, [%rd31+0];\n" +" .loc 16 356 0\n" +" shr.s32 %r38, %r37, 30;\n" +" and.b32 %r39, %r38, 3;\n" +" cvt.s64.s32 %rd40, %r39;\n" +" mul.wide.s32 %rd41, %r39, 4;\n" +" add.u64 %rd42, %rd1, %rd41;\n" +" ld.shared.f32 %f41, [%rd42+0];\n" +" .loc 16 357 0\n" +" 
mov.f32 %f42, 0f3f800000; \n" +" ld.shared.f32 %f43, [%rd42+16];\n" +" sub.ftz.f32 %f44, %f42, %f43;\n" +" .loc 16 360 0\n" +" and.b32 %r40, %r37, 1073741823;\n" +" mov.u32 %r41, %r40;\n" +" mov.s32 %r42, 0;\n" +" mov.u32 %r43, %r42;\n" +" mov.s32 %r44, 0;\n" +" mov.u32 %r45, %r44;\n" +" mov.s32 %r46, 0;\n" +" mov.u32 %r47, %r46;\n" +" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r41,%r43,%r45,%r47}];\n" +" mov.f32 %f49, %f45;\n" +" mov.f32 %f50, %f46;\n" +" mov.f32 %f51, %f47;\n" +" mov.f32 %f52, %f48;\n" +" sub.ftz.f32 %f53, %f27, %f50;\n" +" sub.ftz.f32 %f54, %f26, %f49;\n" +" sub.ftz.f32 %f55, %f28, %f51;\n" +" mul.ftz.f32 %f56, %f53, %f53;\n" +" fma.rn.ftz.f32 %f57, %f54, %f54, %f56;\n" +" fma.rn.ftz.f32 %f58, %f55, %f55, %f57;\n" +" add.ftz.f32 %f59, %f35, %f52;\n" +" cvt.rzi.ftz.s32.f32 %r48, %f59;\n" +" cvt.s64.s32 %rd43, %r48;\n" +" mul.wide.s32 %rd44, %r48, 16;\n" +" add.u64 %rd45, %rd44, %rd7;\n" +" ld.shared.f32 %f60, [%rd45+0];\n" +" setp.gt.ftz.f32 %p6, %f60, %f58;\n" +" @!%p6 bra $Lt_1_31490;\n" +" rcp.approx.ftz.f32 %f61, %f58;\n" +" ld.shared.f32 %f62, [%rd45+4];\n" +" setp.lt.ftz.f32 %p7, %f58, %f62;\n" +" @!%p7 bra $Lt_1_28674;\n" +" add.u64 %rd46, %rd44, %rd8;\n" +" ld.shared.f32 %f63, [%rd46+0];\n" +" mov.f32 %f64, 0f40000000; \n" +" setp.eq.ftz.f32 %p8, %f63, %f64;\n" +" @!%p8 bra $Lt_1_29186;\n" +" .loc 16 375 0\n" +" mul.ftz.f32 %f65, %f61, %f61;\n" +" mov.f32 %f66, %f65;\n" +" mov.f32 %f67, %f66;\n" +" .loc 16 376 0\n" +" mul.ftz.f32 %f68, %f65, %f65;\n" +" mov.f32 %f69, %f68;\n" +" bra.uni $Lt_1_29442;\n" +"$Lt_1_29186:\n" +" mov.f32 %f70, 0f3f800000; \n" +" setp.eq.ftz.f32 %p9, %f63, %f70;\n" +" @!%p9 bra $Lt_1_29698;\n" +" .loc 16 378 0\n" +" sqrt.approx.ftz.f32 %f71, %f61;\n" +" mul.ftz.f32 %f72, %f61, %f71;\n" +" mov.f32 %f68, %f72;\n" +" mov.f32 %f69, %f68;\n" +" .loc 16 379 0\n" +" mul.ftz.f32 %f66, %f72, %f72;\n" +" mov.f32 %f67, %f66;\n" +" bra.uni $Lt_1_29442;\n" +"$Lt_1_29698:\n" +" .loc 16 381 0\n" +" mul.ftz.f32 %f73, %f61, %f61;\n" +" mul.ftz.f32 %f74, %f61, %f73;\n" +" mov.f32 %f66, %f74;\n" +" mov.f32 %f67, %f66;\n" +" .loc 16 382 0\n" +" mov.f32 %f68, %f74;\n" +" mov.f32 %f69, %f68;\n" +"$Lt_1_29442:\n" +"$Lt_1_28930:\n" +" .loc 16 384 0\n" +" mul.ftz.f32 %f75, %f41, %f66;\n" +" ld.shared.v2.f32 {%f76,%f77}, [%rd45+8];\n" +" mul.ftz.f32 %f78, %f76, %f68;\n" +" sub.ftz.f32 %f79, %f78, %f77;\n" +" mul.ftz.f32 %f80, %f75, %f79;\n" +" bra.uni $Lt_1_28418;\n" +"$Lt_1_28674:\n" +" .loc 16 386 0\n" +" mov.f32 %f80, 0f00000000; \n" +"$Lt_1_28418:\n" +" ld.param.f32 %f81, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" +" setp.gt.ftz.f32 %p10, %f81, %f58;\n" +" @!%p10 bra $Lt_1_30210;\n" +" .loc 16 393 0\n" +" sqrt.approx.ftz.f32 %f82, %f58;\n" +" ld.param.f32 %f83, [__cudaparm_kernel_pair_fast_g_ewald];\n" +" mul.ftz.f32 %f84, %f83, %f82;\n" +" mul.ftz.f32 %f85, %f84, %f84;\n" +" mov.f32 %f86, 0f3f800000; \n" +" mov.f32 %f87, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f88, %f87, %f84, %f86;\n" +" neg.ftz.f32 %f89, %f85;\n" +" rcp.approx.ftz.f32 %f90, %f88;\n" +" mov.f32 %f91, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f92, %f89, %f91;\n" +" ex2.approx.ftz.f32 %f93, %f92;\n" +" mov.f32 %f94, 0f3e827906; \n" +" mov.f32 %f95, 0fbe91a98e; \n" +" mov.f32 %f96, 0f3fb5f0e3; \n" +" mov.f32 %f97, 0fbfba00e3; \n" +" mov.f32 %f98, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f99, %f98, %f90, %f97;\n" +" fma.rn.ftz.f32 %f100, %f90, %f99, %f96;\n" +" fma.rn.ftz.f32 %f101, %f90, %f100, %f95;\n" +" fma.rn.ftz.f32 %f102, %f90, %f101, %f94;\n" +" mul.ftz.f32 %f103, %f90, %f102;\n" +" mul.ftz.f32 
%f104, %f93, %f103;\n" +" mov.f32 %f105, %f104;\n" +" .loc 16 394 0\n" +" mov.u32 %r49, %r40;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" tex.1d.v4.f32.s32 {%f106,%f107,%f108,%f109},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f110, %f106;\n" +" ld.param.f32 %f111, [__cudaparm_kernel_pair_fast_qqrd2e];\n" +" mul.ftz.f32 %f112, %f111, %f34;\n" +" mul.ftz.f32 %f113, %f112, %f110;\n" +" div.approx.ftz.f32 %f114, %f113, %f82;\n" +" mov.f32 %f115, %f114;\n" +" .loc 16 395 0\n" +" mov.f32 %f116, 0f3f906ebb; \n" +" mul.ftz.f32 %f117, %f84, %f116;\n" +" fma.rn.ftz.f32 %f118, %f93, %f117, %f104;\n" +" sub.ftz.f32 %f119, %f118, %f44;\n" +" mul.ftz.f32 %f120, %f114, %f119;\n" +" bra.uni $Lt_1_29954;\n" +"$Lt_1_30210:\n" +" .loc 16 397 0\n" +" mov.f32 %f120, 0f00000000; \n" +"$Lt_1_29954:\n" +" .loc 16 401 0\n" +" add.ftz.f32 %f121, %f120, %f80;\n" +" mul.ftz.f32 %f122, %f121, %f61;\n" +" fma.rn.ftz.f32 %f38, %f54, %f122, %f38;\n" +" .loc 16 402 0\n" +" fma.rn.ftz.f32 %f37, %f53, %f122, %f37;\n" +" .loc 16 403 0\n" +" fma.rn.ftz.f32 %f36, %f55, %f122, %f36;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p11, %r56, %r57;\n" +" @%p11 bra $Lt_1_30978;\n" +" .loc 16 406 0\n" +" mov.f32 %f123, %f115;\n" +" mov.f32 %f124, %f105;\n" +" sub.ftz.f32 %f125, %f124, %f44;\n" +" fma.rn.ftz.f32 %f126, %f123, %f125, %f39;\n" +" selp.f32 %f39, %f126, %f39, %p10;\n" +" @!%p7 bra $Lt_1_30978;\n" +" .loc 16 409 0\n" +" add.u64 %rd47, %rd44, %rd8;\n" +" ld.shared.v4.f32 {_,%f127,%f128,%f129}, [%rd47+0];\n" +" mov.f32 %f130, %f67;\n" +" mul.ftz.f32 %f131, %f130, %f41;\n" +" mov.f32 %f132, %f69;\n" +" mul.ftz.f32 %f133, %f127, %f132;\n" +" sub.ftz.f32 %f134, %f133, %f128;\n" +" mul.ftz.f32 %f135, %f131, %f134;\n" +" sub.ftz.f32 %f136, %f135, %f129;\n" +" add.ftz.f32 %f40, %f40, %f136;\n" +"$Lt_1_30978:\n" +"$Lt_1_30466:\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p12, %r58, %r59;\n" +" @%p12 bra $Lt_1_31490;\n" +" .loc 16 414 0\n" +" mov.f32 %f137, %f11;\n" +" mul.ftz.f32 %f138, %f54, %f54;\n" +" fma.rn.ftz.f32 %f139, %f122, %f138, %f137;\n" +" mov.f32 %f11, %f139;\n" +" .loc 16 415 0\n" +" mov.f32 %f140, %f13;\n" +" fma.rn.ftz.f32 %f141, %f122, %f56, %f140;\n" +" mov.f32 %f13, %f141;\n" +" .loc 16 416 0\n" +" mov.f32 %f142, %f15;\n" +" mul.ftz.f32 %f143, %f55, %f55;\n" +" fma.rn.ftz.f32 %f144, %f122, %f143, %f142;\n" +" mov.f32 %f15, %f144;\n" +" .loc 16 417 0\n" +" mov.f32 %f145, %f17;\n" +" mul.ftz.f32 %f146, %f53, %f54;\n" +" fma.rn.ftz.f32 %f147, %f122, %f146, %f145;\n" +" mov.f32 %f17, %f147;\n" +" .loc 16 418 0\n" +" mov.f32 %f148, %f19;\n" +" mul.ftz.f32 %f149, %f54, %f55;\n" +" fma.rn.ftz.f32 %f150, %f122, %f149, %f148;\n" +" mov.f32 %f19, %f150;\n" +" .loc 16 419 0\n" +" mul.ftz.f32 %f151, %f53, %f55;\n" +" fma.rn.ftz.f32 %f20, %f122, %f151, %f20;\n" +" mov.f32 %f21, %f20;\n" +"$Lt_1_31490:\n" +"$Lt_1_27906:\n" +" .loc 16 352 0\n" +" mul.lo.u64 %rd48, %rd39, 4;\n" +" add.u64 %rd31, %rd31, %rd48;\n" +" setp.lt.u64 %p13, %rd31, %rd28;\n" +" @%p13 bra $Lt_1_27650;\n" +" bra.uni $Lt_1_26114;\n" +"$Lt_1_37890:\n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +" bra.uni $Lt_1_26114;\n" +"$Lt_1_26370:\n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 
%f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +"$Lt_1_26114:\n" +" mov.u32 %r60, 1;\n" +" setp.le.s32 %p14, %r4, %r60;\n" +" @%p14 bra $Lt_1_34306;\n" +" .loc 16 430 0\n" +" mov.u64 %rd49, __cuda___cuda_local_var_32812_35_non_const_red_acc7232;\n" +" cvt.s64.s32 %rd50, %r1;\n" +" mul.wide.s32 %rd51, %r1, 4;\n" +" add.u64 %rd52, %rd49, %rd51;\n" +" mov.f32 %f152, %f38;\n" +" st.shared.f32 [%rd52+0], %f152;\n" +" .loc 16 431 0\n" +" mov.f32 %f153, %f37;\n" +" st.shared.f32 [%rd52+512], %f153;\n" +" .loc 16 432 0\n" +" mov.f32 %f154, %f36;\n" +" st.shared.f32 [%rd52+1024], %f154;\n" +" .loc 16 433 0\n" +" mov.f32 %f155, %f40;\n" +" st.shared.f32 [%rd52+1536], %f155;\n" +" .loc 16 434 0\n" +" mov.f32 %f156, %f39;\n" +" st.shared.f32 [%rd52+2048], %f156;\n" +" .loc 16 436 0\n" +" shr.s32 %r61, %r4, 31;\n" +" mov.s32 %r62, 1;\n" +" and.b32 %r63, %r61, %r62;\n" +" add.s32 %r64, %r63, %r4;\n" +" shr.s32 %r65, %r64, 1;\n" +" mov.s32 %r66, %r65;\n" +" mov.u32 %r67, 0;\n" +" setp.ne.u32 %p15, %r65, %r67;\n" +" @!%p15 bra $Lt_1_32770;\n" +"$Lt_1_33282:\n" +" setp.ge.u32 %p16, %r8, %r66;\n" +" @%p16 bra $Lt_1_33538;\n" +" .loc 16 439 0\n" +" add.u32 %r68, %r1, %r66;\n" +" cvt.u64.u32 %rd53, %r68;\n" +" mul.wide.u32 %rd54, %r68, 4;\n" +" add.u64 %rd55, %rd49, %rd54;\n" +" ld.shared.f32 %f157, [%rd55+0];\n" +" add.ftz.f32 %f152, %f157, %f152;\n" +" st.shared.f32 [%rd52+0], %f152;\n" +" ld.shared.f32 %f158, [%rd55+512];\n" +" add.ftz.f32 %f153, %f158, %f153;\n" +" st.shared.f32 [%rd52+512], %f153;\n" +" ld.shared.f32 %f159, [%rd55+1024];\n" +" add.ftz.f32 %f154, %f159, %f154;\n" +" st.shared.f32 [%rd52+1024], %f154;\n" +" ld.shared.f32 %f160, [%rd55+1536];\n" +" add.ftz.f32 %f155, %f160, %f155;\n" +" st.shared.f32 [%rd52+1536], %f155;\n" +" ld.shared.f32 %f161, [%rd55+2048];\n" +" add.ftz.f32 %f156, %f161, %f156;\n" +" st.shared.f32 [%rd52+2048], %f156;\n" +"$Lt_1_33538:\n" +" .loc 16 436 0\n" +" shr.u32 %r66, %r66, 1;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p17, %r66, %r69;\n" +" @%p17 bra $Lt_1_33282;\n" +"$Lt_1_32770:\n" +" .loc 16 443 0\n" +" mov.f32 %f38, %f152;\n" +" .loc 16 444 0\n" +" mov.f32 %f37, %f153;\n" +" .loc 16 445 0\n" +" mov.f32 %f36, %f154;\n" +" .loc 16 446 0\n" +" mov.f32 %f40, %f155;\n" +" .loc 16 447 0\n" +" mov.f32 %f39, %f156;\n" +" ld.param.s32 %r70, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r71, 0;\n" +" setp.le.s32 %p18, %r70, %r71;\n" +" @%p18 bra $Lt_1_34306;\n" +" .loc 16 451 0\n" +" mov.f32 %f152, %f11;\n" +" st.shared.f32 [%rd52+0], %f152;\n" +" mov.f32 %f153, %f13;\n" +" st.shared.f32 [%rd52+512], %f153;\n" +" mov.f32 %f154, %f15;\n" +" st.shared.f32 [%rd52+1024], %f154;\n" +" mov.f32 %f155, %f17;\n" +" st.shared.f32 [%rd52+1536], %f155;\n" +" mov.f32 %f156, %f19;\n" +" st.shared.f32 [%rd52+2048], %f156;\n" +" mov.f32 %f162, %f21;\n" +" st.shared.f32 [%rd52+2560], %f162;\n" +" .loc 16 453 0\n" +" mov.s32 %r72, %r65;\n" +" @!%p15 bra $Lt_1_34818;\n" +"$Lt_1_35330:\n" +" setp.ge.u32 %p19, %r8, %r72;\n" +" @%p19 bra $Lt_1_35586;\n" +" .loc 16 456 0\n" +" add.u32 %r73, %r1, %r72;\n" +" cvt.u64.u32 %rd56, %r73;\n" +" mul.wide.u32 %rd57, %r73, 4;\n" +" add.u64 %rd58, %rd49, %rd57;\n" +" ld.shared.f32 %f163, [%rd58+0];\n" +" add.ftz.f32 %f152, %f163, %f152;\n" +" st.shared.f32 [%rd52+0], %f152;\n" +" ld.shared.f32 %f164, [%rd58+512];\n" +" add.ftz.f32 %f153, %f164, %f153;\n" +" st.shared.f32 [%rd52+512], %f153;\n" +" ld.shared.f32 %f165, [%rd58+1024];\n" +" add.ftz.f32 %f154, %f165, %f154;\n" +" st.shared.f32 [%rd52+1024], 
%f154;\n" +" ld.shared.f32 %f166, [%rd58+1536];\n" +" add.ftz.f32 %f155, %f166, %f155;\n" +" st.shared.f32 [%rd52+1536], %f155;\n" +" ld.shared.f32 %f167, [%rd58+2048];\n" +" add.ftz.f32 %f156, %f167, %f156;\n" +" st.shared.f32 [%rd52+2048], %f156;\n" +" ld.shared.f32 %f168, [%rd58+2560];\n" +" add.ftz.f32 %f162, %f168, %f162;\n" +" st.shared.f32 [%rd52+2560], %f162;\n" +"$Lt_1_35586:\n" +" .loc 16 453 0\n" +" shr.u32 %r72, %r72, 1;\n" +" mov.u32 %r74, 0;\n" +" setp.ne.u32 %p20, %r72, %r74;\n" +" @%p20 bra $Lt_1_35330;\n" +"$Lt_1_34818:\n" +" .loc 16 461 0\n" +" mov.f32 %f11, %f152;\n" +" mov.f32 %f13, %f153;\n" +" mov.f32 %f15, %f154;\n" +" mov.f32 %f17, %f155;\n" +" mov.f32 %f19, %f156;\n" +" mov.f32 %f21, %f162;\n" +"$Lt_1_34306:\n" +"$Lt_1_32258:\n" +" selp.s32 %r75, 1, 0, %p3;\n" +" mov.s32 %r76, 0;\n" +" set.eq.u32.s32 %r77, %r8, %r76;\n" +" neg.s32 %r78, %r77;\n" +" and.b32 %r79, %r75, %r78;\n" +" mov.u32 %r80, 0;\n" +" setp.eq.s32 %p21, %r79, %r80;\n" +" @%p21 bra $Lt_1_36354;\n" +" .loc 16 467 0\n" +" cvt.s64.s32 %rd59, %r11;\n" +" ld.param.u64 %rd60, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd61, %r11, 4;\n" +" add.u64 %rd62, %rd60, %rd61;\n" +" ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r82, 0;\n" +" setp.le.s32 %p22, %r81, %r82;\n" +" @%p22 bra $Lt_1_36866;\n" +" .loc 16 469 0\n" +" st.global.f32 [%rd62+0], %f40;\n" +" .loc 16 470 0\n" +" cvt.s64.s32 %rd63, %r12;\n" +" mul.wide.s32 %rd64, %r12, 4;\n" +" add.u64 %rd65, %rd64, %rd62;\n" +" .loc 16 471 0\n" +" st.global.f32 [%rd65+0], %f39;\n" +" .loc 16 472 0\n" +" add.u64 %rd62, %rd64, %rd65;\n" +"$Lt_1_36866:\n" +" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r84, 0;\n" +" setp.le.s32 %p23, %r83, %r84;\n" +" @%p23 bra $Lt_1_37378;\n" +" .loc 16 476 0\n" +" mov.f32 %f169, %f11;\n" +" st.global.f32 [%rd62+0], %f169;\n" +" .loc 16 477 0\n" +" cvt.s64.s32 %rd66, %r12;\n" +" mul.wide.s32 %rd67, %r12, 4;\n" +" add.u64 %rd68, %rd67, %rd62;\n" +" .loc 16 476 0\n" +" mov.f32 %f170, %f13;\n" +" st.global.f32 [%rd68+0], %f170;\n" +" .loc 16 477 0\n" +" add.u64 %rd69, %rd67, %rd68;\n" +" .loc 16 476 0\n" +" mov.f32 %f171, %f15;\n" +" st.global.f32 [%rd69+0], %f171;\n" +" .loc 16 477 0\n" +" add.u64 %rd70, %rd67, %rd69;\n" +" .loc 16 476 0\n" +" mov.f32 %f172, %f17;\n" +" st.global.f32 [%rd70+0], %f172;\n" +" .loc 16 477 0\n" +" add.u64 %rd62, %rd67, %rd70;\n" +" .loc 16 476 0\n" +" mov.f32 %f173, %f19;\n" +" st.global.f32 [%rd62+0], %f173;\n" +" mov.f32 %f174, %f21;\n" +" add.u64 %rd71, %rd67, %rd62;\n" +" st.global.f32 [%rd71+0], %f174;\n" +"$Lt_1_37378:\n" +" .loc 16 480 0\n" +" ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd73, %rd59, 16;\n" +" add.u64 %rd74, %rd72, %rd73;\n" +" mov.f32 %f175, %f176;\n" +" st.global.v4.f32 [%rd74+0], {%f38,%f37,%f36,%f175};\n" +"$Lt_1_36354:\n" +" .loc 16 482 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/cmmc_msm_gpu_kernel.ptx b/lib/gpu/cmmc_msm_gpu_kernel.ptx new file mode 100644 index 000000000..e622a1bd3 --- /dev/null +++ b/lib/gpu/cmmc_msm_gpu_kernel.ptx @@ -0,0 +1,1417 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000c0c5_00000000-9_cmmc_msm_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.giU2qD) + //----------------------------------------------------------- + + 
//----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000c0c5_00000000-8_cmmc_msm_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "cmmc_msm_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_q_, + .param .f32 __cudaparm_kernel_pair_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_qqrd2e, + .param .s32 __cudaparm_kernel_pair_smooth, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<90>; + .reg .u64 %rd<65>; + .reg .f32 %f<210>; + .reg .pred %p<25>; + .shared .f32 __cuda_local_var_32507_33_non_const__ia; + .shared .f32 __cuda_local_var_32508_33_non_const__ia2; + .shared .f32 __cuda_local_var_32509_33_non_const__ia3; + .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32]; + .shared .align 4 .b8 __cuda___cuda_local_var_32647_35_non_const_red_acc144[3072]; + // __cuda_local_var_32513_10_non_const_f = 64 + // __cuda_local_var_32517_9_non_const_virial = 16 + // __cuda_local_var_32568_43_non_const_inv1 = 40 + // __cuda_local_var_32568_49_non_const_inv2 = 44 + // __cuda_local_var_32586_15_non_const_ir = 48 + // __cuda_local_var_32586_19_non_const_r2_ia2 = 52 + // __cuda_local_var_32586_27_non_const_r4_ia4 = 56 + // 
__cuda_local_var_32586_35_non_const_r6_ia6 = 60 + .loc 16 100 0 +$LDWbegin_kernel_pair: + .loc 16 107 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 108 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 109 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 110 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; + .loc 16 111 0 + ld.global.f32 %f5, [%rd1+16]; + .loc 16 112 0 + ld.global.f32 %f6, [%rd1+20]; + .loc 16 113 0 + ld.global.f32 %f7, [%rd1+24]; + .loc 16 114 0 + ld.global.f32 %f8, [%rd1+28]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; + .loc 16 127 0 + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + mov.f32 %f17, 0f00000000; // 0 + mov.f32 %f18, %f17; + mov.f32 %f19, 0f00000000; // 0 + mov.f32 %f20, %f19; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_25346; + .loc 16 130 0 + ld.param.f32 %f21, [__cudaparm_kernel_pair_cut_coulsq]; + sqrt.approx.ftz.f32 %f22, %f21; + mov.f32 %f23, 0fbf800000; // -1 + div.approx.ftz.f32 %f24, %f23, %f22; + st.shared.f32 [__cuda_local_var_32507_33_non_const__ia], %f24; + .loc 16 131 0 + mov.f32 %f25, 0fbf800000; // -1 + div.approx.ftz.f32 %f26, %f25, %f21; + st.shared.f32 [__cuda_local_var_32508_33_non_const__ia2], %f26; + .loc 16 132 0 + mul.ftz.f32 %f27, %f26, %f24; + st.shared.f32 [__cuda_local_var_32509_33_non_const__ia3], %f27; + .loc 16 135 0 + cvt.s64.s32 %rd2, %r9; + mul.wide.s32 %rd3, %r9, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd5, %rd3, %rd4; + ld.global.s32 %r11, [%rd5+0]; + .loc 16 137 0 + ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd6, %r12; + mul.wide.s32 %rd7, %r12, 4; + add.u64 %rd8, %rd7, %rd5; + ld.global.s32 %r13, [%rd8+0]; + add.u64 %rd9, %rd7, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd4; + @%p2 bra $Lt_0_25858; + .loc 16 143 0 + cvt.s32.s64 %r14, %rd6; + mul.lo.s32 %r15, %r14, %r13; + cvt.s64.s32 %rd11, %r15; + mul.wide.s32 %rd12, %r15, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 144 0 + mul.lo.s32 %r16, %r6, %r14; + cvt.s64.s32 %rd14, %r16; + mul.wide.s32 %rd15, %r16, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 145 0 + mul.lo.s32 %r17, %r14, %r1; + bra.uni $Lt_0_25602; +$Lt_0_25858: + .loc 16 147 0 + ld.global.s32 %r18, [%rd9+0]; + cvt.s64.s32 %rd17, %r18; + mul.wide.s32 %rd18, %r18, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 148 0 + cvt.s64.s32 %rd20, %r13; + mul.wide.s32 %rd21, %r13, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 149 0 + mov.s32 %r17, %r1; + .loc 16 150 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_25602: + .loc 16 153 0 + mov.u32 %r19, %r11; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f28,%f29,%f30,%f31},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f32, %f28; + mov.f32 %f33, %f29; + mov.f32 %f34, %f30; + 
mov.f32 %f35, %f31; + .loc 16 154 0 + mov.u32 %r26, %r11; + mov.s32 %r27, 0; + mov.u32 %r28, %r27; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[q_tex,{%r26,%r28,%r30,%r32}]; + mov.f32 %f40, %f36; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_38402; + cvt.rzi.ftz.s32.f32 %r33, %f35; + cvt.s64.s32 %rd24, %r17; + ld.param.s32 %r34, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r35, %r34, %r33; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1]; + mov.f32 %f41, 0f00000000; // 0 + mov.f32 %f42, 0f00000000; // 0 + mov.f32 %f43, 0f00000000; // 0 + mov.f32 %f44, 0f00000000; // 0 + mov.f32 %f45, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112; +$Lt_0_26626: + //<loop> Loop body line 154, nesting depth: 1, estimated iterations: unknown + .loc 16 158 0 + ld.global.s32 %r36, [%rd16+0]; + .loc 16 161 0 + shr.s32 %r37, %r36, 30; + and.b32 %r38, %r37, 3; + cvt.s64.s32 %rd27, %r38; + mul.wide.s32 %rd28, %r38, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f46, [%rd29+0]; + .loc 16 162 0 + mov.f32 %f47, 0f3f800000; // 1 + ld.shared.f32 %f48, [%rd29+16]; + sub.ftz.f32 %f49, %f47, %f48; + .loc 16 165 0 + and.b32 %r39, %r36, 1073741823; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + mov.s32 %r43, 0; + mov.u32 %r44, %r43; + mov.s32 %r45, 0; + mov.u32 %r46, %r45; + tex.1d.v4.f32.s32 {%f50,%f51,%f52,%f53},[pos_tex,{%r40,%r42,%r44,%r46}]; + mov.f32 %f54, %f50; + mov.f32 %f55, %f51; + mov.f32 %f56, %f52; + mov.f32 %f57, %f53; + cvt.rzi.ftz.s32.f32 %r47, %f57; + sub.ftz.f32 %f58, %f33, %f55; + sub.ftz.f32 %f59, %f32, %f54; + sub.ftz.f32 %f60, %f34, %f56; + mul.ftz.f32 %f61, %f58, %f58; + fma.rn.ftz.f32 %f62, %f59, %f59, %f61; + fma.rn.ftz.f32 %f63, %f60, %f60, %f62; + add.s32 %r48, %r47, %r35; + cvt.s64.s32 %rd30, %r48; + mul.wide.s32 %rd31, %r48, 16; + add.u64 %rd32, %rd31, %rd25; + ld.global.f32 %f64, [%rd32+0]; + setp.gt.ftz.f32 %p4, %f64, %f63; + @!%p4 bra $Lt_0_32002; + rcp.approx.ftz.f32 %f65, %f63; + ld.global.f32 %f66, [%rd32+4]; + setp.lt.ftz.f32 %p5, %f63, %f66; + @!%p5 bra $Lt_0_27650; + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd34, %rd33, %rd31; + ld.global.f32 %f67, [%rd34+0]; + mov.f32 %f68, 0f40000000; // 2 + setp.eq.ftz.f32 %p6, %f67, %f68; + @!%p6 bra $Lt_0_28162; + .loc 16 181 0 + mul.ftz.f32 %f69, %f65, %f65; + mov.f32 %f70, %f69; + mov.f32 %f71, %f70; + .loc 16 182 0 + mul.ftz.f32 %f72, %f69, %f69; + mov.f32 %f73, %f72; + bra.uni $Lt_0_28418; +$Lt_0_28162: + mov.f32 %f74, 0f3f800000; // 1 + setp.eq.ftz.f32 %p7, %f67, %f74; + @!%p7 bra $Lt_0_28674; + .loc 16 184 0 + sqrt.approx.ftz.f32 %f75, %f65; + mul.ftz.f32 %f76, %f65, %f75; + mov.f32 %f72, %f76; + mov.f32 %f73, %f72; + .loc 16 185 0 + mul.ftz.f32 %f70, %f76, %f76; + mov.f32 %f71, %f70; + bra.uni $Lt_0_28418; +$Lt_0_28674: + .loc 16 187 0 + mul.ftz.f32 %f77, %f65, %f65; + mul.ftz.f32 %f78, %f65, %f77; + mov.f32 %f70, %f78; + mov.f32 %f71, %f70; + .loc 16 188 0 + mov.f32 %f72, %f78; + mov.f32 %f73, %f72; +$Lt_0_28418: +$Lt_0_27906: + .loc 16 190 0 + mul.ftz.f32 %f79, %f46, %f70; + ld.global.v2.f32 {%f80,%f81}, [%rd32+8]; + mul.ftz.f32 %f82, %f80, %f72; + sub.ftz.f32 %f83, %f82, %f81; + mul.ftz.f32 %f84, %f79, %f83; + bra.uni $Lt_0_27394; +$Lt_0_27650: + .loc 16 192 0 + mov.f32 %f84, 0f00000000; // 0 +$Lt_0_27394: + setp.lt.ftz.f32 %p8, %f63, %f21; + @!%p8 bra $Lt_0_29186; + .loc 16 196 0 + sqrt.approx.ftz.f32 %f85, %f63; + rcp.approx.ftz.f32 %f86, %f85; + mov.f32 %f87, %f86; + 
.loc 16 197 0 + mov.u32 %r49, %r39; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f88,%f89,%f90,%f91},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f92, %f88; + ld.param.f32 %f93, [__cudaparm_kernel_pair_qqrd2e]; + mul.ftz.f32 %f94, %f93, %f40; + mul.ftz.f32 %f95, %f94, %f92; + mov.f32 %f96, %f95; + .loc 16 198 0 + mul.ftz.f32 %f97, %f63, %f26; + mov.f32 %f98, %f97; + .loc 16 199 0 + mul.ftz.f32 %f99, %f97, %f97; + mov.f32 %f100, %f99; + ld.param.s32 %r56, [__cudaparm_kernel_pair_smooth]; + mov.u32 %r57, 0; + setp.ne.s32 %p9, %r56, %r57; + @%p9 bra $Lt_0_29698; + .loc 16 201 0 + div.approx.ftz.f32 %f101, %f86, %f63; + mov.f32 %f102, 0fc08c0000; // -4.375 + mov.f32 %f103, 0f40a80000; // 5.25 + fma.rn.ftz.f32 %f104, %f103, %f97, %f102; + mov.f32 %f105, 0f3ff00000; // 1.875 + mul.ftz.f32 %f106, %f105, %f99; + sub.ftz.f32 %f107, %f104, %f106; + mul.ftz.f32 %f108, %f27, %f107; + sub.ftz.f32 %f109, %f108, %f101; + mul.ftz.f32 %f110, %f49, %f86; + sub.ftz.f32 %f111, %f109, %f110; + mul.ftz.f32 %f112, %f95, %f111; + bra.uni $Lt_0_28930; +$Lt_0_29698: + .loc 16 205 0 + mul.ftz.f32 %f113, %f99, %f97; + mov.f32 %f114, %f113; + .loc 16 206 0 + div.approx.ftz.f32 %f115, %f86, %f63; + mov.f32 %f116, 0fc0d20000; // -6.5625 + mov.f32 %f117, 0f413d0000; // 11.8125 + fma.rn.ftz.f32 %f118, %f117, %f97, %f116; + mov.f32 %f119, 0f41070000; // 8.4375 + mul.ftz.f32 %f120, %f119, %f99; + sub.ftz.f32 %f121, %f118, %f120; + mov.f32 %f122, 0f400c0000; // 2.1875 + fma.rn.ftz.f32 %f123, %f122, %f113, %f121; + mul.ftz.f32 %f124, %f27, %f123; + sub.ftz.f32 %f125, %f124, %f115; + mul.ftz.f32 %f126, %f49, %f86; + sub.ftz.f32 %f127, %f125, %f126; + mul.ftz.f32 %f112, %f95, %f127; + bra.uni $Lt_0_28930; +$Lt_0_29186: + .loc 16 213 0 + mov.f32 %f96, 0f00000000; // 0 + mov.f32 %f112, 0f00000000; // 0 +$Lt_0_28930: + .loc 16 218 0 + fma.rn.ftz.f32 %f128, %f84, %f65, %f112; + fma.rn.ftz.f32 %f43, %f59, %f128, %f43; + .loc 16 219 0 + fma.rn.ftz.f32 %f42, %f58, %f128, %f42; + .loc 16 220 0 + fma.rn.ftz.f32 %f41, %f60, %f128, %f41; + ld.param.s32 %r58, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r59, 0; + setp.le.s32 %p10, %r58, %r59; + @%p10 bra $Lt_0_31490; + @!%p8 bra $Lt_0_30978; + mov.f32 %f129, %f100; + mov.f32 %f130, %f98; + mov.f32 %f131, %f87; + ld.param.s32 %r60, [__cudaparm_kernel_pair_smooth]; + mov.u32 %r61, 0; + setp.ne.s32 %p11, %r60, %r61; + @%p11 bra $Lt_0_31234; + .loc 16 225 0 + mov.f32 %f132, 0f400c0000; // 2.1875 + mov.f32 %f133, 0f400c0000; // 2.1875 + mul.ftz.f32 %f134, %f133, %f130; + sub.ftz.f32 %f135, %f132, %f134; + mov.f32 %f136, 0f3fa80000; // 1.3125 + fma.rn.ftz.f32 %f137, %f136, %f129, %f135; + mov.f32 %f138, 0f3ea00000; // 0.3125 + mul.ftz.f32 %f139, %f129, %f138; + mul.ftz.f32 %f140, %f130, %f139; + sub.ftz.f32 %f141, %f137, %f140; + fma.rn.ftz.f32 %f142, %f24, %f141, %f131; + mul.ftz.f32 %f143, %f49, %f131; + sub.ftz.f32 %f144, %f142, %f143; + fma.rn.ftz.f32 %f44, %f96, %f144, %f44; + bra.uni $Lt_0_30978; +$Lt_0_31234: + .loc 16 230 0 + mov.f32 %f145, 0f401d8000; // 2.46094 + mov.f32 %f146, 0f40520000; // 3.28125 + mul.ftz.f32 %f147, %f146, %f130; + sub.ftz.f32 %f148, %f145, %f147; + mov.f32 %f149, 0f403d0000; // 2.95312 + fma.rn.ftz.f32 %f150, %f149, %f129, %f148; + mov.f32 %f151, 0f3fb40000; // 1.40625 + mov.f32 %f152, %f114; + mul.ftz.f32 %f153, %f151, %f152; + sub.ftz.f32 %f154, %f150, %f153; + mov.f32 %f155, 0f3e8c0000; // 0.273438 + mul.ftz.f32 %f156, %f129, %f155; + fma.rn.ftz.f32 %f157, %f129, 
%f156, %f154; + fma.rn.ftz.f32 %f158, %f24, %f157, %f131; + fma.rn.ftz.f32 %f44, %f96, %f158, %f44; +$Lt_0_30978: +$Lt_0_30466: + @!%p5 bra $Lt_0_31490; + .loc 16 236 0 + ld.param.u64 %rd35, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd36, %rd35, %rd31; + ld.global.v4.f32 {_,%f159,%f160,%f161}, [%rd36+0]; + mov.f32 %f162, %f71; + mul.ftz.f32 %f163, %f162, %f46; + mov.f32 %f164, %f73; + mul.ftz.f32 %f165, %f159, %f164; + sub.ftz.f32 %f166, %f165, %f160; + mul.ftz.f32 %f167, %f163, %f166; + sub.ftz.f32 %f168, %f167, %f161; + add.ftz.f32 %f45, %f45, %f168; +$Lt_0_31490: +$Lt_0_29954: + ld.param.s32 %r62, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r63, 0; + setp.le.s32 %p12, %r62, %r63; + @%p12 bra $Lt_0_32002; + .loc 16 241 0 + mov.f32 %f169, %f10; + mul.ftz.f32 %f170, %f59, %f59; + fma.rn.ftz.f32 %f171, %f128, %f170, %f169; + mov.f32 %f10, %f171; + .loc 16 242 0 + mov.f32 %f172, %f12; + fma.rn.ftz.f32 %f173, %f128, %f61, %f172; + mov.f32 %f12, %f173; + .loc 16 243 0 + mov.f32 %f174, %f14; + mul.ftz.f32 %f175, %f60, %f60; + fma.rn.ftz.f32 %f176, %f128, %f175, %f174; + mov.f32 %f14, %f176; + .loc 16 244 0 + mov.f32 %f177, %f16; + mul.ftz.f32 %f178, %f58, %f59; + fma.rn.ftz.f32 %f179, %f128, %f178, %f177; + mov.f32 %f16, %f179; + .loc 16 245 0 + mov.f32 %f180, %f18; + mul.ftz.f32 %f181, %f59, %f60; + fma.rn.ftz.f32 %f182, %f128, %f181, %f180; + mov.f32 %f18, %f182; + .loc 16 246 0 + mul.ftz.f32 %f183, %f58, %f60; + fma.rn.ftz.f32 %f19, %f128, %f183, %f19; + mov.f32 %f20, %f19; +$Lt_0_32002: +$Lt_0_26882: + .loc 16 157 0 + mul.lo.u64 %rd37, %rd24, 4; + add.u64 %rd16, %rd16, %rd37; + setp.lt.u64 %p13, %rd16, %rd13; + @%p13 bra $Lt_0_26626; + bra.uni $Lt_0_25090; +$Lt_0_38402: + mov.f32 %f41, 0f00000000; // 0 + mov.f32 %f42, 0f00000000; // 0 + mov.f32 %f43, 0f00000000; // 0 + mov.f32 %f44, 0f00000000; // 0 + mov.f32 %f45, 0f00000000; // 0 + bra.uni $Lt_0_25090; +$Lt_0_25346: + mov.f32 %f41, 0f00000000; // 0 + mov.f32 %f42, 0f00000000; // 0 + mov.f32 %f43, 0f00000000; // 0 + mov.f32 %f44, 0f00000000; // 0 + mov.f32 %f45, 0f00000000; // 0 +$Lt_0_25090: + mov.u32 %r64, 1; + setp.le.s32 %p14, %r1, %r64; + @%p14 bra $Lt_0_34818; + .loc 16 257 0 + mov.u64 %rd38, __cuda___cuda_local_var_32647_35_non_const_red_acc144; + cvt.s64.s32 %rd39, %r2; + mul.wide.s32 %rd40, %r2, 4; + add.u64 %rd41, %rd38, %rd40; + mov.f32 %f184, %f43; + st.shared.f32 [%rd41+0], %f184; + .loc 16 258 0 + mov.f32 %f185, %f42; + st.shared.f32 [%rd41+512], %f185; + .loc 16 259 0 + mov.f32 %f186, %f41; + st.shared.f32 [%rd41+1024], %f186; + .loc 16 260 0 + mov.f32 %f187, %f45; + st.shared.f32 [%rd41+1536], %f187; + .loc 16 261 0 + mov.f32 %f188, %f44; + st.shared.f32 [%rd41+2048], %f188; + .loc 16 263 0 + shr.s32 %r65, %r1, 31; + mov.s32 %r66, 1; + and.b32 %r67, %r65, %r66; + add.s32 %r68, %r67, %r1; + shr.s32 %r69, %r68, 1; + mov.s32 %r70, %r69; + mov.u32 %r71, 0; + setp.ne.u32 %p15, %r69, %r71; + @!%p15 bra $Lt_0_33282; +$Lt_0_33794: + setp.ge.u32 %p16, %r6, %r70; + @%p16 bra $Lt_0_34050; + .loc 16 266 0 + add.u32 %r72, %r2, %r70; + cvt.u64.u32 %rd42, %r72; + mul.wide.u32 %rd43, %r72, 4; + add.u64 %rd44, %rd38, %rd43; + ld.shared.f32 %f189, [%rd44+0]; + add.ftz.f32 %f184, %f189, %f184; + st.shared.f32 [%rd41+0], %f184; + ld.shared.f32 %f190, [%rd44+512]; + add.ftz.f32 %f185, %f190, %f185; + st.shared.f32 [%rd41+512], %f185; + ld.shared.f32 %f191, [%rd44+1024]; + add.ftz.f32 %f186, %f191, %f186; + st.shared.f32 [%rd41+1024], %f186; + ld.shared.f32 %f192, [%rd44+1536]; + add.ftz.f32 %f187, %f192, %f187; + st.shared.f32 [%rd41+1536], 
%f187; + ld.shared.f32 %f193, [%rd44+2048]; + add.ftz.f32 %f188, %f193, %f188; + st.shared.f32 [%rd41+2048], %f188; +$Lt_0_34050: + .loc 16 263 0 + shr.u32 %r70, %r70, 1; + mov.u32 %r73, 0; + setp.ne.u32 %p17, %r70, %r73; + @%p17 bra $Lt_0_33794; +$Lt_0_33282: + .loc 16 270 0 + mov.f32 %f43, %f184; + .loc 16 271 0 + mov.f32 %f42, %f185; + .loc 16 272 0 + mov.f32 %f41, %f186; + .loc 16 273 0 + mov.f32 %f45, %f187; + .loc 16 274 0 + mov.f32 %f44, %f188; + ld.param.s32 %r74, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r75, 0; + setp.le.s32 %p18, %r74, %r75; + @%p18 bra $Lt_0_34818; + .loc 16 278 0 + mov.f32 %f184, %f10; + st.shared.f32 [%rd41+0], %f184; + mov.f32 %f185, %f12; + st.shared.f32 [%rd41+512], %f185; + mov.f32 %f186, %f14; + st.shared.f32 [%rd41+1024], %f186; + mov.f32 %f187, %f16; + st.shared.f32 [%rd41+1536], %f187; + mov.f32 %f188, %f18; + st.shared.f32 [%rd41+2048], %f188; + mov.f32 %f194, %f20; + st.shared.f32 [%rd41+2560], %f194; + .loc 16 280 0 + mov.s32 %r76, %r69; + @!%p15 bra $Lt_0_35330; +$Lt_0_35842: + setp.ge.u32 %p19, %r6, %r76; + @%p19 bra $Lt_0_36098; + .loc 16 283 0 + add.u32 %r77, %r2, %r76; + cvt.u64.u32 %rd45, %r77; + mul.wide.u32 %rd46, %r77, 4; + add.u64 %rd47, %rd38, %rd46; + ld.shared.f32 %f195, [%rd47+0]; + add.ftz.f32 %f184, %f195, %f184; + st.shared.f32 [%rd41+0], %f184; + ld.shared.f32 %f196, [%rd47+512]; + add.ftz.f32 %f185, %f196, %f185; + st.shared.f32 [%rd41+512], %f185; + ld.shared.f32 %f197, [%rd47+1024]; + add.ftz.f32 %f186, %f197, %f186; + st.shared.f32 [%rd41+1024], %f186; + ld.shared.f32 %f198, [%rd47+1536]; + add.ftz.f32 %f187, %f198, %f187; + st.shared.f32 [%rd41+1536], %f187; + ld.shared.f32 %f199, [%rd47+2048]; + add.ftz.f32 %f188, %f199, %f188; + st.shared.f32 [%rd41+2048], %f188; + ld.shared.f32 %f200, [%rd47+2560]; + add.ftz.f32 %f194, %f200, %f194; + st.shared.f32 [%rd41+2560], %f194; +$Lt_0_36098: + .loc 16 280 0 + shr.u32 %r76, %r76, 1; + mov.u32 %r78, 0; + setp.ne.u32 %p20, %r76, %r78; + @%p20 bra $Lt_0_35842; +$Lt_0_35330: + .loc 16 288 0 + mov.f32 %f10, %f184; + mov.f32 %f12, %f185; + mov.f32 %f14, %f186; + mov.f32 %f16, %f187; + mov.f32 %f18, %f188; + mov.f32 %f20, %f194; +$Lt_0_34818: +$Lt_0_32770: + selp.s32 %r79, 1, 0, %p1; + mov.s32 %r80, 0; + set.eq.u32.s32 %r81, %r6, %r80; + neg.s32 %r82, %r81; + and.b32 %r83, %r79, %r82; + mov.u32 %r84, 0; + setp.eq.s32 %p21, %r83, %r84; + @%p21 bra $Lt_0_36866; + .loc 16 294 0 + cvt.s64.s32 %rd48, %r9; + ld.param.u64 %rd49, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd50, %r9, 4; + add.u64 %rd51, %rd49, %rd50; + ld.param.s32 %r85, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r86, 0; + setp.le.s32 %p22, %r85, %r86; + @%p22 bra $Lt_0_37378; + .loc 16 296 0 + st.global.f32 [%rd51+0], %f45; + .loc 16 297 0 + cvt.s64.s32 %rd52, %r10; + mul.wide.s32 %rd53, %r10, 4; + add.u64 %rd54, %rd53, %rd51; + .loc 16 298 0 + st.global.f32 [%rd54+0], %f44; + .loc 16 299 0 + add.u64 %rd51, %rd53, %rd54; +$Lt_0_37378: + ld.param.s32 %r87, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r88, 0; + setp.le.s32 %p23, %r87, %r88; + @%p23 bra $Lt_0_37890; + .loc 16 303 0 + mov.f32 %f201, %f10; + st.global.f32 [%rd51+0], %f201; + .loc 16 304 0 + cvt.s64.s32 %rd55, %r10; + mul.wide.s32 %rd56, %r10, 4; + add.u64 %rd57, %rd56, %rd51; + .loc 16 303 0 + mov.f32 %f202, %f12; + st.global.f32 [%rd57+0], %f202; + .loc 16 304 0 + add.u64 %rd58, %rd56, %rd57; + .loc 16 303 0 + mov.f32 %f203, %f14; + st.global.f32 [%rd58+0], %f203; + .loc 16 304 0 + add.u64 %rd59, %rd56, %rd58; + .loc 16 303 0 + mov.f32 %f204, %f16; + st.global.f32 
[%rd59+0], %f204; + .loc 16 304 0 + add.u64 %rd51, %rd56, %rd59; + .loc 16 303 0 + mov.f32 %f205, %f18; + st.global.f32 [%rd51+0], %f205; + mov.f32 %f206, %f20; + add.u64 %rd60, %rd56, %rd51; + st.global.f32 [%rd60+0], %f206; +$Lt_0_37890: + .loc 16 307 0 + ld.param.u64 %rd61, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd62, %rd48, 16; + add.u64 %rd63, %rd61, %rd62; + mov.f32 %f207, %f208; + st.global.v4.f32 [%rd63+0], {%f43,%f42,%f41,%f207}; +$Lt_0_36866: + .loc 16 309 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_fast_q_, + .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, + .param .s32 __cudaparm_kernel_pair_fast_smooth, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<90>; + .reg .u64 %rd<76>; + .reg .f32 %f<215>; + .reg .pred %p<27>; + .shared .f32 __cuda_local_var_32737_33_non_const__ia; + .shared .f32 __cuda_local_var_32738_33_non_const__ia2; + .shared .f32 __cuda_local_var_32739_33_non_const__ia3; + .shared .align 4 .b8 __cuda___cuda_local_var_32719_33_non_const_sp_lj3320[32]; + .shared .align 16 .b8 __cuda___cuda_local_var_32717_34_non_const_lj13360[1936]; + .shared .align 16 .b8 __cuda___cuda_local_var_32718_34_non_const_lj35296[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32866_35_non_const_red_acc7232[3072]; + // __cuda_local_var_32729_10_non_const_f = 64 + // __cuda_local_var_32733_9_non_const_virial = 16 + // __cuda_local_var_32788_43_non_const_inv1 = 40 + // __cuda_local_var_32788_49_non_const_inv2 = 44 + // __cuda_local_var_32806_15_non_const_ir = 48 + // __cuda_local_var_32806_19_non_const_r2_ia2 = 52 + // __cuda_local_var_32806_27_non_const_r4_ia4 = 56 + // __cuda_local_var_32806_35_non_const_r6_ia6 = 60 + .loc 16 319 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 7; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_26626; + .loc 16 329 0 + mov.u64 %rd1, __cuda___cuda_local_var_32719_33_non_const_sp_lj3320; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_26626: + mov.u64 %rd1, __cuda___cuda_local_var_32719_33_non_const_sp_lj3320; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_27138; + .loc 16 331 0 + mov.u64 %rd7, __cuda___cuda_local_var_32717_34_non_const_lj13360; + mov.u64 %rd8, __cuda___cuda_local_var_32718_34_non_const_lj35296; + cvt.s64.s32 %rd9, %r1; + mul.wide.s32 %rd10, %r1, 16; + ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd12, %rd11, %rd10; + add.u64 %rd13, %rd10, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0]; + st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5}; + .loc 16 332 0 + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd15, %rd14, 
%rd10; + add.u64 %rd16, %rd10, %rd8; + ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; + st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; +$Lt_1_27138: + mov.u64 %rd7, __cuda___cuda_local_var_32717_34_non_const_lj13360; + mov.u64 %rd8, __cuda___cuda_local_var_32718_34_non_const_lj35296; + .loc 16 343 0 + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + mov.f32 %f20, 0f00000000; // 0 + mov.f32 %f21, %f20; + .loc 16 348 0 + ld.param.f32 %f22, [__cudaparm_kernel_pair_fast_cut_coulsq]; + sqrt.approx.ftz.f32 %f23, %f22; + mov.f32 %f24, 0fbf800000; // -1 + div.approx.ftz.f32 %f25, %f24, %f23; + st.shared.f32 [__cuda_local_var_32737_33_non_const__ia], %f25; + .loc 16 349 0 + rcp.approx.ftz.f32 %f26, %f22; + st.shared.f32 [__cuda_local_var_32738_33_non_const__ia2], %f26; + .loc 16 350 0 + mul.ftz.f32 %f27, %f26, %f25; + st.shared.f32 [__cuda_local_var_32739_33_non_const__ia3], %f27; + .loc 16 351 0 + bar.sync 0; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r5, %r1, %r4; + cvt.s32.u32 %r6, %ntid.x; + div.s32 %r7, %r6, %r4; + rem.s32 %r8, %r1, %r4; + cvt.s32.u32 %r9, %ctaid.x; + mul.lo.s32 %r10, %r9, %r7; + add.s32 %r11, %r5, %r10; + ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p3, %r11, %r12; + @!%p3 bra $Lt_1_27906; + .loc 16 355 0 + cvt.s64.s32 %rd17, %r11; + mul.wide.s32 %rd18, %r11, 4; + ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd20, %rd18, %rd19; + ld.global.s32 %r13, [%rd20+0]; + .loc 16 357 0 + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd21, %r14; + mul.wide.s32 %rd22, %r14, 4; + add.u64 %rd23, %rd22, %rd20; + ld.global.s32 %r15, [%rd23+0]; + add.u64 %rd24, %rd22, %rd23; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p4, %rd25, %rd19; + @%p4 bra $Lt_1_28418; + .loc 16 363 0 + cvt.s32.s64 %r16, %rd21; + mul.lo.s32 %r17, %r16, %r15; + cvt.s64.s32 %rd26, %r17; + mul.wide.s32 %rd27, %r17, 4; + add.u64 %rd28, %rd24, %rd27; + .loc 16 364 0 + mul.lo.s32 %r18, %r8, %r16; + cvt.s64.s32 %rd29, %r18; + mul.wide.s32 %rd30, %r18, 4; + add.u64 %rd31, %rd24, %rd30; + .loc 16 365 0 + mul.lo.s32 %r19, %r16, %r4; + bra.uni $Lt_1_28162; +$Lt_1_28418: + .loc 16 367 0 + ld.global.s32 %r20, [%rd24+0]; + cvt.s64.s32 %rd32, %r20; + mul.wide.s32 %rd33, %r20, 4; + add.u64 %rd34, %rd25, %rd33; + .loc 16 368 0 + cvt.s64.s32 %rd35, %r15; + mul.wide.s32 %rd36, %r15, 4; + add.u64 %rd28, %rd34, %rd36; + .loc 16 369 0 + mov.s32 %r19, %r4; + .loc 16 370 0 + cvt.s64.s32 %rd37, %r8; + mul.wide.s32 %rd38, %r8, 4; + add.u64 %rd31, %rd34, %rd38; +$Lt_1_28162: + .loc 16 373 0 + mov.u32 %r21, %r13; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + tex.1d.v4.f32.s32 {%f28,%f29,%f30,%f31},[pos_tex,{%r21,%r23,%r25,%r27}]; + mov.f32 %f32, %f28; + mov.f32 %f33, %f29; + mov.f32 %f34, %f30; + mov.f32 %f35, %f31; + .loc 16 374 0 + mov.u32 %r28, %r13; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + mov.s32 %r33, 0; + mov.u32 %r34, %r33; + tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[q_tex,{%r28,%r30,%r32,%r34}]; + mov.f32 %f40, %f36; + setp.ge.u64 %p5, %rd31, %rd28; + @%p5 bra $Lt_1_40962; + cvt.rzi.ftz.s32.f32 %r35, %f35; + cvt.s64.s32 %rd39, %r19; + mul.lo.s32 %r36, 
%r35, 11; + cvt.rn.f32.s32 %f41, %r36; + mov.f32 %f42, 0f00000000; // 0 + mov.f32 %f43, 0f00000000; // 0 + mov.f32 %f44, 0f00000000; // 0 + mov.f32 %f45, 0f00000000; // 0 + mov.f32 %f46, 0f00000000; // 0 +$Lt_1_29186: + //<loop> Loop body line 374, nesting depth: 1, estimated iterations: unknown + .loc 16 379 0 + ld.global.s32 %r37, [%rd31+0]; + .loc 16 382 0 + shr.s32 %r38, %r37, 30; + and.b32 %r39, %r38, 3; + cvt.s64.s32 %rd40, %r39; + mul.wide.s32 %rd41, %r39, 4; + add.u64 %rd42, %rd1, %rd41; + ld.shared.f32 %f47, [%rd42+0]; + .loc 16 383 0 + mov.f32 %f48, 0f3f800000; // 1 + ld.shared.f32 %f49, [%rd42+16]; + sub.ftz.f32 %f50, %f48, %f49; + .loc 16 386 0 + and.b32 %r40, %r37, 1073741823; + mov.u32 %r41, %r40; + mov.s32 %r42, 0; + mov.u32 %r43, %r42; + mov.s32 %r44, 0; + mov.u32 %r45, %r44; + mov.s32 %r46, 0; + mov.u32 %r47, %r46; + tex.1d.v4.f32.s32 {%f51,%f52,%f53,%f54},[pos_tex,{%r41,%r43,%r45,%r47}]; + mov.f32 %f55, %f51; + mov.f32 %f56, %f52; + mov.f32 %f57, %f53; + mov.f32 %f58, %f54; + sub.ftz.f32 %f59, %f33, %f56; + sub.ftz.f32 %f60, %f32, %f55; + sub.ftz.f32 %f61, %f34, %f57; + mul.ftz.f32 %f62, %f59, %f59; + fma.rn.ftz.f32 %f63, %f60, %f60, %f62; + fma.rn.ftz.f32 %f64, %f61, %f61, %f63; + add.ftz.f32 %f65, %f41, %f58; + cvt.rzi.ftz.s32.f32 %r48, %f65; + cvt.s64.s32 %rd43, %r48; + mul.wide.s32 %rd44, %r48, 16; + add.u64 %rd45, %rd44, %rd7; + ld.shared.f32 %f66, [%rd45+0]; + setp.gt.ftz.f32 %p6, %f66, %f64; + @!%p6 bra $Lt_1_34562; + rcp.approx.ftz.f32 %f67, %f64; + ld.shared.f32 %f68, [%rd45+4]; + setp.lt.ftz.f32 %p7, %f64, %f68; + @!%p7 bra $Lt_1_30210; + add.u64 %rd46, %rd44, %rd8; + ld.shared.f32 %f69, [%rd46+0]; + mov.f32 %f70, 0f40000000; // 2 + setp.eq.ftz.f32 %p8, %f69, %f70; + @!%p8 bra $Lt_1_30722; + .loc 16 401 0 + mul.ftz.f32 %f71, %f67, %f67; + mov.f32 %f72, %f71; + mov.f32 %f73, %f72; + .loc 16 402 0 + mul.ftz.f32 %f74, %f71, %f71; + mov.f32 %f75, %f74; + bra.uni $Lt_1_30978; +$Lt_1_30722: + mov.f32 %f76, 0f3f800000; // 1 + setp.eq.ftz.f32 %p9, %f69, %f76; + @!%p9 bra $Lt_1_31234; + .loc 16 404 0 + sqrt.approx.ftz.f32 %f77, %f67; + mul.ftz.f32 %f78, %f67, %f77; + mov.f32 %f74, %f78; + mov.f32 %f75, %f74; + .loc 16 405 0 + mul.ftz.f32 %f72, %f78, %f78; + mov.f32 %f73, %f72; + bra.uni $Lt_1_30978; +$Lt_1_31234: + .loc 16 407 0 + mul.ftz.f32 %f79, %f67, %f67; + mul.ftz.f32 %f80, %f67, %f79; + mov.f32 %f72, %f80; + mov.f32 %f73, %f72; + .loc 16 408 0 + mov.f32 %f74, %f80; + mov.f32 %f75, %f74; +$Lt_1_30978: +$Lt_1_30466: + .loc 16 410 0 + mul.ftz.f32 %f81, %f47, %f72; + ld.shared.v2.f32 {%f82,%f83}, [%rd45+8]; + mul.ftz.f32 %f84, %f82, %f74; + sub.ftz.f32 %f85, %f84, %f83; + mul.ftz.f32 %f86, %f81, %f85; + bra.uni $Lt_1_29954; +$Lt_1_30210: + .loc 16 412 0 + mov.f32 %f86, 0f00000000; // 0 +$Lt_1_29954: + setp.lt.ftz.f32 %p10, %f64, %f22; + @!%p10 bra $Lt_1_31746; + .loc 16 416 0 + sqrt.approx.ftz.f32 %f87, %f64; + rcp.approx.ftz.f32 %f88, %f87; + mov.f32 %f89, %f88; + .loc 16 417 0 + mov.u32 %r49, %r40; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f90,%f91,%f92,%f93},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f94, %f90; + ld.param.f32 %f95, [__cudaparm_kernel_pair_fast_qqrd2e]; + mul.ftz.f32 %f96, %f95, %f40; + mul.ftz.f32 %f97, %f96, %f94; + mov.f32 %f98, %f97; + .loc 16 418 0 + ld.shared.f32 %f99, [__cuda_local_var_32738_33_non_const__ia2]; + mul.ftz.f32 %f100, %f99, %f64; + mov.f32 %f101, %f100; + .loc 16 419 0 + mul.ftz.f32 %f102, %f100, %f100; + mov.f32 %f103, %f102; + 
ld.shared.f32 %f104, [__cuda_local_var_32739_33_non_const__ia3]; + ld.param.s32 %r56, [__cudaparm_kernel_pair_fast_smooth]; + mov.u32 %r57, 0; + setp.ne.s32 %p11, %r56, %r57; + @%p11 bra $Lt_1_32258; + .loc 16 421 0 + div.approx.ftz.f32 %f105, %f88, %f64; + mov.f32 %f106, 0fc08c0000; // -4.375 + mov.f32 %f107, 0f40a80000; // 5.25 + fma.rn.ftz.f32 %f108, %f107, %f100, %f106; + mov.f32 %f109, 0f3ff00000; // 1.875 + mul.ftz.f32 %f110, %f109, %f102; + sub.ftz.f32 %f111, %f108, %f110; + mul.ftz.f32 %f112, %f104, %f111; + sub.ftz.f32 %f113, %f112, %f105; + mul.ftz.f32 %f114, %f50, %f88; + sub.ftz.f32 %f115, %f113, %f114; + mul.ftz.f32 %f116, %f97, %f115; + bra.uni $Lt_1_31490; +$Lt_1_32258: + .loc 16 425 0 + mul.ftz.f32 %f117, %f102, %f100; + mov.f32 %f118, %f117; + .loc 16 426 0 + div.approx.ftz.f32 %f119, %f88, %f64; + mov.f32 %f120, 0fc0d20000; // -6.5625 + mov.f32 %f121, 0f413d0000; // 11.8125 + fma.rn.ftz.f32 %f122, %f121, %f100, %f120; + mov.f32 %f123, 0f41070000; // 8.4375 + mul.ftz.f32 %f124, %f123, %f102; + sub.ftz.f32 %f125, %f122, %f124; + mov.f32 %f126, 0f400c0000; // 2.1875 + fma.rn.ftz.f32 %f127, %f126, %f117, %f125; + mul.ftz.f32 %f128, %f104, %f127; + sub.ftz.f32 %f129, %f128, %f119; + mul.ftz.f32 %f130, %f50, %f88; + sub.ftz.f32 %f131, %f129, %f130; + mul.ftz.f32 %f116, %f97, %f131; + bra.uni $Lt_1_31490; +$Lt_1_31746: + .loc 16 433 0 + mov.f32 %f98, 0f00000000; // 0 + mov.f32 %f116, 0f00000000; // 0 +$Lt_1_31490: + .loc 16 438 0 + fma.rn.ftz.f32 %f132, %f86, %f67, %f116; + fma.rn.ftz.f32 %f44, %f60, %f132, %f44; + .loc 16 439 0 + fma.rn.ftz.f32 %f43, %f59, %f132, %f43; + .loc 16 440 0 + fma.rn.ftz.f32 %f42, %f61, %f132, %f42; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r59, 0; + setp.le.s32 %p12, %r58, %r59; + @%p12 bra $Lt_1_34050; + @!%p10 bra $Lt_1_33538; + mov.f32 %f133, %f103; + mov.f32 %f134, %f101; + mov.f32 %f135, %f89; + ld.shared.f32 %f136, [__cuda_local_var_32737_33_non_const__ia]; + ld.param.s32 %r60, [__cudaparm_kernel_pair_fast_smooth]; + mov.u32 %r61, 0; + setp.ne.s32 %p13, %r60, %r61; + @%p13 bra $Lt_1_33794; + .loc 16 445 0 + mov.f32 %f137, 0f400c0000; // 2.1875 + mov.f32 %f138, 0f400c0000; // 2.1875 + mul.ftz.f32 %f139, %f138, %f134; + sub.ftz.f32 %f140, %f137, %f139; + mov.f32 %f141, 0f3fa80000; // 1.3125 + fma.rn.ftz.f32 %f142, %f141, %f133, %f140; + mov.f32 %f143, 0f3ea00000; // 0.3125 + mul.ftz.f32 %f144, %f133, %f143; + mul.ftz.f32 %f145, %f134, %f144; + sub.ftz.f32 %f146, %f142, %f145; + fma.rn.ftz.f32 %f147, %f136, %f146, %f135; + mul.ftz.f32 %f148, %f50, %f135; + sub.ftz.f32 %f149, %f147, %f148; + fma.rn.ftz.f32 %f45, %f98, %f149, %f45; + bra.uni $Lt_1_33538; +$Lt_1_33794: + .loc 16 450 0 + mov.f32 %f150, 0f401d8000; // 2.46094 + mov.f32 %f151, 0f40520000; // 3.28125 + mul.ftz.f32 %f152, %f151, %f134; + sub.ftz.f32 %f153, %f150, %f152; + mov.f32 %f154, 0f403d0000; // 2.95312 + fma.rn.ftz.f32 %f155, %f154, %f133, %f153; + mov.f32 %f156, 0f3fb40000; // 1.40625 + mov.f32 %f157, %f118; + mul.ftz.f32 %f158, %f156, %f157; + sub.ftz.f32 %f159, %f155, %f158; + mov.f32 %f160, 0f3e8c0000; // 0.273438 + mul.ftz.f32 %f161, %f133, %f160; + fma.rn.ftz.f32 %f162, %f133, %f161, %f159; + fma.rn.ftz.f32 %f163, %f136, %f162, %f135; + fma.rn.ftz.f32 %f45, %f98, %f163, %f45; +$Lt_1_33538: +$Lt_1_33026: + @!%p7 bra $Lt_1_34050; + .loc 16 455 0 + add.u64 %rd47, %rd44, %rd8; + ld.shared.v4.f32 {_,%f164,%f165,%f166}, [%rd47+0]; + mov.f32 %f167, %f73; + mul.ftz.f32 %f168, %f167, %f47; + mov.f32 %f169, %f75; + mul.ftz.f32 %f170, %f164, %f169; + 
sub.ftz.f32 %f171, %f170, %f165; + mul.ftz.f32 %f172, %f168, %f171; + sub.ftz.f32 %f173, %f172, %f166; + add.ftz.f32 %f46, %f46, %f173; +$Lt_1_34050: +$Lt_1_32514: + ld.param.s32 %r62, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r63, 0; + setp.le.s32 %p14, %r62, %r63; + @%p14 bra $Lt_1_34562; + .loc 16 460 0 + mov.f32 %f174, %f11; + mul.ftz.f32 %f175, %f60, %f60; + fma.rn.ftz.f32 %f176, %f132, %f175, %f174; + mov.f32 %f11, %f176; + .loc 16 461 0 + mov.f32 %f177, %f13; + fma.rn.ftz.f32 %f178, %f132, %f62, %f177; + mov.f32 %f13, %f178; + .loc 16 462 0 + mov.f32 %f179, %f15; + mul.ftz.f32 %f180, %f61, %f61; + fma.rn.ftz.f32 %f181, %f132, %f180, %f179; + mov.f32 %f15, %f181; + .loc 16 463 0 + mov.f32 %f182, %f17; + mul.ftz.f32 %f183, %f59, %f60; + fma.rn.ftz.f32 %f184, %f132, %f183, %f182; + mov.f32 %f17, %f184; + .loc 16 464 0 + mov.f32 %f185, %f19; + mul.ftz.f32 %f186, %f60, %f61; + fma.rn.ftz.f32 %f187, %f132, %f186, %f185; + mov.f32 %f19, %f187; + .loc 16 465 0 + mul.ftz.f32 %f188, %f59, %f61; + fma.rn.ftz.f32 %f20, %f132, %f188, %f20; + mov.f32 %f21, %f20; +$Lt_1_34562: +$Lt_1_29442: + .loc 16 378 0 + mul.lo.u64 %rd48, %rd39, 4; + add.u64 %rd31, %rd31, %rd48; + setp.lt.u64 %p15, %rd31, %rd28; + @%p15 bra $Lt_1_29186; + bra.uni $Lt_1_27650; +$Lt_1_40962: + mov.f32 %f42, 0f00000000; // 0 + mov.f32 %f43, 0f00000000; // 0 + mov.f32 %f44, 0f00000000; // 0 + mov.f32 %f45, 0f00000000; // 0 + mov.f32 %f46, 0f00000000; // 0 + bra.uni $Lt_1_27650; +$Lt_1_27906: + mov.f32 %f42, 0f00000000; // 0 + mov.f32 %f43, 0f00000000; // 0 + mov.f32 %f44, 0f00000000; // 0 + mov.f32 %f45, 0f00000000; // 0 + mov.f32 %f46, 0f00000000; // 0 +$Lt_1_27650: + mov.u32 %r64, 1; + setp.le.s32 %p16, %r4, %r64; + @%p16 bra $Lt_1_37378; + .loc 16 476 0 + mov.u64 %rd49, __cuda___cuda_local_var_32866_35_non_const_red_acc7232; + cvt.s64.s32 %rd50, %r1; + mul.wide.s32 %rd51, %r1, 4; + add.u64 %rd52, %rd49, %rd51; + mov.f32 %f189, %f44; + st.shared.f32 [%rd52+0], %f189; + .loc 16 477 0 + mov.f32 %f190, %f43; + st.shared.f32 [%rd52+512], %f190; + .loc 16 478 0 + mov.f32 %f191, %f42; + st.shared.f32 [%rd52+1024], %f191; + .loc 16 479 0 + mov.f32 %f192, %f46; + st.shared.f32 [%rd52+1536], %f192; + .loc 16 480 0 + mov.f32 %f193, %f45; + st.shared.f32 [%rd52+2048], %f193; + .loc 16 482 0 + shr.s32 %r65, %r4, 31; + mov.s32 %r66, 1; + and.b32 %r67, %r65, %r66; + add.s32 %r68, %r67, %r4; + shr.s32 %r69, %r68, 1; + mov.s32 %r70, %r69; + mov.u32 %r71, 0; + setp.ne.u32 %p17, %r69, %r71; + @!%p17 bra $Lt_1_35842; +$Lt_1_36354: + setp.ge.u32 %p18, %r8, %r70; + @%p18 bra $Lt_1_36610; + .loc 16 485 0 + add.u32 %r72, %r1, %r70; + cvt.u64.u32 %rd53, %r72; + mul.wide.u32 %rd54, %r72, 4; + add.u64 %rd55, %rd49, %rd54; + ld.shared.f32 %f194, [%rd55+0]; + add.ftz.f32 %f189, %f194, %f189; + st.shared.f32 [%rd52+0], %f189; + ld.shared.f32 %f195, [%rd55+512]; + add.ftz.f32 %f190, %f195, %f190; + st.shared.f32 [%rd52+512], %f190; + ld.shared.f32 %f196, [%rd55+1024]; + add.ftz.f32 %f191, %f196, %f191; + st.shared.f32 [%rd52+1024], %f191; + ld.shared.f32 %f197, [%rd55+1536]; + add.ftz.f32 %f192, %f197, %f192; + st.shared.f32 [%rd52+1536], %f192; + ld.shared.f32 %f198, [%rd55+2048]; + add.ftz.f32 %f193, %f198, %f193; + st.shared.f32 [%rd52+2048], %f193; +$Lt_1_36610: + .loc 16 482 0 + shr.u32 %r70, %r70, 1; + mov.u32 %r73, 0; + setp.ne.u32 %p19, %r70, %r73; + @%p19 bra $Lt_1_36354; +$Lt_1_35842: + .loc 16 489 0 + mov.f32 %f44, %f189; + .loc 16 490 0 + mov.f32 %f43, %f190; + .loc 16 491 0 + mov.f32 %f42, %f191; + .loc 16 492 0 + mov.f32 %f46, %f192; + 
.loc 16 493 0 + mov.f32 %f45, %f193; + ld.param.s32 %r74, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r75, 0; + setp.le.s32 %p20, %r74, %r75; + @%p20 bra $Lt_1_37378; + .loc 16 497 0 + mov.f32 %f189, %f11; + st.shared.f32 [%rd52+0], %f189; + mov.f32 %f190, %f13; + st.shared.f32 [%rd52+512], %f190; + mov.f32 %f191, %f15; + st.shared.f32 [%rd52+1024], %f191; + mov.f32 %f192, %f17; + st.shared.f32 [%rd52+1536], %f192; + mov.f32 %f193, %f19; + st.shared.f32 [%rd52+2048], %f193; + mov.f32 %f199, %f21; + st.shared.f32 [%rd52+2560], %f199; + .loc 16 499 0 + mov.s32 %r76, %r69; + @!%p17 bra $Lt_1_37890; +$Lt_1_38402: + setp.ge.u32 %p21, %r8, %r76; + @%p21 bra $Lt_1_38658; + .loc 16 502 0 + add.u32 %r77, %r1, %r76; + cvt.u64.u32 %rd56, %r77; + mul.wide.u32 %rd57, %r77, 4; + add.u64 %rd58, %rd49, %rd57; + ld.shared.f32 %f200, [%rd58+0]; + add.ftz.f32 %f189, %f200, %f189; + st.shared.f32 [%rd52+0], %f189; + ld.shared.f32 %f201, [%rd58+512]; + add.ftz.f32 %f190, %f201, %f190; + st.shared.f32 [%rd52+512], %f190; + ld.shared.f32 %f202, [%rd58+1024]; + add.ftz.f32 %f191, %f202, %f191; + st.shared.f32 [%rd52+1024], %f191; + ld.shared.f32 %f203, [%rd58+1536]; + add.ftz.f32 %f192, %f203, %f192; + st.shared.f32 [%rd52+1536], %f192; + ld.shared.f32 %f204, [%rd58+2048]; + add.ftz.f32 %f193, %f204, %f193; + st.shared.f32 [%rd52+2048], %f193; + ld.shared.f32 %f205, [%rd58+2560]; + add.ftz.f32 %f199, %f205, %f199; + st.shared.f32 [%rd52+2560], %f199; +$Lt_1_38658: + .loc 16 499 0 + shr.u32 %r76, %r76, 1; + mov.u32 %r78, 0; + setp.ne.u32 %p22, %r76, %r78; + @%p22 bra $Lt_1_38402; +$Lt_1_37890: + .loc 16 507 0 + mov.f32 %f11, %f189; + mov.f32 %f13, %f190; + mov.f32 %f15, %f191; + mov.f32 %f17, %f192; + mov.f32 %f19, %f193; + mov.f32 %f21, %f199; +$Lt_1_37378: +$Lt_1_35330: + selp.s32 %r79, 1, 0, %p3; + mov.s32 %r80, 0; + set.eq.u32.s32 %r81, %r8, %r80; + neg.s32 %r82, %r81; + and.b32 %r83, %r79, %r82; + mov.u32 %r84, 0; + setp.eq.s32 %p23, %r83, %r84; + @%p23 bra $Lt_1_39426; + .loc 16 513 0 + cvt.s64.s32 %rd59, %r11; + ld.param.u64 %rd60, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd61, %r11, 4; + add.u64 %rd62, %rd60, %rd61; + ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r86, 0; + setp.le.s32 %p24, %r85, %r86; + @%p24 bra $Lt_1_39938; + .loc 16 515 0 + st.global.f32 [%rd62+0], %f46; + .loc 16 516 0 + cvt.s64.s32 %rd63, %r12; + mul.wide.s32 %rd64, %r12, 4; + add.u64 %rd65, %rd64, %rd62; + .loc 16 517 0 + st.global.f32 [%rd65+0], %f45; + .loc 16 518 0 + add.u64 %rd62, %rd64, %rd65; +$Lt_1_39938: + ld.param.s32 %r87, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r88, 0; + setp.le.s32 %p25, %r87, %r88; + @%p25 bra $Lt_1_40450; + .loc 16 522 0 + mov.f32 %f206, %f11; + st.global.f32 [%rd62+0], %f206; + .loc 16 523 0 + cvt.s64.s32 %rd66, %r12; + mul.wide.s32 %rd67, %r12, 4; + add.u64 %rd68, %rd67, %rd62; + .loc 16 522 0 + mov.f32 %f207, %f13; + st.global.f32 [%rd68+0], %f207; + .loc 16 523 0 + add.u64 %rd69, %rd67, %rd68; + .loc 16 522 0 + mov.f32 %f208, %f15; + st.global.f32 [%rd69+0], %f208; + .loc 16 523 0 + add.u64 %rd70, %rd67, %rd69; + .loc 16 522 0 + mov.f32 %f209, %f17; + st.global.f32 [%rd70+0], %f209; + .loc 16 523 0 + add.u64 %rd62, %rd67, %rd70; + .loc 16 522 0 + mov.f32 %f210, %f19; + st.global.f32 [%rd62+0], %f210; + mov.f32 %f211, %f21; + add.u64 %rd71, %rd67, %rd62; + st.global.f32 [%rd71+0], %f211; +$Lt_1_40450: + .loc 16 526 0 + ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd73, %rd59, 16; + add.u64 %rd74, %rd72, %rd73; + mov.f32 %f212, 
%f213; + st.global.v4.f32 [%rd74+0], {%f44,%f43,%f42,%f212}; +$Lt_1_39426: + .loc 16 528 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/cmmc_msm_gpu_ptx.h b/lib/gpu/cmmc_msm_gpu_ptx.h new file mode 100644 index 000000000..7ac653b08 --- /dev/null +++ b/lib/gpu/cmmc_msm_gpu_ptx.h @@ -0,0 +1,1353 @@ +const char * cmmc_msm_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_q_,\n" +" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" +" .param .s32 __cudaparm_kernel_pair_smooth,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<90>;\n" +" .reg .u64 %rd<65>;\n" +" .reg .f32 %f<210>;\n" +" .reg .pred %p<25>;\n" +" .shared .f32 __cuda_local_var_32507_33_non_const__ia;\n" +" .shared .f32 __cuda_local_var_32508_33_non_const__ia2;\n" +" .shared .f32 __cuda_local_var_32509_33_non_const__ia3;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32647_35_non_const_red_acc144[3072];\n" +" .loc 16 100 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 107 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 108 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 109 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 110 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 111 0\n" +" ld.global.f32 %f5, [%rd1+16];\n" +" .loc 16 112 0\n" +" ld.global.f32 %f6, [%rd1+20];\n" +" .loc 16 113 0\n" +" ld.global.f32 %f7, [%rd1+24];\n" +" .loc 16 114 0\n" +" ld.global.f32 %f8, [%rd1+28];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" +" .loc 16 127 0\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" mov.f32 %f17, 0f00000000; \n" +" mov.f32 %f18, %f17;\n" +" mov.f32 %f19, 0f00000000; \n" +" mov.f32 %f20, %f19;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_25346;\n" +" .loc 16 130 0\n" +" ld.param.f32 %f21, [__cudaparm_kernel_pair_cut_coulsq];\n" +" sqrt.approx.ftz.f32 %f22, %f21;\n" +" mov.f32 %f23, 
0fbf800000; \n" +" div.approx.ftz.f32 %f24, %f23, %f22;\n" +" st.shared.f32 [__cuda_local_var_32507_33_non_const__ia], %f24;\n" +" .loc 16 131 0\n" +" mov.f32 %f25, 0fbf800000; \n" +" div.approx.ftz.f32 %f26, %f25, %f21;\n" +" st.shared.f32 [__cuda_local_var_32508_33_non_const__ia2], %f26;\n" +" .loc 16 132 0\n" +" mul.ftz.f32 %f27, %f26, %f24;\n" +" st.shared.f32 [__cuda_local_var_32509_33_non_const__ia3], %f27;\n" +" .loc 16 135 0\n" +" cvt.s64.s32 %rd2, %r9;\n" +" mul.wide.s32 %rd3, %r9, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd5, %rd3, %rd4;\n" +" ld.global.s32 %r11, [%rd5+0];\n" +" .loc 16 137 0\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd6, %r12;\n" +" mul.wide.s32 %rd7, %r12, 4;\n" +" add.u64 %rd8, %rd7, %rd5;\n" +" ld.global.s32 %r13, [%rd8+0];\n" +" add.u64 %rd9, %rd7, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd4;\n" +" @%p2 bra $Lt_0_25858;\n" +" .loc 16 143 0\n" +" cvt.s32.s64 %r14, %rd6;\n" +" mul.lo.s32 %r15, %r14, %r13;\n" +" cvt.s64.s32 %rd11, %r15;\n" +" mul.wide.s32 %rd12, %r15, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 144 0\n" +" mul.lo.s32 %r16, %r6, %r14;\n" +" cvt.s64.s32 %rd14, %r16;\n" +" mul.wide.s32 %rd15, %r16, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 145 0\n" +" mul.lo.s32 %r17, %r14, %r1;\n" +" bra.uni $Lt_0_25602;\n" +"$Lt_0_25858:\n" +" .loc 16 147 0\n" +" ld.global.s32 %r18, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r18;\n" +" mul.wide.s32 %rd18, %r18, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 148 0\n" +" cvt.s64.s32 %rd20, %r13;\n" +" mul.wide.s32 %rd21, %r13, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 149 0\n" +" mov.s32 %r17, %r1;\n" +" .loc 16 150 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_25602:\n" +" .loc 16 153 0\n" +" mov.u32 %r19, %r11;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f28,%f29,%f30,%f31},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f32, %f28;\n" +" mov.f32 %f33, %f29;\n" +" mov.f32 %f34, %f30;\n" +" mov.f32 %f35, %f31;\n" +" .loc 16 154 0\n" +" mov.u32 %r26, %r11;\n" +" mov.s32 %r27, 0;\n" +" mov.u32 %r28, %r27;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[q_tex,{%r26,%r28,%r30,%r32}];\n" +" mov.f32 %f40, %f36;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_38402;\n" +" cvt.rzi.ftz.s32.f32 %r33, %f35;\n" +" cvt.s64.s32 %rd24, %r17;\n" +" ld.param.s32 %r34, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r35, %r34, %r33;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n" +" mov.f32 %f41, 0f00000000; \n" +" mov.f32 %f42, 0f00000000; \n" +" mov.f32 %f43, 0f00000000; \n" +" mov.f32 %f44, 0f00000000; \n" +" mov.f32 %f45, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112;\n" +"$Lt_0_26626:\n" +" .loc 16 158 0\n" +" ld.global.s32 %r36, [%rd16+0];\n" +" .loc 16 161 0\n" +" shr.s32 %r37, %r36, 30;\n" +" and.b32 %r38, %r37, 3;\n" +" cvt.s64.s32 %rd27, %r38;\n" +" mul.wide.s32 %rd28, %r38, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f46, [%rd29+0];\n" +" .loc 16 162 0\n" +" mov.f32 %f47, 0f3f800000; \n" +" ld.shared.f32 %f48, [%rd29+16];\n" +" sub.ftz.f32 %f49, %f47, %f48;\n" +" .loc 16 165 0\n" +" and.b32 %r39, 
%r36, 1073741823;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" mov.s32 %r43, 0;\n" +" mov.u32 %r44, %r43;\n" +" mov.s32 %r45, 0;\n" +" mov.u32 %r46, %r45;\n" +" tex.1d.v4.f32.s32 {%f50,%f51,%f52,%f53},[pos_tex,{%r40,%r42,%r44,%r46}];\n" +" mov.f32 %f54, %f50;\n" +" mov.f32 %f55, %f51;\n" +" mov.f32 %f56, %f52;\n" +" mov.f32 %f57, %f53;\n" +" cvt.rzi.ftz.s32.f32 %r47, %f57;\n" +" sub.ftz.f32 %f58, %f33, %f55;\n" +" sub.ftz.f32 %f59, %f32, %f54;\n" +" sub.ftz.f32 %f60, %f34, %f56;\n" +" mul.ftz.f32 %f61, %f58, %f58;\n" +" fma.rn.ftz.f32 %f62, %f59, %f59, %f61;\n" +" fma.rn.ftz.f32 %f63, %f60, %f60, %f62;\n" +" add.s32 %r48, %r47, %r35;\n" +" cvt.s64.s32 %rd30, %r48;\n" +" mul.wide.s32 %rd31, %r48, 16;\n" +" add.u64 %rd32, %rd31, %rd25;\n" +" ld.global.f32 %f64, [%rd32+0];\n" +" setp.gt.ftz.f32 %p4, %f64, %f63;\n" +" @!%p4 bra $Lt_0_32002;\n" +" rcp.approx.ftz.f32 %f65, %f63;\n" +" ld.global.f32 %f66, [%rd32+4];\n" +" setp.lt.ftz.f32 %p5, %f63, %f66;\n" +" @!%p5 bra $Lt_0_27650;\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd34, %rd33, %rd31;\n" +" ld.global.f32 %f67, [%rd34+0];\n" +" mov.f32 %f68, 0f40000000; \n" +" setp.eq.ftz.f32 %p6, %f67, %f68;\n" +" @!%p6 bra $Lt_0_28162;\n" +" .loc 16 181 0\n" +" mul.ftz.f32 %f69, %f65, %f65;\n" +" mov.f32 %f70, %f69;\n" +" mov.f32 %f71, %f70;\n" +" .loc 16 182 0\n" +" mul.ftz.f32 %f72, %f69, %f69;\n" +" mov.f32 %f73, %f72;\n" +" bra.uni $Lt_0_28418;\n" +"$Lt_0_28162:\n" +" mov.f32 %f74, 0f3f800000; \n" +" setp.eq.ftz.f32 %p7, %f67, %f74;\n" +" @!%p7 bra $Lt_0_28674;\n" +" .loc 16 184 0\n" +" sqrt.approx.ftz.f32 %f75, %f65;\n" +" mul.ftz.f32 %f76, %f65, %f75;\n" +" mov.f32 %f72, %f76;\n" +" mov.f32 %f73, %f72;\n" +" .loc 16 185 0\n" +" mul.ftz.f32 %f70, %f76, %f76;\n" +" mov.f32 %f71, %f70;\n" +" bra.uni $Lt_0_28418;\n" +"$Lt_0_28674:\n" +" .loc 16 187 0\n" +" mul.ftz.f32 %f77, %f65, %f65;\n" +" mul.ftz.f32 %f78, %f65, %f77;\n" +" mov.f32 %f70, %f78;\n" +" mov.f32 %f71, %f70;\n" +" .loc 16 188 0\n" +" mov.f32 %f72, %f78;\n" +" mov.f32 %f73, %f72;\n" +"$Lt_0_28418:\n" +"$Lt_0_27906:\n" +" .loc 16 190 0\n" +" mul.ftz.f32 %f79, %f46, %f70;\n" +" ld.global.v2.f32 {%f80,%f81}, [%rd32+8];\n" +" mul.ftz.f32 %f82, %f80, %f72;\n" +" sub.ftz.f32 %f83, %f82, %f81;\n" +" mul.ftz.f32 %f84, %f79, %f83;\n" +" bra.uni $Lt_0_27394;\n" +"$Lt_0_27650:\n" +" .loc 16 192 0\n" +" mov.f32 %f84, 0f00000000; \n" +"$Lt_0_27394:\n" +" setp.lt.ftz.f32 %p8, %f63, %f21;\n" +" @!%p8 bra $Lt_0_29186;\n" +" .loc 16 196 0\n" +" sqrt.approx.ftz.f32 %f85, %f63;\n" +" rcp.approx.ftz.f32 %f86, %f85;\n" +" mov.f32 %f87, %f86;\n" +" .loc 16 197 0\n" +" mov.u32 %r49, %r39;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" tex.1d.v4.f32.s32 {%f88,%f89,%f90,%f91},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f92, %f88;\n" +" ld.param.f32 %f93, [__cudaparm_kernel_pair_qqrd2e];\n" +" mul.ftz.f32 %f94, %f93, %f40;\n" +" mul.ftz.f32 %f95, %f94, %f92;\n" +" mov.f32 %f96, %f95;\n" +" .loc 16 198 0\n" +" mul.ftz.f32 %f97, %f63, %f26;\n" +" mov.f32 %f98, %f97;\n" +" .loc 16 199 0\n" +" mul.ftz.f32 %f99, %f97, %f97;\n" +" mov.f32 %f100, %f99;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_smooth];\n" +" mov.u32 %r57, 0;\n" +" setp.ne.s32 %p9, %r56, %r57;\n" +" @%p9 bra $Lt_0_29698;\n" +" .loc 16 201 0\n" +" div.approx.ftz.f32 %f101, %f86, %f63;\n" +" mov.f32 %f102, 0fc08c0000; \n" +" mov.f32 %f103, 0f40a80000; \n" +" fma.rn.ftz.f32 %f104, 
%f103, %f97, %f102;\n" +" mov.f32 %f105, 0f3ff00000; \n" +" mul.ftz.f32 %f106, %f105, %f99;\n" +" sub.ftz.f32 %f107, %f104, %f106;\n" +" mul.ftz.f32 %f108, %f27, %f107;\n" +" sub.ftz.f32 %f109, %f108, %f101;\n" +" mul.ftz.f32 %f110, %f49, %f86;\n" +" sub.ftz.f32 %f111, %f109, %f110;\n" +" mul.ftz.f32 %f112, %f95, %f111;\n" +" bra.uni $Lt_0_28930;\n" +"$Lt_0_29698:\n" +" .loc 16 205 0\n" +" mul.ftz.f32 %f113, %f99, %f97;\n" +" mov.f32 %f114, %f113;\n" +" .loc 16 206 0\n" +" div.approx.ftz.f32 %f115, %f86, %f63;\n" +" mov.f32 %f116, 0fc0d20000; \n" +" mov.f32 %f117, 0f413d0000; \n" +" fma.rn.ftz.f32 %f118, %f117, %f97, %f116;\n" +" mov.f32 %f119, 0f41070000; \n" +" mul.ftz.f32 %f120, %f119, %f99;\n" +" sub.ftz.f32 %f121, %f118, %f120;\n" +" mov.f32 %f122, 0f400c0000; \n" +" fma.rn.ftz.f32 %f123, %f122, %f113, %f121;\n" +" mul.ftz.f32 %f124, %f27, %f123;\n" +" sub.ftz.f32 %f125, %f124, %f115;\n" +" mul.ftz.f32 %f126, %f49, %f86;\n" +" sub.ftz.f32 %f127, %f125, %f126;\n" +" mul.ftz.f32 %f112, %f95, %f127;\n" +" bra.uni $Lt_0_28930;\n" +"$Lt_0_29186:\n" +" .loc 16 213 0\n" +" mov.f32 %f96, 0f00000000; \n" +" mov.f32 %f112, 0f00000000; \n" +"$Lt_0_28930:\n" +" .loc 16 218 0\n" +" fma.rn.ftz.f32 %f128, %f84, %f65, %f112;\n" +" fma.rn.ftz.f32 %f43, %f59, %f128, %f43;\n" +" .loc 16 219 0\n" +" fma.rn.ftz.f32 %f42, %f58, %f128, %f42;\n" +" .loc 16 220 0\n" +" fma.rn.ftz.f32 %f41, %f60, %f128, %f41;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p10, %r58, %r59;\n" +" @%p10 bra $Lt_0_31490;\n" +" @!%p8 bra $Lt_0_30978;\n" +" mov.f32 %f129, %f100;\n" +" mov.f32 %f130, %f98;\n" +" mov.f32 %f131, %f87;\n" +" ld.param.s32 %r60, [__cudaparm_kernel_pair_smooth];\n" +" mov.u32 %r61, 0;\n" +" setp.ne.s32 %p11, %r60, %r61;\n" +" @%p11 bra $Lt_0_31234;\n" +" .loc 16 225 0\n" +" mov.f32 %f132, 0f400c0000; \n" +" mov.f32 %f133, 0f400c0000; \n" +" mul.ftz.f32 %f134, %f133, %f130;\n" +" sub.ftz.f32 %f135, %f132, %f134;\n" +" mov.f32 %f136, 0f3fa80000; \n" +" fma.rn.ftz.f32 %f137, %f136, %f129, %f135;\n" +" mov.f32 %f138, 0f3ea00000; \n" +" mul.ftz.f32 %f139, %f129, %f138;\n" +" mul.ftz.f32 %f140, %f130, %f139;\n" +" sub.ftz.f32 %f141, %f137, %f140;\n" +" fma.rn.ftz.f32 %f142, %f24, %f141, %f131;\n" +" mul.ftz.f32 %f143, %f49, %f131;\n" +" sub.ftz.f32 %f144, %f142, %f143;\n" +" fma.rn.ftz.f32 %f44, %f96, %f144, %f44;\n" +" bra.uni $Lt_0_30978;\n" +"$Lt_0_31234:\n" +" .loc 16 230 0\n" +" mov.f32 %f145, 0f401d8000; \n" +" mov.f32 %f146, 0f40520000; \n" +" mul.ftz.f32 %f147, %f146, %f130;\n" +" sub.ftz.f32 %f148, %f145, %f147;\n" +" mov.f32 %f149, 0f403d0000; \n" +" fma.rn.ftz.f32 %f150, %f149, %f129, %f148;\n" +" mov.f32 %f151, 0f3fb40000; \n" +" mov.f32 %f152, %f114;\n" +" mul.ftz.f32 %f153, %f151, %f152;\n" +" sub.ftz.f32 %f154, %f150, %f153;\n" +" mov.f32 %f155, 0f3e8c0000; \n" +" mul.ftz.f32 %f156, %f129, %f155;\n" +" fma.rn.ftz.f32 %f157, %f129, %f156, %f154;\n" +" fma.rn.ftz.f32 %f158, %f24, %f157, %f131;\n" +" fma.rn.ftz.f32 %f44, %f96, %f158, %f44;\n" +"$Lt_0_30978:\n" +"$Lt_0_30466:\n" +" @!%p5 bra $Lt_0_31490;\n" +" .loc 16 236 0\n" +" ld.param.u64 %rd35, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd36, %rd35, %rd31;\n" +" ld.global.v4.f32 {_,%f159,%f160,%f161}, [%rd36+0];\n" +" mov.f32 %f162, %f71;\n" +" mul.ftz.f32 %f163, %f162, %f46;\n" +" mov.f32 %f164, %f73;\n" +" mul.ftz.f32 %f165, %f159, %f164;\n" +" sub.ftz.f32 %f166, %f165, %f160;\n" +" mul.ftz.f32 %f167, %f163, %f166;\n" +" sub.ftz.f32 %f168, %f167, %f161;\n" +" add.ftz.f32 %f45, %f45, 
%f168;\n" +"$Lt_0_31490:\n" +"$Lt_0_29954:\n" +" ld.param.s32 %r62, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r63, 0;\n" +" setp.le.s32 %p12, %r62, %r63;\n" +" @%p12 bra $Lt_0_32002;\n" +" .loc 16 241 0\n" +" mov.f32 %f169, %f10;\n" +" mul.ftz.f32 %f170, %f59, %f59;\n" +" fma.rn.ftz.f32 %f171, %f128, %f170, %f169;\n" +" mov.f32 %f10, %f171;\n" +" .loc 16 242 0\n" +" mov.f32 %f172, %f12;\n" +" fma.rn.ftz.f32 %f173, %f128, %f61, %f172;\n" +" mov.f32 %f12, %f173;\n" +" .loc 16 243 0\n" +" mov.f32 %f174, %f14;\n" +" mul.ftz.f32 %f175, %f60, %f60;\n" +" fma.rn.ftz.f32 %f176, %f128, %f175, %f174;\n" +" mov.f32 %f14, %f176;\n" +" .loc 16 244 0\n" +" mov.f32 %f177, %f16;\n" +" mul.ftz.f32 %f178, %f58, %f59;\n" +" fma.rn.ftz.f32 %f179, %f128, %f178, %f177;\n" +" mov.f32 %f16, %f179;\n" +" .loc 16 245 0\n" +" mov.f32 %f180, %f18;\n" +" mul.ftz.f32 %f181, %f59, %f60;\n" +" fma.rn.ftz.f32 %f182, %f128, %f181, %f180;\n" +" mov.f32 %f18, %f182;\n" +" .loc 16 246 0\n" +" mul.ftz.f32 %f183, %f58, %f60;\n" +" fma.rn.ftz.f32 %f19, %f128, %f183, %f19;\n" +" mov.f32 %f20, %f19;\n" +"$Lt_0_32002:\n" +"$Lt_0_26882:\n" +" .loc 16 157 0\n" +" mul.lo.u64 %rd37, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd37;\n" +" setp.lt.u64 %p13, %rd16, %rd13;\n" +" @%p13 bra $Lt_0_26626;\n" +" bra.uni $Lt_0_25090;\n" +"$Lt_0_38402:\n" +" mov.f32 %f41, 0f00000000; \n" +" mov.f32 %f42, 0f00000000; \n" +" mov.f32 %f43, 0f00000000; \n" +" mov.f32 %f44, 0f00000000; \n" +" mov.f32 %f45, 0f00000000; \n" +" bra.uni $Lt_0_25090;\n" +"$Lt_0_25346:\n" +" mov.f32 %f41, 0f00000000; \n" +" mov.f32 %f42, 0f00000000; \n" +" mov.f32 %f43, 0f00000000; \n" +" mov.f32 %f44, 0f00000000; \n" +" mov.f32 %f45, 0f00000000; \n" +"$Lt_0_25090:\n" +" mov.u32 %r64, 1;\n" +" setp.le.s32 %p14, %r1, %r64;\n" +" @%p14 bra $Lt_0_34818;\n" +" .loc 16 257 0\n" +" mov.u64 %rd38, __cuda___cuda_local_var_32647_35_non_const_red_acc144;\n" +" cvt.s64.s32 %rd39, %r2;\n" +" mul.wide.s32 %rd40, %r2, 4;\n" +" add.u64 %rd41, %rd38, %rd40;\n" +" mov.f32 %f184, %f43;\n" +" st.shared.f32 [%rd41+0], %f184;\n" +" .loc 16 258 0\n" +" mov.f32 %f185, %f42;\n" +" st.shared.f32 [%rd41+512], %f185;\n" +" .loc 16 259 0\n" +" mov.f32 %f186, %f41;\n" +" st.shared.f32 [%rd41+1024], %f186;\n" +" .loc 16 260 0\n" +" mov.f32 %f187, %f45;\n" +" st.shared.f32 [%rd41+1536], %f187;\n" +" .loc 16 261 0\n" +" mov.f32 %f188, %f44;\n" +" st.shared.f32 [%rd41+2048], %f188;\n" +" .loc 16 263 0\n" +" shr.s32 %r65, %r1, 31;\n" +" mov.s32 %r66, 1;\n" +" and.b32 %r67, %r65, %r66;\n" +" add.s32 %r68, %r67, %r1;\n" +" shr.s32 %r69, %r68, 1;\n" +" mov.s32 %r70, %r69;\n" +" mov.u32 %r71, 0;\n" +" setp.ne.u32 %p15, %r69, %r71;\n" +" @!%p15 bra $Lt_0_33282;\n" +"$Lt_0_33794:\n" +" setp.ge.u32 %p16, %r6, %r70;\n" +" @%p16 bra $Lt_0_34050;\n" +" .loc 16 266 0\n" +" add.u32 %r72, %r2, %r70;\n" +" cvt.u64.u32 %rd42, %r72;\n" +" mul.wide.u32 %rd43, %r72, 4;\n" +" add.u64 %rd44, %rd38, %rd43;\n" +" ld.shared.f32 %f189, [%rd44+0];\n" +" add.ftz.f32 %f184, %f189, %f184;\n" +" st.shared.f32 [%rd41+0], %f184;\n" +" ld.shared.f32 %f190, [%rd44+512];\n" +" add.ftz.f32 %f185, %f190, %f185;\n" +" st.shared.f32 [%rd41+512], %f185;\n" +" ld.shared.f32 %f191, [%rd44+1024];\n" +" add.ftz.f32 %f186, %f191, %f186;\n" +" st.shared.f32 [%rd41+1024], %f186;\n" +" ld.shared.f32 %f192, [%rd44+1536];\n" +" add.ftz.f32 %f187, %f192, %f187;\n" +" st.shared.f32 [%rd41+1536], %f187;\n" +" ld.shared.f32 %f193, [%rd44+2048];\n" +" add.ftz.f32 %f188, %f193, %f188;\n" +" st.shared.f32 [%rd41+2048], %f188;\n" +"$Lt_0_34050:\n" +" .loc 16 263 
0\n" +" shr.u32 %r70, %r70, 1;\n" +" mov.u32 %r73, 0;\n" +" setp.ne.u32 %p17, %r70, %r73;\n" +" @%p17 bra $Lt_0_33794;\n" +"$Lt_0_33282:\n" +" .loc 16 270 0\n" +" mov.f32 %f43, %f184;\n" +" .loc 16 271 0\n" +" mov.f32 %f42, %f185;\n" +" .loc 16 272 0\n" +" mov.f32 %f41, %f186;\n" +" .loc 16 273 0\n" +" mov.f32 %f45, %f187;\n" +" .loc 16 274 0\n" +" mov.f32 %f44, %f188;\n" +" ld.param.s32 %r74, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r75, 0;\n" +" setp.le.s32 %p18, %r74, %r75;\n" +" @%p18 bra $Lt_0_34818;\n" +" .loc 16 278 0\n" +" mov.f32 %f184, %f10;\n" +" st.shared.f32 [%rd41+0], %f184;\n" +" mov.f32 %f185, %f12;\n" +" st.shared.f32 [%rd41+512], %f185;\n" +" mov.f32 %f186, %f14;\n" +" st.shared.f32 [%rd41+1024], %f186;\n" +" mov.f32 %f187, %f16;\n" +" st.shared.f32 [%rd41+1536], %f187;\n" +" mov.f32 %f188, %f18;\n" +" st.shared.f32 [%rd41+2048], %f188;\n" +" mov.f32 %f194, %f20;\n" +" st.shared.f32 [%rd41+2560], %f194;\n" +" .loc 16 280 0\n" +" mov.s32 %r76, %r69;\n" +" @!%p15 bra $Lt_0_35330;\n" +"$Lt_0_35842:\n" +" setp.ge.u32 %p19, %r6, %r76;\n" +" @%p19 bra $Lt_0_36098;\n" +" .loc 16 283 0\n" +" add.u32 %r77, %r2, %r76;\n" +" cvt.u64.u32 %rd45, %r77;\n" +" mul.wide.u32 %rd46, %r77, 4;\n" +" add.u64 %rd47, %rd38, %rd46;\n" +" ld.shared.f32 %f195, [%rd47+0];\n" +" add.ftz.f32 %f184, %f195, %f184;\n" +" st.shared.f32 [%rd41+0], %f184;\n" +" ld.shared.f32 %f196, [%rd47+512];\n" +" add.ftz.f32 %f185, %f196, %f185;\n" +" st.shared.f32 [%rd41+512], %f185;\n" +" ld.shared.f32 %f197, [%rd47+1024];\n" +" add.ftz.f32 %f186, %f197, %f186;\n" +" st.shared.f32 [%rd41+1024], %f186;\n" +" ld.shared.f32 %f198, [%rd47+1536];\n" +" add.ftz.f32 %f187, %f198, %f187;\n" +" st.shared.f32 [%rd41+1536], %f187;\n" +" ld.shared.f32 %f199, [%rd47+2048];\n" +" add.ftz.f32 %f188, %f199, %f188;\n" +" st.shared.f32 [%rd41+2048], %f188;\n" +" ld.shared.f32 %f200, [%rd47+2560];\n" +" add.ftz.f32 %f194, %f200, %f194;\n" +" st.shared.f32 [%rd41+2560], %f194;\n" +"$Lt_0_36098:\n" +" .loc 16 280 0\n" +" shr.u32 %r76, %r76, 1;\n" +" mov.u32 %r78, 0;\n" +" setp.ne.u32 %p20, %r76, %r78;\n" +" @%p20 bra $Lt_0_35842;\n" +"$Lt_0_35330:\n" +" .loc 16 288 0\n" +" mov.f32 %f10, %f184;\n" +" mov.f32 %f12, %f185;\n" +" mov.f32 %f14, %f186;\n" +" mov.f32 %f16, %f187;\n" +" mov.f32 %f18, %f188;\n" +" mov.f32 %f20, %f194;\n" +"$Lt_0_34818:\n" +"$Lt_0_32770:\n" +" selp.s32 %r79, 1, 0, %p1;\n" +" mov.s32 %r80, 0;\n" +" set.eq.u32.s32 %r81, %r6, %r80;\n" +" neg.s32 %r82, %r81;\n" +" and.b32 %r83, %r79, %r82;\n" +" mov.u32 %r84, 0;\n" +" setp.eq.s32 %p21, %r83, %r84;\n" +" @%p21 bra $Lt_0_36866;\n" +" .loc 16 294 0\n" +" cvt.s64.s32 %rd48, %r9;\n" +" ld.param.u64 %rd49, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd50, %r9, 4;\n" +" add.u64 %rd51, %rd49, %rd50;\n" +" ld.param.s32 %r85, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r86, 0;\n" +" setp.le.s32 %p22, %r85, %r86;\n" +" @%p22 bra $Lt_0_37378;\n" +" .loc 16 296 0\n" +" st.global.f32 [%rd51+0], %f45;\n" +" .loc 16 297 0\n" +" cvt.s64.s32 %rd52, %r10;\n" +" mul.wide.s32 %rd53, %r10, 4;\n" +" add.u64 %rd54, %rd53, %rd51;\n" +" .loc 16 298 0\n" +" st.global.f32 [%rd54+0], %f44;\n" +" .loc 16 299 0\n" +" add.u64 %rd51, %rd53, %rd54;\n" +"$Lt_0_37378:\n" +" ld.param.s32 %r87, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r88, 0;\n" +" setp.le.s32 %p23, %r87, %r88;\n" +" @%p23 bra $Lt_0_37890;\n" +" .loc 16 303 0\n" +" mov.f32 %f201, %f10;\n" +" st.global.f32 [%rd51+0], %f201;\n" +" .loc 16 304 0\n" +" cvt.s64.s32 %rd55, %r10;\n" +" mul.wide.s32 %rd56, %r10, 4;\n" +" 
add.u64 %rd57, %rd56, %rd51;\n" +" .loc 16 303 0\n" +" mov.f32 %f202, %f12;\n" +" st.global.f32 [%rd57+0], %f202;\n" +" .loc 16 304 0\n" +" add.u64 %rd58, %rd56, %rd57;\n" +" .loc 16 303 0\n" +" mov.f32 %f203, %f14;\n" +" st.global.f32 [%rd58+0], %f203;\n" +" .loc 16 304 0\n" +" add.u64 %rd59, %rd56, %rd58;\n" +" .loc 16 303 0\n" +" mov.f32 %f204, %f16;\n" +" st.global.f32 [%rd59+0], %f204;\n" +" .loc 16 304 0\n" +" add.u64 %rd51, %rd56, %rd59;\n" +" .loc 16 303 0\n" +" mov.f32 %f205, %f18;\n" +" st.global.f32 [%rd51+0], %f205;\n" +" mov.f32 %f206, %f20;\n" +" add.u64 %rd60, %rd56, %rd51;\n" +" st.global.f32 [%rd60+0], %f206;\n" +"$Lt_0_37890:\n" +" .loc 16 307 0\n" +" ld.param.u64 %rd61, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd62, %rd48, 16;\n" +" add.u64 %rd63, %rd61, %rd62;\n" +" mov.f32 %f207, %f208;\n" +" st.global.v4.f32 [%rd63+0], {%f43,%f42,%f41,%f207};\n" +"$Lt_0_36866:\n" +" .loc 16 309 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" +" .param .s32 __cudaparm_kernel_pair_fast_smooth,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<90>;\n" +" .reg .u64 %rd<76>;\n" +" .reg .f32 %f<215>;\n" +" .reg .pred %p<27>;\n" +" .shared .f32 __cuda_local_var_32737_33_non_const__ia;\n" +" .shared .f32 __cuda_local_var_32738_33_non_const__ia2;\n" +" .shared .f32 __cuda_local_var_32739_33_non_const__ia3;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32719_33_non_const_sp_lj3320[32];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32717_34_non_const_lj13360[1936];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32718_34_non_const_lj35296[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32866_35_non_const_red_acc7232[3072];\n" +" .loc 16 319 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 7;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_26626;\n" +" .loc 16 329 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32719_33_non_const_sp_lj3320;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_26626:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32719_33_non_const_sp_lj3320;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_27138;\n" +" .loc 16 331 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32717_34_non_const_lj13360;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32718_34_non_const_lj35296;\n" +" cvt.s64.s32 %rd9, %r1;\n" +" mul.wide.s32 %rd10, %r1, 16;\n" +" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" 
add.u64 %rd12, %rd11, %rd10;\n" +" add.u64 %rd13, %rd10, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n" +" st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5};\n" +" .loc 16 332 0\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd15, %rd14, %rd10;\n" +" add.u64 %rd16, %rd10, %rd8;\n" +" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" +" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" +"$Lt_1_27138:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32717_34_non_const_lj13360;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32718_34_non_const_lj35296;\n" +" .loc 16 343 0\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" mov.f32 %f20, 0f00000000; \n" +" mov.f32 %f21, %f20;\n" +" .loc 16 348 0\n" +" ld.param.f32 %f22, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" +" sqrt.approx.ftz.f32 %f23, %f22;\n" +" mov.f32 %f24, 0fbf800000; \n" +" div.approx.ftz.f32 %f25, %f24, %f23;\n" +" st.shared.f32 [__cuda_local_var_32737_33_non_const__ia], %f25;\n" +" .loc 16 349 0\n" +" rcp.approx.ftz.f32 %f26, %f22;\n" +" st.shared.f32 [__cuda_local_var_32738_33_non_const__ia2], %f26;\n" +" .loc 16 350 0\n" +" mul.ftz.f32 %f27, %f26, %f25;\n" +" st.shared.f32 [__cuda_local_var_32739_33_non_const__ia3], %f27;\n" +" .loc 16 351 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r5, %r1, %r4;\n" +" cvt.s32.u32 %r6, %ntid.x;\n" +" div.s32 %r7, %r6, %r4;\n" +" rem.s32 %r8, %r1, %r4;\n" +" cvt.s32.u32 %r9, %ctaid.x;\n" +" mul.lo.s32 %r10, %r9, %r7;\n" +" add.s32 %r11, %r5, %r10;\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p3, %r11, %r12;\n" +" @!%p3 bra $Lt_1_27906;\n" +" .loc 16 355 0\n" +" cvt.s64.s32 %rd17, %r11;\n" +" mul.wide.s32 %rd18, %r11, 4;\n" +" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd20, %rd18, %rd19;\n" +" ld.global.s32 %r13, [%rd20+0];\n" +" .loc 16 357 0\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd21, %r14;\n" +" mul.wide.s32 %rd22, %r14, 4;\n" +" add.u64 %rd23, %rd22, %rd20;\n" +" ld.global.s32 %r15, [%rd23+0];\n" +" add.u64 %rd24, %rd22, %rd23;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p4, %rd25, %rd19;\n" +" @%p4 bra $Lt_1_28418;\n" +" .loc 16 363 0\n" +" cvt.s32.s64 %r16, %rd21;\n" +" mul.lo.s32 %r17, %r16, %r15;\n" +" cvt.s64.s32 %rd26, %r17;\n" +" mul.wide.s32 %rd27, %r17, 4;\n" +" add.u64 %rd28, %rd24, %rd27;\n" +" .loc 16 364 0\n" +" mul.lo.s32 %r18, %r8, %r16;\n" +" cvt.s64.s32 %rd29, %r18;\n" +" mul.wide.s32 %rd30, %r18, 4;\n" +" add.u64 %rd31, %rd24, %rd30;\n" +" .loc 16 365 0\n" +" mul.lo.s32 %r19, %r16, %r4;\n" +" bra.uni $Lt_1_28162;\n" +"$Lt_1_28418:\n" +" .loc 16 367 0\n" +" ld.global.s32 %r20, [%rd24+0];\n" +" cvt.s64.s32 %rd32, %r20;\n" +" mul.wide.s32 %rd33, %r20, 4;\n" +" add.u64 %rd34, %rd25, %rd33;\n" +" .loc 16 368 0\n" +" cvt.s64.s32 %rd35, %r15;\n" +" mul.wide.s32 %rd36, %r15, 4;\n" +" add.u64 %rd28, %rd34, %rd36;\n" +" .loc 16 369 0\n" +" mov.s32 %r19, %r4;\n" +" .loc 16 370 0\n" +" cvt.s64.s32 %rd37, %r8;\n" +" mul.wide.s32 %rd38, %r8, 4;\n" +" add.u64 %rd31, %rd34, %rd38;\n" +"$Lt_1_28162:\n" +" .loc 16 373 0\n" +" mov.u32 %r21, %r13;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" 
mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" tex.1d.v4.f32.s32 {%f28,%f29,%f30,%f31},[pos_tex,{%r21,%r23,%r25,%r27}];\n" +" mov.f32 %f32, %f28;\n" +" mov.f32 %f33, %f29;\n" +" mov.f32 %f34, %f30;\n" +" mov.f32 %f35, %f31;\n" +" .loc 16 374 0\n" +" mov.u32 %r28, %r13;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" mov.s32 %r33, 0;\n" +" mov.u32 %r34, %r33;\n" +" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[q_tex,{%r28,%r30,%r32,%r34}];\n" +" mov.f32 %f40, %f36;\n" +" setp.ge.u64 %p5, %rd31, %rd28;\n" +" @%p5 bra $Lt_1_40962;\n" +" cvt.rzi.ftz.s32.f32 %r35, %f35;\n" +" cvt.s64.s32 %rd39, %r19;\n" +" mul.lo.s32 %r36, %r35, 11;\n" +" cvt.rn.f32.s32 %f41, %r36;\n" +" mov.f32 %f42, 0f00000000; \n" +" mov.f32 %f43, 0f00000000; \n" +" mov.f32 %f44, 0f00000000; \n" +" mov.f32 %f45, 0f00000000; \n" +" mov.f32 %f46, 0f00000000; \n" +"$Lt_1_29186:\n" +" .loc 16 379 0\n" +" ld.global.s32 %r37, [%rd31+0];\n" +" .loc 16 382 0\n" +" shr.s32 %r38, %r37, 30;\n" +" and.b32 %r39, %r38, 3;\n" +" cvt.s64.s32 %rd40, %r39;\n" +" mul.wide.s32 %rd41, %r39, 4;\n" +" add.u64 %rd42, %rd1, %rd41;\n" +" ld.shared.f32 %f47, [%rd42+0];\n" +" .loc 16 383 0\n" +" mov.f32 %f48, 0f3f800000; \n" +" ld.shared.f32 %f49, [%rd42+16];\n" +" sub.ftz.f32 %f50, %f48, %f49;\n" +" .loc 16 386 0\n" +" and.b32 %r40, %r37, 1073741823;\n" +" mov.u32 %r41, %r40;\n" +" mov.s32 %r42, 0;\n" +" mov.u32 %r43, %r42;\n" +" mov.s32 %r44, 0;\n" +" mov.u32 %r45, %r44;\n" +" mov.s32 %r46, 0;\n" +" mov.u32 %r47, %r46;\n" +" tex.1d.v4.f32.s32 {%f51,%f52,%f53,%f54},[pos_tex,{%r41,%r43,%r45,%r47}];\n" +" mov.f32 %f55, %f51;\n" +" mov.f32 %f56, %f52;\n" +" mov.f32 %f57, %f53;\n" +" mov.f32 %f58, %f54;\n" +" sub.ftz.f32 %f59, %f33, %f56;\n" +" sub.ftz.f32 %f60, %f32, %f55;\n" +" sub.ftz.f32 %f61, %f34, %f57;\n" +" mul.ftz.f32 %f62, %f59, %f59;\n" +" fma.rn.ftz.f32 %f63, %f60, %f60, %f62;\n" +" fma.rn.ftz.f32 %f64, %f61, %f61, %f63;\n" +" add.ftz.f32 %f65, %f41, %f58;\n" +" cvt.rzi.ftz.s32.f32 %r48, %f65;\n" +" cvt.s64.s32 %rd43, %r48;\n" +" mul.wide.s32 %rd44, %r48, 16;\n" +" add.u64 %rd45, %rd44, %rd7;\n" +" ld.shared.f32 %f66, [%rd45+0];\n" +" setp.gt.ftz.f32 %p6, %f66, %f64;\n" +" @!%p6 bra $Lt_1_34562;\n" +" rcp.approx.ftz.f32 %f67, %f64;\n" +" ld.shared.f32 %f68, [%rd45+4];\n" +" setp.lt.ftz.f32 %p7, %f64, %f68;\n" +" @!%p7 bra $Lt_1_30210;\n" +" add.u64 %rd46, %rd44, %rd8;\n" +" ld.shared.f32 %f69, [%rd46+0];\n" +" mov.f32 %f70, 0f40000000; \n" +" setp.eq.ftz.f32 %p8, %f69, %f70;\n" +" @!%p8 bra $Lt_1_30722;\n" +" .loc 16 401 0\n" +" mul.ftz.f32 %f71, %f67, %f67;\n" +" mov.f32 %f72, %f71;\n" +" mov.f32 %f73, %f72;\n" +" .loc 16 402 0\n" +" mul.ftz.f32 %f74, %f71, %f71;\n" +" mov.f32 %f75, %f74;\n" +" bra.uni $Lt_1_30978;\n" +"$Lt_1_30722:\n" +" mov.f32 %f76, 0f3f800000; \n" +" setp.eq.ftz.f32 %p9, %f69, %f76;\n" +" @!%p9 bra $Lt_1_31234;\n" +" .loc 16 404 0\n" +" sqrt.approx.ftz.f32 %f77, %f67;\n" +" mul.ftz.f32 %f78, %f67, %f77;\n" +" mov.f32 %f74, %f78;\n" +" mov.f32 %f75, %f74;\n" +" .loc 16 405 0\n" +" mul.ftz.f32 %f72, %f78, %f78;\n" +" mov.f32 %f73, %f72;\n" +" bra.uni $Lt_1_30978;\n" +"$Lt_1_31234:\n" +" .loc 16 407 0\n" +" mul.ftz.f32 %f79, %f67, %f67;\n" +" mul.ftz.f32 %f80, %f67, %f79;\n" +" mov.f32 %f72, %f80;\n" +" mov.f32 %f73, %f72;\n" +" .loc 16 408 0\n" +" mov.f32 %f74, %f80;\n" +" mov.f32 %f75, %f74;\n" +"$Lt_1_30978:\n" +"$Lt_1_30466:\n" +" .loc 16 410 0\n" +" mul.ftz.f32 %f81, %f47, %f72;\n" +" ld.shared.v2.f32 {%f82,%f83}, 
[%rd45+8];\n" +" mul.ftz.f32 %f84, %f82, %f74;\n" +" sub.ftz.f32 %f85, %f84, %f83;\n" +" mul.ftz.f32 %f86, %f81, %f85;\n" +" bra.uni $Lt_1_29954;\n" +"$Lt_1_30210:\n" +" .loc 16 412 0\n" +" mov.f32 %f86, 0f00000000; \n" +"$Lt_1_29954:\n" +" setp.lt.ftz.f32 %p10, %f64, %f22;\n" +" @!%p10 bra $Lt_1_31746;\n" +" .loc 16 416 0\n" +" sqrt.approx.ftz.f32 %f87, %f64;\n" +" rcp.approx.ftz.f32 %f88, %f87;\n" +" mov.f32 %f89, %f88;\n" +" .loc 16 417 0\n" +" mov.u32 %r49, %r40;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" tex.1d.v4.f32.s32 {%f90,%f91,%f92,%f93},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f94, %f90;\n" +" ld.param.f32 %f95, [__cudaparm_kernel_pair_fast_qqrd2e];\n" +" mul.ftz.f32 %f96, %f95, %f40;\n" +" mul.ftz.f32 %f97, %f96, %f94;\n" +" mov.f32 %f98, %f97;\n" +" .loc 16 418 0\n" +" ld.shared.f32 %f99, [__cuda_local_var_32738_33_non_const__ia2];\n" +" mul.ftz.f32 %f100, %f99, %f64;\n" +" mov.f32 %f101, %f100;\n" +" .loc 16 419 0\n" +" mul.ftz.f32 %f102, %f100, %f100;\n" +" mov.f32 %f103, %f102;\n" +" ld.shared.f32 %f104, [__cuda_local_var_32739_33_non_const__ia3];\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_fast_smooth];\n" +" mov.u32 %r57, 0;\n" +" setp.ne.s32 %p11, %r56, %r57;\n" +" @%p11 bra $Lt_1_32258;\n" +" .loc 16 421 0\n" +" div.approx.ftz.f32 %f105, %f88, %f64;\n" +" mov.f32 %f106, 0fc08c0000; \n" +" mov.f32 %f107, 0f40a80000; \n" +" fma.rn.ftz.f32 %f108, %f107, %f100, %f106;\n" +" mov.f32 %f109, 0f3ff00000; \n" +" mul.ftz.f32 %f110, %f109, %f102;\n" +" sub.ftz.f32 %f111, %f108, %f110;\n" +" mul.ftz.f32 %f112, %f104, %f111;\n" +" sub.ftz.f32 %f113, %f112, %f105;\n" +" mul.ftz.f32 %f114, %f50, %f88;\n" +" sub.ftz.f32 %f115, %f113, %f114;\n" +" mul.ftz.f32 %f116, %f97, %f115;\n" +" bra.uni $Lt_1_31490;\n" +"$Lt_1_32258:\n" +" .loc 16 425 0\n" +" mul.ftz.f32 %f117, %f102, %f100;\n" +" mov.f32 %f118, %f117;\n" +" .loc 16 426 0\n" +" div.approx.ftz.f32 %f119, %f88, %f64;\n" +" mov.f32 %f120, 0fc0d20000; \n" +" mov.f32 %f121, 0f413d0000; \n" +" fma.rn.ftz.f32 %f122, %f121, %f100, %f120;\n" +" mov.f32 %f123, 0f41070000; \n" +" mul.ftz.f32 %f124, %f123, %f102;\n" +" sub.ftz.f32 %f125, %f122, %f124;\n" +" mov.f32 %f126, 0f400c0000; \n" +" fma.rn.ftz.f32 %f127, %f126, %f117, %f125;\n" +" mul.ftz.f32 %f128, %f104, %f127;\n" +" sub.ftz.f32 %f129, %f128, %f119;\n" +" mul.ftz.f32 %f130, %f50, %f88;\n" +" sub.ftz.f32 %f131, %f129, %f130;\n" +" mul.ftz.f32 %f116, %f97, %f131;\n" +" bra.uni $Lt_1_31490;\n" +"$Lt_1_31746:\n" +" .loc 16 433 0\n" +" mov.f32 %f98, 0f00000000; \n" +" mov.f32 %f116, 0f00000000; \n" +"$Lt_1_31490:\n" +" .loc 16 438 0\n" +" fma.rn.ftz.f32 %f132, %f86, %f67, %f116;\n" +" fma.rn.ftz.f32 %f44, %f60, %f132, %f44;\n" +" .loc 16 439 0\n" +" fma.rn.ftz.f32 %f43, %f59, %f132, %f43;\n" +" .loc 16 440 0\n" +" fma.rn.ftz.f32 %f42, %f61, %f132, %f42;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p12, %r58, %r59;\n" +" @%p12 bra $Lt_1_34050;\n" +" @!%p10 bra $Lt_1_33538;\n" +" mov.f32 %f133, %f103;\n" +" mov.f32 %f134, %f101;\n" +" mov.f32 %f135, %f89;\n" +" ld.shared.f32 %f136, [__cuda_local_var_32737_33_non_const__ia];\n" +" ld.param.s32 %r60, [__cudaparm_kernel_pair_fast_smooth];\n" +" mov.u32 %r61, 0;\n" +" setp.ne.s32 %p13, %r60, %r61;\n" +" @%p13 bra $Lt_1_33794;\n" +" .loc 16 445 0\n" +" mov.f32 %f137, 0f400c0000; \n" +" mov.f32 %f138, 0f400c0000; \n" +" mul.ftz.f32 %f139, %f138, %f134;\n" +" 
sub.ftz.f32 %f140, %f137, %f139;\n" +" mov.f32 %f141, 0f3fa80000; \n" +" fma.rn.ftz.f32 %f142, %f141, %f133, %f140;\n" +" mov.f32 %f143, 0f3ea00000; \n" +" mul.ftz.f32 %f144, %f133, %f143;\n" +" mul.ftz.f32 %f145, %f134, %f144;\n" +" sub.ftz.f32 %f146, %f142, %f145;\n" +" fma.rn.ftz.f32 %f147, %f136, %f146, %f135;\n" +" mul.ftz.f32 %f148, %f50, %f135;\n" +" sub.ftz.f32 %f149, %f147, %f148;\n" +" fma.rn.ftz.f32 %f45, %f98, %f149, %f45;\n" +" bra.uni $Lt_1_33538;\n" +"$Lt_1_33794:\n" +" .loc 16 450 0\n" +" mov.f32 %f150, 0f401d8000; \n" +" mov.f32 %f151, 0f40520000; \n" +" mul.ftz.f32 %f152, %f151, %f134;\n" +" sub.ftz.f32 %f153, %f150, %f152;\n" +" mov.f32 %f154, 0f403d0000; \n" +" fma.rn.ftz.f32 %f155, %f154, %f133, %f153;\n" +" mov.f32 %f156, 0f3fb40000; \n" +" mov.f32 %f157, %f118;\n" +" mul.ftz.f32 %f158, %f156, %f157;\n" +" sub.ftz.f32 %f159, %f155, %f158;\n" +" mov.f32 %f160, 0f3e8c0000; \n" +" mul.ftz.f32 %f161, %f133, %f160;\n" +" fma.rn.ftz.f32 %f162, %f133, %f161, %f159;\n" +" fma.rn.ftz.f32 %f163, %f136, %f162, %f135;\n" +" fma.rn.ftz.f32 %f45, %f98, %f163, %f45;\n" +"$Lt_1_33538:\n" +"$Lt_1_33026:\n" +" @!%p7 bra $Lt_1_34050;\n" +" .loc 16 455 0\n" +" add.u64 %rd47, %rd44, %rd8;\n" +" ld.shared.v4.f32 {_,%f164,%f165,%f166}, [%rd47+0];\n" +" mov.f32 %f167, %f73;\n" +" mul.ftz.f32 %f168, %f167, %f47;\n" +" mov.f32 %f169, %f75;\n" +" mul.ftz.f32 %f170, %f164, %f169;\n" +" sub.ftz.f32 %f171, %f170, %f165;\n" +" mul.ftz.f32 %f172, %f168, %f171;\n" +" sub.ftz.f32 %f173, %f172, %f166;\n" +" add.ftz.f32 %f46, %f46, %f173;\n" +"$Lt_1_34050:\n" +"$Lt_1_32514:\n" +" ld.param.s32 %r62, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r63, 0;\n" +" setp.le.s32 %p14, %r62, %r63;\n" +" @%p14 bra $Lt_1_34562;\n" +" .loc 16 460 0\n" +" mov.f32 %f174, %f11;\n" +" mul.ftz.f32 %f175, %f60, %f60;\n" +" fma.rn.ftz.f32 %f176, %f132, %f175, %f174;\n" +" mov.f32 %f11, %f176;\n" +" .loc 16 461 0\n" +" mov.f32 %f177, %f13;\n" +" fma.rn.ftz.f32 %f178, %f132, %f62, %f177;\n" +" mov.f32 %f13, %f178;\n" +" .loc 16 462 0\n" +" mov.f32 %f179, %f15;\n" +" mul.ftz.f32 %f180, %f61, %f61;\n" +" fma.rn.ftz.f32 %f181, %f132, %f180, %f179;\n" +" mov.f32 %f15, %f181;\n" +" .loc 16 463 0\n" +" mov.f32 %f182, %f17;\n" +" mul.ftz.f32 %f183, %f59, %f60;\n" +" fma.rn.ftz.f32 %f184, %f132, %f183, %f182;\n" +" mov.f32 %f17, %f184;\n" +" .loc 16 464 0\n" +" mov.f32 %f185, %f19;\n" +" mul.ftz.f32 %f186, %f60, %f61;\n" +" fma.rn.ftz.f32 %f187, %f132, %f186, %f185;\n" +" mov.f32 %f19, %f187;\n" +" .loc 16 465 0\n" +" mul.ftz.f32 %f188, %f59, %f61;\n" +" fma.rn.ftz.f32 %f20, %f132, %f188, %f20;\n" +" mov.f32 %f21, %f20;\n" +"$Lt_1_34562:\n" +"$Lt_1_29442:\n" +" .loc 16 378 0\n" +" mul.lo.u64 %rd48, %rd39, 4;\n" +" add.u64 %rd31, %rd31, %rd48;\n" +" setp.lt.u64 %p15, %rd31, %rd28;\n" +" @%p15 bra $Lt_1_29186;\n" +" bra.uni $Lt_1_27650;\n" +"$Lt_1_40962:\n" +" mov.f32 %f42, 0f00000000; \n" +" mov.f32 %f43, 0f00000000; \n" +" mov.f32 %f44, 0f00000000; \n" +" mov.f32 %f45, 0f00000000; \n" +" mov.f32 %f46, 0f00000000; \n" +" bra.uni $Lt_1_27650;\n" +"$Lt_1_27906:\n" +" mov.f32 %f42, 0f00000000; \n" +" mov.f32 %f43, 0f00000000; \n" +" mov.f32 %f44, 0f00000000; \n" +" mov.f32 %f45, 0f00000000; \n" +" mov.f32 %f46, 0f00000000; \n" +"$Lt_1_27650:\n" +" mov.u32 %r64, 1;\n" +" setp.le.s32 %p16, %r4, %r64;\n" +" @%p16 bra $Lt_1_37378;\n" +" .loc 16 476 0\n" +" mov.u64 %rd49, __cuda___cuda_local_var_32866_35_non_const_red_acc7232;\n" +" cvt.s64.s32 %rd50, %r1;\n" +" mul.wide.s32 %rd51, %r1, 4;\n" +" add.u64 %rd52, %rd49, %rd51;\n" +" 
mov.f32 %f189, %f44;\n" +" st.shared.f32 [%rd52+0], %f189;\n" +" .loc 16 477 0\n" +" mov.f32 %f190, %f43;\n" +" st.shared.f32 [%rd52+512], %f190;\n" +" .loc 16 478 0\n" +" mov.f32 %f191, %f42;\n" +" st.shared.f32 [%rd52+1024], %f191;\n" +" .loc 16 479 0\n" +" mov.f32 %f192, %f46;\n" +" st.shared.f32 [%rd52+1536], %f192;\n" +" .loc 16 480 0\n" +" mov.f32 %f193, %f45;\n" +" st.shared.f32 [%rd52+2048], %f193;\n" +" .loc 16 482 0\n" +" shr.s32 %r65, %r4, 31;\n" +" mov.s32 %r66, 1;\n" +" and.b32 %r67, %r65, %r66;\n" +" add.s32 %r68, %r67, %r4;\n" +" shr.s32 %r69, %r68, 1;\n" +" mov.s32 %r70, %r69;\n" +" mov.u32 %r71, 0;\n" +" setp.ne.u32 %p17, %r69, %r71;\n" +" @!%p17 bra $Lt_1_35842;\n" +"$Lt_1_36354:\n" +" setp.ge.u32 %p18, %r8, %r70;\n" +" @%p18 bra $Lt_1_36610;\n" +" .loc 16 485 0\n" +" add.u32 %r72, %r1, %r70;\n" +" cvt.u64.u32 %rd53, %r72;\n" +" mul.wide.u32 %rd54, %r72, 4;\n" +" add.u64 %rd55, %rd49, %rd54;\n" +" ld.shared.f32 %f194, [%rd55+0];\n" +" add.ftz.f32 %f189, %f194, %f189;\n" +" st.shared.f32 [%rd52+0], %f189;\n" +" ld.shared.f32 %f195, [%rd55+512];\n" +" add.ftz.f32 %f190, %f195, %f190;\n" +" st.shared.f32 [%rd52+512], %f190;\n" +" ld.shared.f32 %f196, [%rd55+1024];\n" +" add.ftz.f32 %f191, %f196, %f191;\n" +" st.shared.f32 [%rd52+1024], %f191;\n" +" ld.shared.f32 %f197, [%rd55+1536];\n" +" add.ftz.f32 %f192, %f197, %f192;\n" +" st.shared.f32 [%rd52+1536], %f192;\n" +" ld.shared.f32 %f198, [%rd55+2048];\n" +" add.ftz.f32 %f193, %f198, %f193;\n" +" st.shared.f32 [%rd52+2048], %f193;\n" +"$Lt_1_36610:\n" +" .loc 16 482 0\n" +" shr.u32 %r70, %r70, 1;\n" +" mov.u32 %r73, 0;\n" +" setp.ne.u32 %p19, %r70, %r73;\n" +" @%p19 bra $Lt_1_36354;\n" +"$Lt_1_35842:\n" +" .loc 16 489 0\n" +" mov.f32 %f44, %f189;\n" +" .loc 16 490 0\n" +" mov.f32 %f43, %f190;\n" +" .loc 16 491 0\n" +" mov.f32 %f42, %f191;\n" +" .loc 16 492 0\n" +" mov.f32 %f46, %f192;\n" +" .loc 16 493 0\n" +" mov.f32 %f45, %f193;\n" +" ld.param.s32 %r74, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r75, 0;\n" +" setp.le.s32 %p20, %r74, %r75;\n" +" @%p20 bra $Lt_1_37378;\n" +" .loc 16 497 0\n" +" mov.f32 %f189, %f11;\n" +" st.shared.f32 [%rd52+0], %f189;\n" +" mov.f32 %f190, %f13;\n" +" st.shared.f32 [%rd52+512], %f190;\n" +" mov.f32 %f191, %f15;\n" +" st.shared.f32 [%rd52+1024], %f191;\n" +" mov.f32 %f192, %f17;\n" +" st.shared.f32 [%rd52+1536], %f192;\n" +" mov.f32 %f193, %f19;\n" +" st.shared.f32 [%rd52+2048], %f193;\n" +" mov.f32 %f199, %f21;\n" +" st.shared.f32 [%rd52+2560], %f199;\n" +" .loc 16 499 0\n" +" mov.s32 %r76, %r69;\n" +" @!%p17 bra $Lt_1_37890;\n" +"$Lt_1_38402:\n" +" setp.ge.u32 %p21, %r8, %r76;\n" +" @%p21 bra $Lt_1_38658;\n" +" .loc 16 502 0\n" +" add.u32 %r77, %r1, %r76;\n" +" cvt.u64.u32 %rd56, %r77;\n" +" mul.wide.u32 %rd57, %r77, 4;\n" +" add.u64 %rd58, %rd49, %rd57;\n" +" ld.shared.f32 %f200, [%rd58+0];\n" +" add.ftz.f32 %f189, %f200, %f189;\n" +" st.shared.f32 [%rd52+0], %f189;\n" +" ld.shared.f32 %f201, [%rd58+512];\n" +" add.ftz.f32 %f190, %f201, %f190;\n" +" st.shared.f32 [%rd52+512], %f190;\n" +" ld.shared.f32 %f202, [%rd58+1024];\n" +" add.ftz.f32 %f191, %f202, %f191;\n" +" st.shared.f32 [%rd52+1024], %f191;\n" +" ld.shared.f32 %f203, [%rd58+1536];\n" +" add.ftz.f32 %f192, %f203, %f192;\n" +" st.shared.f32 [%rd52+1536], %f192;\n" +" ld.shared.f32 %f204, [%rd58+2048];\n" +" add.ftz.f32 %f193, %f204, %f193;\n" +" st.shared.f32 [%rd52+2048], %f193;\n" +" ld.shared.f32 %f205, [%rd58+2560];\n" +" add.ftz.f32 %f199, %f205, %f199;\n" +" st.shared.f32 [%rd52+2560], %f199;\n" +"$Lt_1_38658:\n" +" 
.loc 16 499 0\n" +" shr.u32 %r76, %r76, 1;\n" +" mov.u32 %r78, 0;\n" +" setp.ne.u32 %p22, %r76, %r78;\n" +" @%p22 bra $Lt_1_38402;\n" +"$Lt_1_37890:\n" +" .loc 16 507 0\n" +" mov.f32 %f11, %f189;\n" +" mov.f32 %f13, %f190;\n" +" mov.f32 %f15, %f191;\n" +" mov.f32 %f17, %f192;\n" +" mov.f32 %f19, %f193;\n" +" mov.f32 %f21, %f199;\n" +"$Lt_1_37378:\n" +"$Lt_1_35330:\n" +" selp.s32 %r79, 1, 0, %p3;\n" +" mov.s32 %r80, 0;\n" +" set.eq.u32.s32 %r81, %r8, %r80;\n" +" neg.s32 %r82, %r81;\n" +" and.b32 %r83, %r79, %r82;\n" +" mov.u32 %r84, 0;\n" +" setp.eq.s32 %p23, %r83, %r84;\n" +" @%p23 bra $Lt_1_39426;\n" +" .loc 16 513 0\n" +" cvt.s64.s32 %rd59, %r11;\n" +" ld.param.u64 %rd60, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd61, %r11, 4;\n" +" add.u64 %rd62, %rd60, %rd61;\n" +" ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r86, 0;\n" +" setp.le.s32 %p24, %r85, %r86;\n" +" @%p24 bra $Lt_1_39938;\n" +" .loc 16 515 0\n" +" st.global.f32 [%rd62+0], %f46;\n" +" .loc 16 516 0\n" +" cvt.s64.s32 %rd63, %r12;\n" +" mul.wide.s32 %rd64, %r12, 4;\n" +" add.u64 %rd65, %rd64, %rd62;\n" +" .loc 16 517 0\n" +" st.global.f32 [%rd65+0], %f45;\n" +" .loc 16 518 0\n" +" add.u64 %rd62, %rd64, %rd65;\n" +"$Lt_1_39938:\n" +" ld.param.s32 %r87, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r88, 0;\n" +" setp.le.s32 %p25, %r87, %r88;\n" +" @%p25 bra $Lt_1_40450;\n" +" .loc 16 522 0\n" +" mov.f32 %f206, %f11;\n" +" st.global.f32 [%rd62+0], %f206;\n" +" .loc 16 523 0\n" +" cvt.s64.s32 %rd66, %r12;\n" +" mul.wide.s32 %rd67, %r12, 4;\n" +" add.u64 %rd68, %rd67, %rd62;\n" +" .loc 16 522 0\n" +" mov.f32 %f207, %f13;\n" +" st.global.f32 [%rd68+0], %f207;\n" +" .loc 16 523 0\n" +" add.u64 %rd69, %rd67, %rd68;\n" +" .loc 16 522 0\n" +" mov.f32 %f208, %f15;\n" +" st.global.f32 [%rd69+0], %f208;\n" +" .loc 16 523 0\n" +" add.u64 %rd70, %rd67, %rd69;\n" +" .loc 16 522 0\n" +" mov.f32 %f209, %f17;\n" +" st.global.f32 [%rd70+0], %f209;\n" +" .loc 16 523 0\n" +" add.u64 %rd62, %rd67, %rd70;\n" +" .loc 16 522 0\n" +" mov.f32 %f210, %f19;\n" +" st.global.f32 [%rd62+0], %f210;\n" +" mov.f32 %f211, %f21;\n" +" add.u64 %rd71, %rd67, %rd62;\n" +" st.global.f32 [%rd71+0], %f211;\n" +"$Lt_1_40450:\n" +" .loc 16 526 0\n" +" ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd73, %rd59, 16;\n" +" add.u64 %rd74, %rd72, %rd73;\n" +" mov.f32 %f212, %f213;\n" +" st.global.v4.f32 [%rd74+0], {%f44,%f43,%f42,%f212};\n" +"$Lt_1_39426:\n" +" .loc 16 528 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/coul_long_gpu_kernel.ptx b/lib/gpu/coul_long_gpu_kernel.ptx new file mode 100644 index 000000000..45bee976e --- /dev/null +++ b/lib/gpu/coul_long_gpu_kernel.ptx @@ -0,0 +1,1031 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bf4b_00000000-9_coul_long_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.qJTqsI) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 
"/tmp/tmpxft_0000bf4b_00000000-8_coul_long_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "coul_long_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_cl_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_q_, + .param .f32 __cudaparm_kernel_pair_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_qqrd2e, + .param .f32 __cudaparm_kernel_pair_g_ewald, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<81>; + .reg .u64 %rd<57>; + .reg .f32 %f<132>; + .reg .pred %p<19>; + .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_cl112[16]; + .shared .align 4 .b8 __cuda___cuda_local_var_32585_35_non_const_red_acc128[3072]; + // __cuda_local_var_32505_10_non_const_f = 48 + // __cuda_local_var_32509_9_non_const_virial = 16 + .loc 16 108 0 +$LDWbegin_kernel_pair: + .loc 16 115 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 116 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 117 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 118 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4}; + .loc 16 127 0 + mov.f32 %f5, 0f00000000; // 0 + mov.f32 %f6, %f5; + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, %f7; + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, 
%r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_19202; + .loc 16 131 0 + cvt.s64.s32 %rd2, %r9; + mul.wide.s32 %rd3, %r9, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd5, %rd3, %rd4; + ld.global.s32 %r11, [%rd5+0]; + .loc 16 133 0 + ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd6, %r12; + mul.wide.s32 %rd7, %r12, 4; + add.u64 %rd8, %rd7, %rd5; + ld.global.s32 %r13, [%rd8+0]; + add.u64 %rd9, %rd7, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd4; + @%p2 bra $Lt_0_19714; + .loc 16 139 0 + cvt.s32.s64 %r14, %rd6; + mul.lo.s32 %r15, %r14, %r13; + cvt.s64.s32 %rd11, %r15; + mul.wide.s32 %rd12, %r15, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 140 0 + mul.lo.s32 %r16, %r6, %r14; + cvt.s64.s32 %rd14, %r16; + mul.wide.s32 %rd15, %r16, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 141 0 + mul.lo.s32 %r17, %r14, %r1; + bra.uni $Lt_0_19458; +$Lt_0_19714: + .loc 16 143 0 + ld.global.s32 %r18, [%rd9+0]; + cvt.s64.s32 %rd17, %r18; + mul.wide.s32 %rd18, %r18, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 144 0 + cvt.s64.s32 %rd20, %r13; + mul.wide.s32 %rd21, %r13, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 145 0 + mov.s32 %r17, %r1; + .loc 16 146 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_19458: + .loc 16 149 0 + mov.u32 %r19, %r11; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f21, %f17; + mov.f32 %f22, %f18; + mov.f32 %f23, %f19; + .loc 16 150 0 + mov.u32 %r26, %r11; + mov.s32 %r27, 0; + mov.u32 %r28, %r27; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r26,%r28,%r30,%r32}]; + mov.f32 %f28, %f24; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_27650; + cvt.s64.s32 %rd24, %r17; + ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq]; + mov.f32 %f30, 0f00000000; // 0 + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.u64 %rd25, __cuda___cuda_local_var_32498_33_non_const_sp_cl112; +$Lt_0_20482: + //<loop> Loop body line 150, nesting depth: 1, estimated iterations: unknown + .loc 16 153 0 + ld.global.s32 %r33, [%rd16+0]; + .loc 16 156 0 + mov.f32 %f34, 0f3f800000; // 1 + shr.s32 %r34, %r33, 30; + and.b32 %r35, %r34, 3; + cvt.s64.s32 %rd26, %r35; + mul.wide.s32 %rd27, %r35, 4; + add.u64 %rd28, %rd25, %rd27; + ld.shared.f32 %f35, [%rd28+0]; + sub.ftz.f32 %f36, %f34, %f35; + .loc 16 159 0 + and.b32 %r36, %r33, 1073741823; + mov.u32 %r37, %r36; + mov.s32 %r38, 0; + mov.u32 %r39, %r38; + mov.s32 %r40, 0; + mov.u32 %r41, %r40; + mov.s32 %r42, 0; + mov.u32 %r43, %r42; + tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r37,%r39,%r41,%r43}]; + mov.f32 %f41, %f37; + mov.f32 %f42, %f38; + mov.f32 %f43, %f39; + sub.ftz.f32 %f44, %f22, %f42; + sub.ftz.f32 %f45, %f21, %f41; + sub.ftz.f32 %f46, %f23, %f43; + mul.ftz.f32 %f47, %f44, %f44; + fma.rn.ftz.f32 %f48, %f45, %f45, %f47; + fma.rn.ftz.f32 %f49, %f46, %f46, %f48; + setp.lt.ftz.f32 %p4, %f49, %f29; + @!%p4 bra $Lt_0_21250; + .loc 16 175 0 + sqrt.approx.ftz.f32 %f50, %f49; + ld.param.f32 %f51, 
[__cudaparm_kernel_pair_g_ewald]; + mul.ftz.f32 %f52, %f51, %f50; + mul.ftz.f32 %f53, %f52, %f52; + mov.f32 %f54, 0f3f800000; // 1 + mov.f32 %f55, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f56, %f55, %f52, %f54; + neg.ftz.f32 %f57, %f53; + rcp.approx.ftz.f32 %f58, %f56; + mov.f32 %f59, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f60, %f57, %f59; + ex2.approx.ftz.f32 %f61, %f60; + mov.f32 %f62, 0f3e827906; // 0.25483 + mov.f32 %f63, 0fbe91a98e; // -0.284497 + mov.f32 %f64, 0f3fb5f0e3; // 1.42141 + mov.f32 %f65, 0fbfba00e3; // -1.45315 + mov.f32 %f66, 0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f67, %f66, %f58, %f65; + fma.rn.ftz.f32 %f68, %f58, %f67, %f64; + fma.rn.ftz.f32 %f69, %f58, %f68, %f63; + fma.rn.ftz.f32 %f70, %f58, %f69, %f62; + mul.ftz.f32 %f71, %f58, %f70; + mul.ftz.f32 %f72, %f61, %f71; + .loc 16 176 0 + mov.u32 %r44, %r36; + mov.s32 %r45, 0; + mov.u32 %r46, %r45; + mov.s32 %r47, 0; + mov.u32 %r48, %r47; + mov.s32 %r49, 0; + mov.u32 %r50, %r49; + tex.1d.v4.f32.s32 {%f73,%f74,%f75,%f76},[q_tex,{%r44,%r46,%r48,%r50}]; + mov.f32 %f77, %f73; + .loc 16 177 0 + ld.param.f32 %f78, [__cudaparm_kernel_pair_qqrd2e]; + mul.ftz.f32 %f79, %f78, %f28; + mul.ftz.f32 %f80, %f79, %f77; + div.approx.ftz.f32 %f81, %f80, %f50; + mov.f32 %f82, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f83, %f52, %f82; + fma.rn.ftz.f32 %f84, %f61, %f83, %f72; + sub.ftz.f32 %f85, %f84, %f36; + mul.ftz.f32 %f86, %f81, %f85; + rcp.approx.ftz.f32 %f87, %f49; + mul.ftz.f32 %f88, %f86, %f87; + .loc 16 179 0 + fma.rn.ftz.f32 %f32, %f45, %f88, %f32; + .loc 16 180 0 + fma.rn.ftz.f32 %f31, %f44, %f88, %f31; + .loc 16 181 0 + fma.rn.ftz.f32 %f30, %f46, %f88, %f30; + .loc 16 168 0 + sub.ftz.f32 %f89, %f72, %f36; + fma.rn.ftz.f32 %f90, %f81, %f89, %f33; + ld.param.s32 %r51, [__cudaparm_kernel_pair_eflag]; + mov.s32 %r52, 0; + setp.gt.s32 %p5, %r51, %r52; + selp.f32 %f33, %f90, %f33, %p5; + ld.param.s32 %r53, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r54, 0; + setp.le.s32 %p6, %r53, %r54; + @%p6 bra $Lt_0_21250; + .loc 16 187 0 + mov.f32 %f91, %f6; + mul.ftz.f32 %f92, %f45, %f45; + fma.rn.ftz.f32 %f93, %f88, %f92, %f91; + mov.f32 %f6, %f93; + .loc 16 188 0 + mov.f32 %f94, %f8; + fma.rn.ftz.f32 %f95, %f88, %f47, %f94; + mov.f32 %f8, %f95; + .loc 16 189 0 + mov.f32 %f96, %f10; + mul.ftz.f32 %f97, %f46, %f46; + fma.rn.ftz.f32 %f98, %f88, %f97, %f96; + mov.f32 %f10, %f98; + .loc 16 190 0 + mov.f32 %f99, %f12; + mul.ftz.f32 %f100, %f44, %f45; + fma.rn.ftz.f32 %f101, %f88, %f100, %f99; + mov.f32 %f12, %f101; + .loc 16 191 0 + mov.f32 %f102, %f14; + mul.ftz.f32 %f103, %f45, %f46; + fma.rn.ftz.f32 %f104, %f88, %f103, %f102; + mov.f32 %f14, %f104; + .loc 16 192 0 + mul.ftz.f32 %f105, %f44, %f46; + fma.rn.ftz.f32 %f15, %f88, %f105, %f15; + mov.f32 %f16, %f15; +$Lt_0_21250: +$Lt_0_20738: + .loc 16 152 0 + mul.lo.u64 %rd29, %rd24, 4; + add.u64 %rd16, %rd16, %rd29; + setp.lt.u64 %p7, %rd16, %rd13; + @%p7 bra $Lt_0_20482; + bra.uni $Lt_0_18946; +$Lt_0_27650: + mov.f32 %f30, 0f00000000; // 0 + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + bra.uni $Lt_0_18946; +$Lt_0_19202: + mov.f32 %f30, 0f00000000; // 0 + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 +$Lt_0_18946: + mov.u32 %r55, 1; + setp.le.s32 %p8, %r1, %r55; + @%p8 bra $Lt_0_24066; + .loc 16 203 0 + mov.u64 %rd30, __cuda___cuda_local_var_32585_35_non_const_red_acc128; + cvt.s64.s32 %rd31, %r2; + mul.wide.s32 %rd32, %r2, 4; + add.u64 %rd33, %rd30, %rd32; + mov.f32 %f106, %f32; + st.shared.f32 
[%rd33+0], %f106; + .loc 16 204 0 + mov.f32 %f107, %f31; + st.shared.f32 [%rd33+512], %f107; + .loc 16 205 0 + mov.f32 %f108, %f30; + st.shared.f32 [%rd33+1024], %f108; + .loc 16 206 0 + mov.f32 %f109, %f33; + st.shared.f32 [%rd33+1536], %f109; + .loc 16 208 0 + shr.s32 %r56, %r1, 31; + mov.s32 %r57, 1; + and.b32 %r58, %r56, %r57; + add.s32 %r59, %r58, %r1; + shr.s32 %r60, %r59, 1; + mov.s32 %r61, %r60; + mov.u32 %r62, 0; + setp.ne.u32 %p9, %r60, %r62; + @!%p9 bra $Lt_0_22530; +$Lt_0_23042: + setp.ge.u32 %p10, %r6, %r61; + @%p10 bra $Lt_0_23298; + .loc 16 211 0 + add.u32 %r63, %r2, %r61; + cvt.u64.u32 %rd34, %r63; + mul.wide.u32 %rd35, %r63, 4; + add.u64 %rd36, %rd30, %rd35; + ld.shared.f32 %f110, [%rd36+0]; + add.ftz.f32 %f106, %f110, %f106; + st.shared.f32 [%rd33+0], %f106; + ld.shared.f32 %f111, [%rd36+512]; + add.ftz.f32 %f107, %f111, %f107; + st.shared.f32 [%rd33+512], %f107; + ld.shared.f32 %f112, [%rd36+1024]; + add.ftz.f32 %f108, %f112, %f108; + st.shared.f32 [%rd33+1024], %f108; + ld.shared.f32 %f113, [%rd36+1536]; + add.ftz.f32 %f109, %f113, %f109; + st.shared.f32 [%rd33+1536], %f109; +$Lt_0_23298: + .loc 16 208 0 + shr.u32 %r61, %r61, 1; + mov.u32 %r64, 0; + setp.ne.u32 %p11, %r61, %r64; + @%p11 bra $Lt_0_23042; +$Lt_0_22530: + .loc 16 215 0 + mov.f32 %f32, %f106; + .loc 16 216 0 + mov.f32 %f31, %f107; + .loc 16 217 0 + mov.f32 %f30, %f108; + .loc 16 218 0 + mov.f32 %f33, %f109; + ld.param.s32 %r65, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r66, 0; + setp.le.s32 %p12, %r65, %r66; + @%p12 bra $Lt_0_24066; + .loc 16 222 0 + mov.f32 %f106, %f6; + st.shared.f32 [%rd33+0], %f106; + mov.f32 %f107, %f8; + st.shared.f32 [%rd33+512], %f107; + mov.f32 %f108, %f10; + st.shared.f32 [%rd33+1024], %f108; + mov.f32 %f109, %f12; + st.shared.f32 [%rd33+1536], %f109; + mov.f32 %f114, %f14; + st.shared.f32 [%rd33+2048], %f114; + mov.f32 %f115, %f16; + st.shared.f32 [%rd33+2560], %f115; + .loc 16 224 0 + mov.s32 %r67, %r60; + @!%p9 bra $Lt_0_24578; +$Lt_0_25090: + setp.ge.u32 %p13, %r6, %r67; + @%p13 bra $Lt_0_25346; + .loc 16 227 0 + add.u32 %r68, %r2, %r67; + cvt.u64.u32 %rd37, %r68; + mul.wide.u32 %rd38, %r68, 4; + add.u64 %rd39, %rd30, %rd38; + ld.shared.f32 %f116, [%rd39+0]; + add.ftz.f32 %f106, %f116, %f106; + st.shared.f32 [%rd33+0], %f106; + ld.shared.f32 %f117, [%rd39+512]; + add.ftz.f32 %f107, %f117, %f107; + st.shared.f32 [%rd33+512], %f107; + ld.shared.f32 %f118, [%rd39+1024]; + add.ftz.f32 %f108, %f118, %f108; + st.shared.f32 [%rd33+1024], %f108; + ld.shared.f32 %f119, [%rd39+1536]; + add.ftz.f32 %f109, %f119, %f109; + st.shared.f32 [%rd33+1536], %f109; + ld.shared.f32 %f120, [%rd39+2048]; + add.ftz.f32 %f114, %f120, %f114; + st.shared.f32 [%rd33+2048], %f114; + ld.shared.f32 %f121, [%rd39+2560]; + add.ftz.f32 %f115, %f121, %f115; + st.shared.f32 [%rd33+2560], %f115; +$Lt_0_25346: + .loc 16 224 0 + shr.u32 %r67, %r67, 1; + mov.u32 %r69, 0; + setp.ne.u32 %p14, %r67, %r69; + @%p14 bra $Lt_0_25090; +$Lt_0_24578: + .loc 16 232 0 + mov.f32 %f6, %f106; + mov.f32 %f8, %f107; + mov.f32 %f10, %f108; + mov.f32 %f12, %f109; + mov.f32 %f14, %f114; + mov.f32 %f16, %f115; +$Lt_0_24066: +$Lt_0_22018: + selp.s32 %r70, 1, 0, %p1; + mov.s32 %r71, 0; + set.eq.u32.s32 %r72, %r6, %r71; + neg.s32 %r73, %r72; + and.b32 %r74, %r70, %r73; + mov.u32 %r75, 0; + setp.eq.s32 %p15, %r74, %r75; + @%p15 bra $Lt_0_26114; + .loc 16 238 0 + cvt.s64.s32 %rd40, %r9; + ld.param.u64 %rd41, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd42, %r9, 4; + add.u64 %rd43, %rd41, %rd42; + ld.param.s32 %r76, 
[__cudaparm_kernel_pair_eflag]; + mov.u32 %r77, 0; + setp.le.s32 %p16, %r76, %r77; + @%p16 bra $Lt_0_26626; + .loc 16 240 0 + mov.f32 %f122, 0f00000000; // 0 + st.global.f32 [%rd43+0], %f122; + .loc 16 241 0 + cvt.s64.s32 %rd44, %r10; + mul.wide.s32 %rd45, %r10, 4; + add.u64 %rd46, %rd45, %rd43; + .loc 16 242 0 + st.global.f32 [%rd46+0], %f33; + .loc 16 243 0 + add.u64 %rd43, %rd45, %rd46; +$Lt_0_26626: + ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r79, 0; + setp.le.s32 %p17, %r78, %r79; + @%p17 bra $Lt_0_27138; + .loc 16 247 0 + mov.f32 %f123, %f6; + st.global.f32 [%rd43+0], %f123; + .loc 16 248 0 + cvt.s64.s32 %rd47, %r10; + mul.wide.s32 %rd48, %r10, 4; + add.u64 %rd49, %rd48, %rd43; + .loc 16 247 0 + mov.f32 %f124, %f8; + st.global.f32 [%rd49+0], %f124; + .loc 16 248 0 + add.u64 %rd50, %rd48, %rd49; + .loc 16 247 0 + mov.f32 %f125, %f10; + st.global.f32 [%rd50+0], %f125; + .loc 16 248 0 + add.u64 %rd51, %rd48, %rd50; + .loc 16 247 0 + mov.f32 %f126, %f12; + st.global.f32 [%rd51+0], %f126; + .loc 16 248 0 + add.u64 %rd43, %rd48, %rd51; + .loc 16 247 0 + mov.f32 %f127, %f14; + st.global.f32 [%rd43+0], %f127; + mov.f32 %f128, %f16; + add.u64 %rd52, %rd48, %rd43; + st.global.f32 [%rd52+0], %f128; +$Lt_0_27138: + .loc 16 251 0 + ld.param.u64 %rd53, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd54, %rd40, 16; + add.u64 %rd55, %rd53, %rd54; + mov.f32 %f129, %f130; + st.global.v4.f32 [%rd55+0], {%f32,%f31,%f30,%f129}; +$Lt_0_26114: + .loc 16 253 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_fast_q_, + .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, + .param .f32 __cudaparm_kernel_pair_fast_g_ewald, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<82>; + .reg .u64 %rd<61>; + .reg .f32 %f<129>; + .reg .pred %p<20>; + .shared .align 4 .b8 __cuda___cuda_local_var_32653_33_non_const_sp_cl3304[16]; + .shared .align 4 .b8 __cuda___cuda_local_var_32740_35_non_const_red_acc3320[3072]; + // __cuda_local_var_32658_10_non_const_f = 48 + // __cuda_local_var_32662_9_non_const_virial = 16 + .loc 16 263 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 3; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_19714; + .loc 16 271 0 + mov.u64 %rd1, __cuda___cuda_local_var_32653_33_non_const_sp_cl3304; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_19714: + mov.u64 %rd1, __cuda___cuda_local_var_32653_33_non_const_sp_cl3304; + .loc 16 280 0 + mov.f32 %f2, 0f00000000; // 0 + mov.f32 %f3, %f2; + mov.f32 %f4, 0f00000000; // 0 + mov.f32 %f5, %f4; + mov.f32 %f6, 0f00000000; // 0 + mov.f32 %f7, %f6; + mov.f32 %f8, 0f00000000; // 0 + mov.f32 %f9, %f8; + mov.f32 
%f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + .loc 16 282 0 + bar.sync 0; + ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r4, %r1, %r3; + cvt.s32.u32 %r5, %ntid.x; + div.s32 %r6, %r5, %r3; + rem.s32 %r7, %r1, %r3; + cvt.s32.u32 %r8, %ctaid.x; + mul.lo.s32 %r9, %r8, %r6; + add.s32 %r10, %r4, %r9; + ld.param.s32 %r11, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p2, %r10, %r11; + @!%p2 bra $Lt_1_20482; + .loc 16 286 0 + cvt.s64.s32 %rd7, %r10; + mul.wide.s32 %rd8, %r10, 4; + ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd10, %rd8, %rd9; + ld.global.s32 %r12, [%rd10+0]; + .loc 16 288 0 + ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd11, %r13; + mul.wide.s32 %rd12, %r13, 4; + add.u64 %rd13, %rd12, %rd10; + ld.global.s32 %r14, [%rd13+0]; + add.u64 %rd14, %rd12, %rd13; + ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p3, %rd15, %rd9; + @%p3 bra $Lt_1_20994; + .loc 16 294 0 + cvt.s32.s64 %r15, %rd11; + mul.lo.s32 %r16, %r15, %r14; + cvt.s64.s32 %rd16, %r16; + mul.wide.s32 %rd17, %r16, 4; + add.u64 %rd18, %rd14, %rd17; + .loc 16 295 0 + mul.lo.s32 %r17, %r7, %r15; + cvt.s64.s32 %rd19, %r17; + mul.wide.s32 %rd20, %r17, 4; + add.u64 %rd21, %rd14, %rd20; + .loc 16 296 0 + mul.lo.s32 %r18, %r15, %r3; + bra.uni $Lt_1_20738; +$Lt_1_20994: + .loc 16 298 0 + ld.global.s32 %r19, [%rd14+0]; + cvt.s64.s32 %rd22, %r19; + mul.wide.s32 %rd23, %r19, 4; + add.u64 %rd24, %rd15, %rd23; + .loc 16 299 0 + cvt.s64.s32 %rd25, %r14; + mul.wide.s32 %rd26, %r14, 4; + add.u64 %rd18, %rd24, %rd26; + .loc 16 300 0 + mov.s32 %r18, %r3; + .loc 16 301 0 + cvt.s64.s32 %rd27, %r7; + mul.wide.s32 %rd28, %r7, 4; + add.u64 %rd21, %rd24, %rd28; +$Lt_1_20738: + .loc 16 304 0 + mov.u32 %r20, %r12; + mov.s32 %r21, 0; + mov.u32 %r22, %r21; + mov.s32 %r23, 0; + mov.u32 %r24, %r23; + mov.s32 %r25, 0; + mov.u32 %r26, %r25; + tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r20,%r22,%r24,%r26}]; + mov.f32 %f18, %f14; + mov.f32 %f19, %f15; + mov.f32 %f20, %f16; + .loc 16 305 0 + mov.u32 %r27, %r12; + mov.s32 %r28, 0; + mov.u32 %r29, %r28; + mov.s32 %r30, 0; + mov.u32 %r31, %r30; + mov.s32 %r32, 0; + mov.u32 %r33, %r32; + tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r27,%r29,%r31,%r33}]; + mov.f32 %f25, %f21; + setp.ge.u64 %p4, %rd21, %rd18; + @%p4 bra $Lt_1_28930; + cvt.s64.s32 %rd29, %r18; + ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq]; + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.f32 %f29, 0f00000000; // 0 + mov.f32 %f30, 0f00000000; // 0 +$Lt_1_21762: + //<loop> Loop body line 305, nesting depth: 1, estimated iterations: unknown + .loc 16 308 0 + ld.global.s32 %r34, [%rd21+0]; + .loc 16 311 0 + mov.f32 %f31, 0f3f800000; // 1 + shr.s32 %r35, %r34, 30; + and.b32 %r36, %r35, 3; + cvt.s64.s32 %rd30, %r36; + mul.wide.s32 %rd31, %r36, 4; + add.u64 %rd32, %rd1, %rd31; + ld.shared.f32 %f32, [%rd32+0]; + sub.ftz.f32 %f33, %f31, %f32; + .loc 16 314 0 + and.b32 %r37, %r34, 1073741823; + mov.u32 %r38, %r37; + mov.s32 %r39, 0; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + mov.s32 %r43, 0; + mov.u32 %r44, %r43; + tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r38,%r40,%r42,%r44}]; + mov.f32 %f38, %f34; + mov.f32 %f39, %f35; + mov.f32 %f40, %f36; + sub.ftz.f32 %f41, %f19, %f39; + sub.ftz.f32 %f42, %f18, %f38; + sub.ftz.f32 %f43, %f20, %f40; + mul.ftz.f32 %f44, %f41, %f41; + fma.rn.ftz.f32 %f45, %f42, %f42, %f44; + 
fma.rn.ftz.f32 %f46, %f43, %f43, %f45; + setp.lt.ftz.f32 %p5, %f46, %f26; + @!%p5 bra $Lt_1_22530; + .loc 16 330 0 + sqrt.approx.ftz.f32 %f47, %f46; + ld.param.f32 %f48, [__cudaparm_kernel_pair_fast_g_ewald]; + mul.ftz.f32 %f49, %f48, %f47; + mul.ftz.f32 %f50, %f49, %f49; + mov.f32 %f51, 0f3f800000; // 1 + mov.f32 %f52, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f53, %f52, %f49, %f51; + neg.ftz.f32 %f54, %f50; + rcp.approx.ftz.f32 %f55, %f53; + mov.f32 %f56, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f57, %f54, %f56; + ex2.approx.ftz.f32 %f58, %f57; + mov.f32 %f59, 0f3e827906; // 0.25483 + mov.f32 %f60, 0fbe91a98e; // -0.284497 + mov.f32 %f61, 0f3fb5f0e3; // 1.42141 + mov.f32 %f62, 0fbfba00e3; // -1.45315 + mov.f32 %f63, 0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f64, %f63, %f55, %f62; + fma.rn.ftz.f32 %f65, %f55, %f64, %f61; + fma.rn.ftz.f32 %f66, %f55, %f65, %f60; + fma.rn.ftz.f32 %f67, %f55, %f66, %f59; + mul.ftz.f32 %f68, %f55, %f67; + mul.ftz.f32 %f69, %f58, %f68; + .loc 16 331 0 + mov.u32 %r45, %r37; + mov.s32 %r46, 0; + mov.u32 %r47, %r46; + mov.s32 %r48, 0; + mov.u32 %r49, %r48; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + tex.1d.v4.f32.s32 {%f70,%f71,%f72,%f73},[q_tex,{%r45,%r47,%r49,%r51}]; + mov.f32 %f74, %f70; + .loc 16 332 0 + ld.param.f32 %f75, [__cudaparm_kernel_pair_fast_qqrd2e]; + mul.ftz.f32 %f76, %f75, %f25; + mul.ftz.f32 %f77, %f76, %f74; + div.approx.ftz.f32 %f78, %f77, %f47; + mov.f32 %f79, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f80, %f49, %f79; + fma.rn.ftz.f32 %f81, %f58, %f80, %f69; + sub.ftz.f32 %f82, %f81, %f33; + mul.ftz.f32 %f83, %f78, %f82; + rcp.approx.ftz.f32 %f84, %f46; + mul.ftz.f32 %f85, %f83, %f84; + .loc 16 334 0 + fma.rn.ftz.f32 %f29, %f42, %f85, %f29; + .loc 16 335 0 + fma.rn.ftz.f32 %f28, %f41, %f85, %f28; + .loc 16 336 0 + fma.rn.ftz.f32 %f27, %f43, %f85, %f27; + .loc 16 323 0 + sub.ftz.f32 %f86, %f69, %f33; + fma.rn.ftz.f32 %f87, %f78, %f86, %f30; + ld.param.s32 %r52, [__cudaparm_kernel_pair_fast_eflag]; + mov.s32 %r53, 0; + setp.gt.s32 %p6, %r52, %r53; + selp.f32 %f30, %f87, %f30, %p6; + ld.param.s32 %r54, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r55, 0; + setp.le.s32 %p7, %r54, %r55; + @%p7 bra $Lt_1_22530; + .loc 16 342 0 + mov.f32 %f88, %f3; + mul.ftz.f32 %f89, %f42, %f42; + fma.rn.ftz.f32 %f90, %f85, %f89, %f88; + mov.f32 %f3, %f90; + .loc 16 343 0 + mov.f32 %f91, %f5; + fma.rn.ftz.f32 %f92, %f85, %f44, %f91; + mov.f32 %f5, %f92; + .loc 16 344 0 + mov.f32 %f93, %f7; + mul.ftz.f32 %f94, %f43, %f43; + fma.rn.ftz.f32 %f95, %f85, %f94, %f93; + mov.f32 %f7, %f95; + .loc 16 345 0 + mov.f32 %f96, %f9; + mul.ftz.f32 %f97, %f41, %f42; + fma.rn.ftz.f32 %f98, %f85, %f97, %f96; + mov.f32 %f9, %f98; + .loc 16 346 0 + mov.f32 %f99, %f11; + mul.ftz.f32 %f100, %f42, %f43; + fma.rn.ftz.f32 %f101, %f85, %f100, %f99; + mov.f32 %f11, %f101; + .loc 16 347 0 + mul.ftz.f32 %f102, %f41, %f43; + fma.rn.ftz.f32 %f12, %f85, %f102, %f12; + mov.f32 %f13, %f12; +$Lt_1_22530: +$Lt_1_22018: + .loc 16 307 0 + mul.lo.u64 %rd33, %rd29, 4; + add.u64 %rd21, %rd21, %rd33; + setp.lt.u64 %p8, %rd21, %rd18; + @%p8 bra $Lt_1_21762; + bra.uni $Lt_1_20226; +$Lt_1_28930: + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.f32 %f29, 0f00000000; // 0 + mov.f32 %f30, 0f00000000; // 0 + bra.uni $Lt_1_20226; +$Lt_1_20482: + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.f32 %f29, 0f00000000; // 0 + mov.f32 %f30, 0f00000000; // 0 +$Lt_1_20226: + mov.u32 %r56, 1; + setp.le.s32 %p9, %r3, %r56; + @%p9 bra $Lt_1_25346; + .loc 16 358 0 + mov.u64 %rd34, 
__cuda___cuda_local_var_32740_35_non_const_red_acc3320; + cvt.s64.s32 %rd35, %r1; + mul.wide.s32 %rd36, %r1, 4; + add.u64 %rd37, %rd34, %rd36; + mov.f32 %f103, %f29; + st.shared.f32 [%rd37+0], %f103; + .loc 16 359 0 + mov.f32 %f104, %f28; + st.shared.f32 [%rd37+512], %f104; + .loc 16 360 0 + mov.f32 %f105, %f27; + st.shared.f32 [%rd37+1024], %f105; + .loc 16 361 0 + mov.f32 %f106, %f30; + st.shared.f32 [%rd37+1536], %f106; + .loc 16 363 0 + shr.s32 %r57, %r3, 31; + mov.s32 %r58, 1; + and.b32 %r59, %r57, %r58; + add.s32 %r60, %r59, %r3; + shr.s32 %r61, %r60, 1; + mov.s32 %r62, %r61; + mov.u32 %r63, 0; + setp.ne.u32 %p10, %r61, %r63; + @!%p10 bra $Lt_1_23810; +$Lt_1_24322: + setp.ge.u32 %p11, %r7, %r62; + @%p11 bra $Lt_1_24578; + .loc 16 366 0 + add.u32 %r64, %r1, %r62; + cvt.u64.u32 %rd38, %r64; + mul.wide.u32 %rd39, %r64, 4; + add.u64 %rd40, %rd34, %rd39; + ld.shared.f32 %f107, [%rd40+0]; + add.ftz.f32 %f103, %f107, %f103; + st.shared.f32 [%rd37+0], %f103; + ld.shared.f32 %f108, [%rd40+512]; + add.ftz.f32 %f104, %f108, %f104; + st.shared.f32 [%rd37+512], %f104; + ld.shared.f32 %f109, [%rd40+1024]; + add.ftz.f32 %f105, %f109, %f105; + st.shared.f32 [%rd37+1024], %f105; + ld.shared.f32 %f110, [%rd40+1536]; + add.ftz.f32 %f106, %f110, %f106; + st.shared.f32 [%rd37+1536], %f106; +$Lt_1_24578: + .loc 16 363 0 + shr.u32 %r62, %r62, 1; + mov.u32 %r65, 0; + setp.ne.u32 %p12, %r62, %r65; + @%p12 bra $Lt_1_24322; +$Lt_1_23810: + .loc 16 370 0 + mov.f32 %f29, %f103; + .loc 16 371 0 + mov.f32 %f28, %f104; + .loc 16 372 0 + mov.f32 %f27, %f105; + .loc 16 373 0 + mov.f32 %f30, %f106; + ld.param.s32 %r66, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r67, 0; + setp.le.s32 %p13, %r66, %r67; + @%p13 bra $Lt_1_25346; + .loc 16 377 0 + mov.f32 %f103, %f3; + st.shared.f32 [%rd37+0], %f103; + mov.f32 %f104, %f5; + st.shared.f32 [%rd37+512], %f104; + mov.f32 %f105, %f7; + st.shared.f32 [%rd37+1024], %f105; + mov.f32 %f106, %f9; + st.shared.f32 [%rd37+1536], %f106; + mov.f32 %f111, %f11; + st.shared.f32 [%rd37+2048], %f111; + mov.f32 %f112, %f13; + st.shared.f32 [%rd37+2560], %f112; + .loc 16 379 0 + mov.s32 %r68, %r61; + @!%p10 bra $Lt_1_25858; +$Lt_1_26370: + setp.ge.u32 %p14, %r7, %r68; + @%p14 bra $Lt_1_26626; + .loc 16 382 0 + add.u32 %r69, %r1, %r68; + cvt.u64.u32 %rd41, %r69; + mul.wide.u32 %rd42, %r69, 4; + add.u64 %rd43, %rd34, %rd42; + ld.shared.f32 %f113, [%rd43+0]; + add.ftz.f32 %f103, %f113, %f103; + st.shared.f32 [%rd37+0], %f103; + ld.shared.f32 %f114, [%rd43+512]; + add.ftz.f32 %f104, %f114, %f104; + st.shared.f32 [%rd37+512], %f104; + ld.shared.f32 %f115, [%rd43+1024]; + add.ftz.f32 %f105, %f115, %f105; + st.shared.f32 [%rd37+1024], %f105; + ld.shared.f32 %f116, [%rd43+1536]; + add.ftz.f32 %f106, %f116, %f106; + st.shared.f32 [%rd37+1536], %f106; + ld.shared.f32 %f117, [%rd43+2048]; + add.ftz.f32 %f111, %f117, %f111; + st.shared.f32 [%rd37+2048], %f111; + ld.shared.f32 %f118, [%rd43+2560]; + add.ftz.f32 %f112, %f118, %f112; + st.shared.f32 [%rd37+2560], %f112; +$Lt_1_26626: + .loc 16 379 0 + shr.u32 %r68, %r68, 1; + mov.u32 %r70, 0; + setp.ne.u32 %p15, %r68, %r70; + @%p15 bra $Lt_1_26370; +$Lt_1_25858: + .loc 16 387 0 + mov.f32 %f3, %f103; + mov.f32 %f5, %f104; + mov.f32 %f7, %f105; + mov.f32 %f9, %f106; + mov.f32 %f11, %f111; + mov.f32 %f13, %f112; +$Lt_1_25346: +$Lt_1_23298: + selp.s32 %r71, 1, 0, %p2; + mov.s32 %r72, 0; + set.eq.u32.s32 %r73, %r7, %r72; + neg.s32 %r74, %r73; + and.b32 %r75, %r71, %r74; + mov.u32 %r76, 0; + setp.eq.s32 %p16, %r75, %r76; + @%p16 bra $Lt_1_27394; + .loc 16 393 
0 + cvt.s64.s32 %rd44, %r10; + ld.param.u64 %rd45, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd46, %r10, 4; + add.u64 %rd47, %rd45, %rd46; + ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r78, 0; + setp.le.s32 %p17, %r77, %r78; + @%p17 bra $Lt_1_27906; + .loc 16 395 0 + mov.f32 %f119, 0f00000000; // 0 + st.global.f32 [%rd47+0], %f119; + .loc 16 396 0 + cvt.s64.s32 %rd48, %r11; + mul.wide.s32 %rd49, %r11, 4; + add.u64 %rd50, %rd49, %rd47; + .loc 16 397 0 + st.global.f32 [%rd50+0], %f30; + .loc 16 398 0 + add.u64 %rd47, %rd49, %rd50; +$Lt_1_27906: + ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r80, 0; + setp.le.s32 %p18, %r79, %r80; + @%p18 bra $Lt_1_28418; + .loc 16 402 0 + mov.f32 %f120, %f3; + st.global.f32 [%rd47+0], %f120; + .loc 16 403 0 + cvt.s64.s32 %rd51, %r11; + mul.wide.s32 %rd52, %r11, 4; + add.u64 %rd53, %rd52, %rd47; + .loc 16 402 0 + mov.f32 %f121, %f5; + st.global.f32 [%rd53+0], %f121; + .loc 16 403 0 + add.u64 %rd54, %rd52, %rd53; + .loc 16 402 0 + mov.f32 %f122, %f7; + st.global.f32 [%rd54+0], %f122; + .loc 16 403 0 + add.u64 %rd55, %rd52, %rd54; + .loc 16 402 0 + mov.f32 %f123, %f9; + st.global.f32 [%rd55+0], %f123; + .loc 16 403 0 + add.u64 %rd47, %rd52, %rd55; + .loc 16 402 0 + mov.f32 %f124, %f11; + st.global.f32 [%rd47+0], %f124; + mov.f32 %f125, %f13; + add.u64 %rd56, %rd52, %rd47; + st.global.f32 [%rd56+0], %f125; +$Lt_1_28418: + .loc 16 406 0 + ld.param.u64 %rd57, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd58, %rd44, 16; + add.u64 %rd59, %rd57, %rd58; + mov.f32 %f126, %f127; + st.global.v4.f32 [%rd59+0], {%f29,%f28,%f27,%f126}; +$Lt_1_27394: + .loc 16 408 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/coul_long_gpu_ptx.h b/lib/gpu/coul_long_gpu_ptx.h new file mode 100644 index 000000000..0a9a9a34e --- /dev/null +++ b/lib/gpu/coul_long_gpu_ptx.h @@ -0,0 +1,979 @@ +const char * coul_long_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_cl_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_q_,\n" +" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" +" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<81>;\n" +" .reg .u64 %rd<57>;\n" +" .reg .f32 %f<132>;\n" +" .reg .pred %p<19>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_cl112[16];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32585_35_non_const_red_acc128[3072];\n" +" .loc 16 108 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 115 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 116 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 117 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 118 
0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 127 0\n" +" mov.f32 %f5, 0f00000000; \n" +" mov.f32 %f6, %f5;\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, %f7;\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_19202;\n" +" .loc 16 131 0\n" +" cvt.s64.s32 %rd2, %r9;\n" +" mul.wide.s32 %rd3, %r9, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd5, %rd3, %rd4;\n" +" ld.global.s32 %r11, [%rd5+0];\n" +" .loc 16 133 0\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd6, %r12;\n" +" mul.wide.s32 %rd7, %r12, 4;\n" +" add.u64 %rd8, %rd7, %rd5;\n" +" ld.global.s32 %r13, [%rd8+0];\n" +" add.u64 %rd9, %rd7, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd4;\n" +" @%p2 bra $Lt_0_19714;\n" +" .loc 16 139 0\n" +" cvt.s32.s64 %r14, %rd6;\n" +" mul.lo.s32 %r15, %r14, %r13;\n" +" cvt.s64.s32 %rd11, %r15;\n" +" mul.wide.s32 %rd12, %r15, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 140 0\n" +" mul.lo.s32 %r16, %r6, %r14;\n" +" cvt.s64.s32 %rd14, %r16;\n" +" mul.wide.s32 %rd15, %r16, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 141 0\n" +" mul.lo.s32 %r17, %r14, %r1;\n" +" bra.uni $Lt_0_19458;\n" +"$Lt_0_19714:\n" +" .loc 16 143 0\n" +" ld.global.s32 %r18, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r18;\n" +" mul.wide.s32 %rd18, %r18, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 144 0\n" +" cvt.s64.s32 %rd20, %r13;\n" +" mul.wide.s32 %rd21, %r13, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 145 0\n" +" mov.s32 %r17, %r1;\n" +" .loc 16 146 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_19458:\n" +" .loc 16 149 0\n" +" mov.u32 %r19, %r11;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f21, %f17;\n" +" mov.f32 %f22, %f18;\n" +" mov.f32 %f23, %f19;\n" +" .loc 16 150 0\n" +" mov.u32 %r26, %r11;\n" +" mov.s32 %r27, 0;\n" +" mov.u32 %r28, %r27;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r26,%r28,%r30,%r32}];\n" +" mov.f32 %f28, %f24;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_27650;\n" +" cvt.s64.s32 %rd24, %r17;\n" +" ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq];\n" +" mov.f32 %f30, 0f00000000; \n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.u64 %rd25, __cuda___cuda_local_var_32498_33_non_const_sp_cl112;\n" +"$Lt_0_20482:\n" +" .loc 16 153 0\n" +" ld.global.s32 %r33, [%rd16+0];\n" +" .loc 16 156 0\n" +" mov.f32 %f34, 0f3f800000; \n" +" 
shr.s32 %r34, %r33, 30;\n" +" and.b32 %r35, %r34, 3;\n" +" cvt.s64.s32 %rd26, %r35;\n" +" mul.wide.s32 %rd27, %r35, 4;\n" +" add.u64 %rd28, %rd25, %rd27;\n" +" ld.shared.f32 %f35, [%rd28+0];\n" +" sub.ftz.f32 %f36, %f34, %f35;\n" +" .loc 16 159 0\n" +" and.b32 %r36, %r33, 1073741823;\n" +" mov.u32 %r37, %r36;\n" +" mov.s32 %r38, 0;\n" +" mov.u32 %r39, %r38;\n" +" mov.s32 %r40, 0;\n" +" mov.u32 %r41, %r40;\n" +" mov.s32 %r42, 0;\n" +" mov.u32 %r43, %r42;\n" +" tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r37,%r39,%r41,%r43}];\n" +" mov.f32 %f41, %f37;\n" +" mov.f32 %f42, %f38;\n" +" mov.f32 %f43, %f39;\n" +" sub.ftz.f32 %f44, %f22, %f42;\n" +" sub.ftz.f32 %f45, %f21, %f41;\n" +" sub.ftz.f32 %f46, %f23, %f43;\n" +" mul.ftz.f32 %f47, %f44, %f44;\n" +" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" +" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" +" setp.lt.ftz.f32 %p4, %f49, %f29;\n" +" @!%p4 bra $Lt_0_21250;\n" +" .loc 16 175 0\n" +" sqrt.approx.ftz.f32 %f50, %f49;\n" +" ld.param.f32 %f51, [__cudaparm_kernel_pair_g_ewald];\n" +" mul.ftz.f32 %f52, %f51, %f50;\n" +" mul.ftz.f32 %f53, %f52, %f52;\n" +" mov.f32 %f54, 0f3f800000; \n" +" mov.f32 %f55, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f56, %f55, %f52, %f54;\n" +" neg.ftz.f32 %f57, %f53;\n" +" rcp.approx.ftz.f32 %f58, %f56;\n" +" mov.f32 %f59, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f60, %f57, %f59;\n" +" ex2.approx.ftz.f32 %f61, %f60;\n" +" mov.f32 %f62, 0f3e827906; \n" +" mov.f32 %f63, 0fbe91a98e; \n" +" mov.f32 %f64, 0f3fb5f0e3; \n" +" mov.f32 %f65, 0fbfba00e3; \n" +" mov.f32 %f66, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f67, %f66, %f58, %f65;\n" +" fma.rn.ftz.f32 %f68, %f58, %f67, %f64;\n" +" fma.rn.ftz.f32 %f69, %f58, %f68, %f63;\n" +" fma.rn.ftz.f32 %f70, %f58, %f69, %f62;\n" +" mul.ftz.f32 %f71, %f58, %f70;\n" +" mul.ftz.f32 %f72, %f61, %f71;\n" +" .loc 16 176 0\n" +" mov.u32 %r44, %r36;\n" +" mov.s32 %r45, 0;\n" +" mov.u32 %r46, %r45;\n" +" mov.s32 %r47, 0;\n" +" mov.u32 %r48, %r47;\n" +" mov.s32 %r49, 0;\n" +" mov.u32 %r50, %r49;\n" +" tex.1d.v4.f32.s32 {%f73,%f74,%f75,%f76},[q_tex,{%r44,%r46,%r48,%r50}];\n" +" mov.f32 %f77, %f73;\n" +" .loc 16 177 0\n" +" ld.param.f32 %f78, [__cudaparm_kernel_pair_qqrd2e];\n" +" mul.ftz.f32 %f79, %f78, %f28;\n" +" mul.ftz.f32 %f80, %f79, %f77;\n" +" div.approx.ftz.f32 %f81, %f80, %f50;\n" +" mov.f32 %f82, 0f3f906ebb; \n" +" mul.ftz.f32 %f83, %f52, %f82;\n" +" fma.rn.ftz.f32 %f84, %f61, %f83, %f72;\n" +" sub.ftz.f32 %f85, %f84, %f36;\n" +" mul.ftz.f32 %f86, %f81, %f85;\n" +" rcp.approx.ftz.f32 %f87, %f49;\n" +" mul.ftz.f32 %f88, %f86, %f87;\n" +" .loc 16 179 0\n" +" fma.rn.ftz.f32 %f32, %f45, %f88, %f32;\n" +" .loc 16 180 0\n" +" fma.rn.ftz.f32 %f31, %f44, %f88, %f31;\n" +" .loc 16 181 0\n" +" fma.rn.ftz.f32 %f30, %f46, %f88, %f30;\n" +" .loc 16 168 0\n" +" sub.ftz.f32 %f89, %f72, %f36;\n" +" fma.rn.ftz.f32 %f90, %f81, %f89, %f33;\n" +" ld.param.s32 %r51, [__cudaparm_kernel_pair_eflag];\n" +" mov.s32 %r52, 0;\n" +" setp.gt.s32 %p5, %r51, %r52;\n" +" selp.f32 %f33, %f90, %f33, %p5;\n" +" ld.param.s32 %r53, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r54, 0;\n" +" setp.le.s32 %p6, %r53, %r54;\n" +" @%p6 bra $Lt_0_21250;\n" +" .loc 16 187 0\n" +" mov.f32 %f91, %f6;\n" +" mul.ftz.f32 %f92, %f45, %f45;\n" +" fma.rn.ftz.f32 %f93, %f88, %f92, %f91;\n" +" mov.f32 %f6, %f93;\n" +" .loc 16 188 0\n" +" mov.f32 %f94, %f8;\n" +" fma.rn.ftz.f32 %f95, %f88, %f47, %f94;\n" +" mov.f32 %f8, %f95;\n" +" .loc 16 189 0\n" +" mov.f32 %f96, %f10;\n" +" mul.ftz.f32 %f97, %f46, %f46;\n" +" fma.rn.ftz.f32 %f98, %f88, %f97, %f96;\n" +" mov.f32 
%f10, %f98;\n" +" .loc 16 190 0\n" +" mov.f32 %f99, %f12;\n" +" mul.ftz.f32 %f100, %f44, %f45;\n" +" fma.rn.ftz.f32 %f101, %f88, %f100, %f99;\n" +" mov.f32 %f12, %f101;\n" +" .loc 16 191 0\n" +" mov.f32 %f102, %f14;\n" +" mul.ftz.f32 %f103, %f45, %f46;\n" +" fma.rn.ftz.f32 %f104, %f88, %f103, %f102;\n" +" mov.f32 %f14, %f104;\n" +" .loc 16 192 0\n" +" mul.ftz.f32 %f105, %f44, %f46;\n" +" fma.rn.ftz.f32 %f15, %f88, %f105, %f15;\n" +" mov.f32 %f16, %f15;\n" +"$Lt_0_21250:\n" +"$Lt_0_20738:\n" +" .loc 16 152 0\n" +" mul.lo.u64 %rd29, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd29;\n" +" setp.lt.u64 %p7, %rd16, %rd13;\n" +" @%p7 bra $Lt_0_20482;\n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_27650:\n" +" mov.f32 %f30, 0f00000000; \n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_19202:\n" +" mov.f32 %f30, 0f00000000; \n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +"$Lt_0_18946:\n" +" mov.u32 %r55, 1;\n" +" setp.le.s32 %p8, %r1, %r55;\n" +" @%p8 bra $Lt_0_24066;\n" +" .loc 16 203 0\n" +" mov.u64 %rd30, __cuda___cuda_local_var_32585_35_non_const_red_acc128;\n" +" cvt.s64.s32 %rd31, %r2;\n" +" mul.wide.s32 %rd32, %r2, 4;\n" +" add.u64 %rd33, %rd30, %rd32;\n" +" mov.f32 %f106, %f32;\n" +" st.shared.f32 [%rd33+0], %f106;\n" +" .loc 16 204 0\n" +" mov.f32 %f107, %f31;\n" +" st.shared.f32 [%rd33+512], %f107;\n" +" .loc 16 205 0\n" +" mov.f32 %f108, %f30;\n" +" st.shared.f32 [%rd33+1024], %f108;\n" +" .loc 16 206 0\n" +" mov.f32 %f109, %f33;\n" +" st.shared.f32 [%rd33+1536], %f109;\n" +" .loc 16 208 0\n" +" shr.s32 %r56, %r1, 31;\n" +" mov.s32 %r57, 1;\n" +" and.b32 %r58, %r56, %r57;\n" +" add.s32 %r59, %r58, %r1;\n" +" shr.s32 %r60, %r59, 1;\n" +" mov.s32 %r61, %r60;\n" +" mov.u32 %r62, 0;\n" +" setp.ne.u32 %p9, %r60, %r62;\n" +" @!%p9 bra $Lt_0_22530;\n" +"$Lt_0_23042:\n" +" setp.ge.u32 %p10, %r6, %r61;\n" +" @%p10 bra $Lt_0_23298;\n" +" .loc 16 211 0\n" +" add.u32 %r63, %r2, %r61;\n" +" cvt.u64.u32 %rd34, %r63;\n" +" mul.wide.u32 %rd35, %r63, 4;\n" +" add.u64 %rd36, %rd30, %rd35;\n" +" ld.shared.f32 %f110, [%rd36+0];\n" +" add.ftz.f32 %f106, %f110, %f106;\n" +" st.shared.f32 [%rd33+0], %f106;\n" +" ld.shared.f32 %f111, [%rd36+512];\n" +" add.ftz.f32 %f107, %f111, %f107;\n" +" st.shared.f32 [%rd33+512], %f107;\n" +" ld.shared.f32 %f112, [%rd36+1024];\n" +" add.ftz.f32 %f108, %f112, %f108;\n" +" st.shared.f32 [%rd33+1024], %f108;\n" +" ld.shared.f32 %f113, [%rd36+1536];\n" +" add.ftz.f32 %f109, %f113, %f109;\n" +" st.shared.f32 [%rd33+1536], %f109;\n" +"$Lt_0_23298:\n" +" .loc 16 208 0\n" +" shr.u32 %r61, %r61, 1;\n" +" mov.u32 %r64, 0;\n" +" setp.ne.u32 %p11, %r61, %r64;\n" +" @%p11 bra $Lt_0_23042;\n" +"$Lt_0_22530:\n" +" .loc 16 215 0\n" +" mov.f32 %f32, %f106;\n" +" .loc 16 216 0\n" +" mov.f32 %f31, %f107;\n" +" .loc 16 217 0\n" +" mov.f32 %f30, %f108;\n" +" .loc 16 218 0\n" +" mov.f32 %f33, %f109;\n" +" ld.param.s32 %r65, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r66, 0;\n" +" setp.le.s32 %p12, %r65, %r66;\n" +" @%p12 bra $Lt_0_24066;\n" +" .loc 16 222 0\n" +" mov.f32 %f106, %f6;\n" +" st.shared.f32 [%rd33+0], %f106;\n" +" mov.f32 %f107, %f8;\n" +" st.shared.f32 [%rd33+512], %f107;\n" +" mov.f32 %f108, %f10;\n" +" st.shared.f32 [%rd33+1024], %f108;\n" +" mov.f32 %f109, %f12;\n" +" st.shared.f32 [%rd33+1536], %f109;\n" +" mov.f32 %f114, %f14;\n" +" st.shared.f32 [%rd33+2048], %f114;\n" +" mov.f32 %f115, %f16;\n" +" st.shared.f32 [%rd33+2560], %f115;\n" +" .loc 
16 224 0\n" +" mov.s32 %r67, %r60;\n" +" @!%p9 bra $Lt_0_24578;\n" +"$Lt_0_25090:\n" +" setp.ge.u32 %p13, %r6, %r67;\n" +" @%p13 bra $Lt_0_25346;\n" +" .loc 16 227 0\n" +" add.u32 %r68, %r2, %r67;\n" +" cvt.u64.u32 %rd37, %r68;\n" +" mul.wide.u32 %rd38, %r68, 4;\n" +" add.u64 %rd39, %rd30, %rd38;\n" +" ld.shared.f32 %f116, [%rd39+0];\n" +" add.ftz.f32 %f106, %f116, %f106;\n" +" st.shared.f32 [%rd33+0], %f106;\n" +" ld.shared.f32 %f117, [%rd39+512];\n" +" add.ftz.f32 %f107, %f117, %f107;\n" +" st.shared.f32 [%rd33+512], %f107;\n" +" ld.shared.f32 %f118, [%rd39+1024];\n" +" add.ftz.f32 %f108, %f118, %f108;\n" +" st.shared.f32 [%rd33+1024], %f108;\n" +" ld.shared.f32 %f119, [%rd39+1536];\n" +" add.ftz.f32 %f109, %f119, %f109;\n" +" st.shared.f32 [%rd33+1536], %f109;\n" +" ld.shared.f32 %f120, [%rd39+2048];\n" +" add.ftz.f32 %f114, %f120, %f114;\n" +" st.shared.f32 [%rd33+2048], %f114;\n" +" ld.shared.f32 %f121, [%rd39+2560];\n" +" add.ftz.f32 %f115, %f121, %f115;\n" +" st.shared.f32 [%rd33+2560], %f115;\n" +"$Lt_0_25346:\n" +" .loc 16 224 0\n" +" shr.u32 %r67, %r67, 1;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p14, %r67, %r69;\n" +" @%p14 bra $Lt_0_25090;\n" +"$Lt_0_24578:\n" +" .loc 16 232 0\n" +" mov.f32 %f6, %f106;\n" +" mov.f32 %f8, %f107;\n" +" mov.f32 %f10, %f108;\n" +" mov.f32 %f12, %f109;\n" +" mov.f32 %f14, %f114;\n" +" mov.f32 %f16, %f115;\n" +"$Lt_0_24066:\n" +"$Lt_0_22018:\n" +" selp.s32 %r70, 1, 0, %p1;\n" +" mov.s32 %r71, 0;\n" +" set.eq.u32.s32 %r72, %r6, %r71;\n" +" neg.s32 %r73, %r72;\n" +" and.b32 %r74, %r70, %r73;\n" +" mov.u32 %r75, 0;\n" +" setp.eq.s32 %p15, %r74, %r75;\n" +" @%p15 bra $Lt_0_26114;\n" +" .loc 16 238 0\n" +" cvt.s64.s32 %rd40, %r9;\n" +" ld.param.u64 %rd41, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd42, %r9, 4;\n" +" add.u64 %rd43, %rd41, %rd42;\n" +" ld.param.s32 %r76, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r77, 0;\n" +" setp.le.s32 %p16, %r76, %r77;\n" +" @%p16 bra $Lt_0_26626;\n" +" .loc 16 240 0\n" +" mov.f32 %f122, 0f00000000; \n" +" st.global.f32 [%rd43+0], %f122;\n" +" .loc 16 241 0\n" +" cvt.s64.s32 %rd44, %r10;\n" +" mul.wide.s32 %rd45, %r10, 4;\n" +" add.u64 %rd46, %rd45, %rd43;\n" +" .loc 16 242 0\n" +" st.global.f32 [%rd46+0], %f33;\n" +" .loc 16 243 0\n" +" add.u64 %rd43, %rd45, %rd46;\n" +"$Lt_0_26626:\n" +" ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r79, 0;\n" +" setp.le.s32 %p17, %r78, %r79;\n" +" @%p17 bra $Lt_0_27138;\n" +" .loc 16 247 0\n" +" mov.f32 %f123, %f6;\n" +" st.global.f32 [%rd43+0], %f123;\n" +" .loc 16 248 0\n" +" cvt.s64.s32 %rd47, %r10;\n" +" mul.wide.s32 %rd48, %r10, 4;\n" +" add.u64 %rd49, %rd48, %rd43;\n" +" .loc 16 247 0\n" +" mov.f32 %f124, %f8;\n" +" st.global.f32 [%rd49+0], %f124;\n" +" .loc 16 248 0\n" +" add.u64 %rd50, %rd48, %rd49;\n" +" .loc 16 247 0\n" +" mov.f32 %f125, %f10;\n" +" st.global.f32 [%rd50+0], %f125;\n" +" .loc 16 248 0\n" +" add.u64 %rd51, %rd48, %rd50;\n" +" .loc 16 247 0\n" +" mov.f32 %f126, %f12;\n" +" st.global.f32 [%rd51+0], %f126;\n" +" .loc 16 248 0\n" +" add.u64 %rd43, %rd48, %rd51;\n" +" .loc 16 247 0\n" +" mov.f32 %f127, %f14;\n" +" st.global.f32 [%rd43+0], %f127;\n" +" mov.f32 %f128, %f16;\n" +" add.u64 %rd52, %rd48, %rd43;\n" +" st.global.f32 [%rd52+0], %f128;\n" +"$Lt_0_27138:\n" +" .loc 16 251 0\n" +" ld.param.u64 %rd53, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd54, %rd40, 16;\n" +" add.u64 %rd55, %rd53, %rd54;\n" +" mov.f32 %f129, %f130;\n" +" st.global.v4.f32 [%rd55+0], {%f32,%f31,%f30,%f129};\n" +"$Lt_0_26114:\n" +" .loc 16 253 0\n" 
+" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" +" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<82>;\n" +" .reg .u64 %rd<61>;\n" +" .reg .f32 %f<129>;\n" +" .reg .pred %p<20>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32653_33_non_const_sp_cl3304[16];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32740_35_non_const_red_acc3320[3072];\n" +" .loc 16 263 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 3;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_19714;\n" +" .loc 16 271 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32653_33_non_const_sp_cl3304;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_19714:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32653_33_non_const_sp_cl3304;\n" +" .loc 16 280 0\n" +" mov.f32 %f2, 0f00000000; \n" +" mov.f32 %f3, %f2;\n" +" mov.f32 %f4, 0f00000000; \n" +" mov.f32 %f5, %f4;\n" +" mov.f32 %f6, 0f00000000; \n" +" mov.f32 %f7, %f6;\n" +" mov.f32 %f8, 0f00000000; \n" +" mov.f32 %f9, %f8;\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" .loc 16 282 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r4, %r1, %r3;\n" +" cvt.s32.u32 %r5, %ntid.x;\n" +" div.s32 %r6, %r5, %r3;\n" +" rem.s32 %r7, %r1, %r3;\n" +" cvt.s32.u32 %r8, %ctaid.x;\n" +" mul.lo.s32 %r9, %r8, %r6;\n" +" add.s32 %r10, %r4, %r9;\n" +" ld.param.s32 %r11, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p2, %r10, %r11;\n" +" @!%p2 bra $Lt_1_20482;\n" +" .loc 16 286 0\n" +" cvt.s64.s32 %rd7, %r10;\n" +" mul.wide.s32 %rd8, %r10, 4;\n" +" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd10, %rd8, %rd9;\n" +" ld.global.s32 %r12, [%rd10+0];\n" +" .loc 16 288 0\n" +" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd11, %r13;\n" +" mul.wide.s32 %rd12, %r13, 4;\n" +" add.u64 %rd13, %rd12, %rd10;\n" +" ld.global.s32 %r14, [%rd13+0];\n" +" add.u64 %rd14, %rd12, %rd13;\n" +" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p3, %rd15, %rd9;\n" +" @%p3 bra $Lt_1_20994;\n" +" .loc 16 294 0\n" +" cvt.s32.s64 %r15, %rd11;\n" +" mul.lo.s32 %r16, %r15, %r14;\n" +" cvt.s64.s32 %rd16, %r16;\n" +" mul.wide.s32 %rd17, %r16, 4;\n" +" add.u64 %rd18, %rd14, %rd17;\n" +" .loc 16 295 0\n" +" mul.lo.s32 %r17, %r7, %r15;\n" +" 
cvt.s64.s32 %rd19, %r17;\n" +" mul.wide.s32 %rd20, %r17, 4;\n" +" add.u64 %rd21, %rd14, %rd20;\n" +" .loc 16 296 0\n" +" mul.lo.s32 %r18, %r15, %r3;\n" +" bra.uni $Lt_1_20738;\n" +"$Lt_1_20994:\n" +" .loc 16 298 0\n" +" ld.global.s32 %r19, [%rd14+0];\n" +" cvt.s64.s32 %rd22, %r19;\n" +" mul.wide.s32 %rd23, %r19, 4;\n" +" add.u64 %rd24, %rd15, %rd23;\n" +" .loc 16 299 0\n" +" cvt.s64.s32 %rd25, %r14;\n" +" mul.wide.s32 %rd26, %r14, 4;\n" +" add.u64 %rd18, %rd24, %rd26;\n" +" .loc 16 300 0\n" +" mov.s32 %r18, %r3;\n" +" .loc 16 301 0\n" +" cvt.s64.s32 %rd27, %r7;\n" +" mul.wide.s32 %rd28, %r7, 4;\n" +" add.u64 %rd21, %rd24, %rd28;\n" +"$Lt_1_20738:\n" +" .loc 16 304 0\n" +" mov.u32 %r20, %r12;\n" +" mov.s32 %r21, 0;\n" +" mov.u32 %r22, %r21;\n" +" mov.s32 %r23, 0;\n" +" mov.u32 %r24, %r23;\n" +" mov.s32 %r25, 0;\n" +" mov.u32 %r26, %r25;\n" +" tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r20,%r22,%r24,%r26}];\n" +" mov.f32 %f18, %f14;\n" +" mov.f32 %f19, %f15;\n" +" mov.f32 %f20, %f16;\n" +" .loc 16 305 0\n" +" mov.u32 %r27, %r12;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" mov.s32 %r30, 0;\n" +" mov.u32 %r31, %r30;\n" +" mov.s32 %r32, 0;\n" +" mov.u32 %r33, %r32;\n" +" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r27,%r29,%r31,%r33}];\n" +" mov.f32 %f25, %f21;\n" +" setp.ge.u64 %p4, %rd21, %rd18;\n" +" @%p4 bra $Lt_1_28930;\n" +" cvt.s64.s32 %rd29, %r18;\n" +" ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.f32 %f29, 0f00000000; \n" +" mov.f32 %f30, 0f00000000; \n" +"$Lt_1_21762:\n" +" .loc 16 308 0\n" +" ld.global.s32 %r34, [%rd21+0];\n" +" .loc 16 311 0\n" +" mov.f32 %f31, 0f3f800000; \n" +" shr.s32 %r35, %r34, 30;\n" +" and.b32 %r36, %r35, 3;\n" +" cvt.s64.s32 %rd30, %r36;\n" +" mul.wide.s32 %rd31, %r36, 4;\n" +" add.u64 %rd32, %rd1, %rd31;\n" +" ld.shared.f32 %f32, [%rd32+0];\n" +" sub.ftz.f32 %f33, %f31, %f32;\n" +" .loc 16 314 0\n" +" and.b32 %r37, %r34, 1073741823;\n" +" mov.u32 %r38, %r37;\n" +" mov.s32 %r39, 0;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" mov.s32 %r43, 0;\n" +" mov.u32 %r44, %r43;\n" +" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r38,%r40,%r42,%r44}];\n" +" mov.f32 %f38, %f34;\n" +" mov.f32 %f39, %f35;\n" +" mov.f32 %f40, %f36;\n" +" sub.ftz.f32 %f41, %f19, %f39;\n" +" sub.ftz.f32 %f42, %f18, %f38;\n" +" sub.ftz.f32 %f43, %f20, %f40;\n" +" mul.ftz.f32 %f44, %f41, %f41;\n" +" fma.rn.ftz.f32 %f45, %f42, %f42, %f44;\n" +" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n" +" setp.lt.ftz.f32 %p5, %f46, %f26;\n" +" @!%p5 bra $Lt_1_22530;\n" +" .loc 16 330 0\n" +" sqrt.approx.ftz.f32 %f47, %f46;\n" +" ld.param.f32 %f48, [__cudaparm_kernel_pair_fast_g_ewald];\n" +" mul.ftz.f32 %f49, %f48, %f47;\n" +" mul.ftz.f32 %f50, %f49, %f49;\n" +" mov.f32 %f51, 0f3f800000; \n" +" mov.f32 %f52, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f53, %f52, %f49, %f51;\n" +" neg.ftz.f32 %f54, %f50;\n" +" rcp.approx.ftz.f32 %f55, %f53;\n" +" mov.f32 %f56, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f57, %f54, %f56;\n" +" ex2.approx.ftz.f32 %f58, %f57;\n" +" mov.f32 %f59, 0f3e827906; \n" +" mov.f32 %f60, 0fbe91a98e; \n" +" mov.f32 %f61, 0f3fb5f0e3; \n" +" mov.f32 %f62, 0fbfba00e3; \n" +" mov.f32 %f63, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f64, %f63, %f55, %f62;\n" +" fma.rn.ftz.f32 %f65, %f55, %f64, %f61;\n" +" fma.rn.ftz.f32 %f66, %f55, %f65, %f60;\n" +" fma.rn.ftz.f32 %f67, %f55, %f66, %f59;\n" +" mul.ftz.f32 %f68, %f55, %f67;\n" +" mul.ftz.f32 %f69, %f58, %f68;\n" +" .loc 16 
331 0\n" +" mov.u32 %r45, %r37;\n" +" mov.s32 %r46, 0;\n" +" mov.u32 %r47, %r46;\n" +" mov.s32 %r48, 0;\n" +" mov.u32 %r49, %r48;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" tex.1d.v4.f32.s32 {%f70,%f71,%f72,%f73},[q_tex,{%r45,%r47,%r49,%r51}];\n" +" mov.f32 %f74, %f70;\n" +" .loc 16 332 0\n" +" ld.param.f32 %f75, [__cudaparm_kernel_pair_fast_qqrd2e];\n" +" mul.ftz.f32 %f76, %f75, %f25;\n" +" mul.ftz.f32 %f77, %f76, %f74;\n" +" div.approx.ftz.f32 %f78, %f77, %f47;\n" +" mov.f32 %f79, 0f3f906ebb; \n" +" mul.ftz.f32 %f80, %f49, %f79;\n" +" fma.rn.ftz.f32 %f81, %f58, %f80, %f69;\n" +" sub.ftz.f32 %f82, %f81, %f33;\n" +" mul.ftz.f32 %f83, %f78, %f82;\n" +" rcp.approx.ftz.f32 %f84, %f46;\n" +" mul.ftz.f32 %f85, %f83, %f84;\n" +" .loc 16 334 0\n" +" fma.rn.ftz.f32 %f29, %f42, %f85, %f29;\n" +" .loc 16 335 0\n" +" fma.rn.ftz.f32 %f28, %f41, %f85, %f28;\n" +" .loc 16 336 0\n" +" fma.rn.ftz.f32 %f27, %f43, %f85, %f27;\n" +" .loc 16 323 0\n" +" sub.ftz.f32 %f86, %f69, %f33;\n" +" fma.rn.ftz.f32 %f87, %f78, %f86, %f30;\n" +" ld.param.s32 %r52, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.s32 %r53, 0;\n" +" setp.gt.s32 %p6, %r52, %r53;\n" +" selp.f32 %f30, %f87, %f30, %p6;\n" +" ld.param.s32 %r54, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r55, 0;\n" +" setp.le.s32 %p7, %r54, %r55;\n" +" @%p7 bra $Lt_1_22530;\n" +" .loc 16 342 0\n" +" mov.f32 %f88, %f3;\n" +" mul.ftz.f32 %f89, %f42, %f42;\n" +" fma.rn.ftz.f32 %f90, %f85, %f89, %f88;\n" +" mov.f32 %f3, %f90;\n" +" .loc 16 343 0\n" +" mov.f32 %f91, %f5;\n" +" fma.rn.ftz.f32 %f92, %f85, %f44, %f91;\n" +" mov.f32 %f5, %f92;\n" +" .loc 16 344 0\n" +" mov.f32 %f93, %f7;\n" +" mul.ftz.f32 %f94, %f43, %f43;\n" +" fma.rn.ftz.f32 %f95, %f85, %f94, %f93;\n" +" mov.f32 %f7, %f95;\n" +" .loc 16 345 0\n" +" mov.f32 %f96, %f9;\n" +" mul.ftz.f32 %f97, %f41, %f42;\n" +" fma.rn.ftz.f32 %f98, %f85, %f97, %f96;\n" +" mov.f32 %f9, %f98;\n" +" .loc 16 346 0\n" +" mov.f32 %f99, %f11;\n" +" mul.ftz.f32 %f100, %f42, %f43;\n" +" fma.rn.ftz.f32 %f101, %f85, %f100, %f99;\n" +" mov.f32 %f11, %f101;\n" +" .loc 16 347 0\n" +" mul.ftz.f32 %f102, %f41, %f43;\n" +" fma.rn.ftz.f32 %f12, %f85, %f102, %f12;\n" +" mov.f32 %f13, %f12;\n" +"$Lt_1_22530:\n" +"$Lt_1_22018:\n" +" .loc 16 307 0\n" +" mul.lo.u64 %rd33, %rd29, 4;\n" +" add.u64 %rd21, %rd21, %rd33;\n" +" setp.lt.u64 %p8, %rd21, %rd18;\n" +" @%p8 bra $Lt_1_21762;\n" +" bra.uni $Lt_1_20226;\n" +"$Lt_1_28930:\n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.f32 %f29, 0f00000000; \n" +" mov.f32 %f30, 0f00000000; \n" +" bra.uni $Lt_1_20226;\n" +"$Lt_1_20482:\n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.f32 %f29, 0f00000000; \n" +" mov.f32 %f30, 0f00000000; \n" +"$Lt_1_20226:\n" +" mov.u32 %r56, 1;\n" +" setp.le.s32 %p9, %r3, %r56;\n" +" @%p9 bra $Lt_1_25346;\n" +" .loc 16 358 0\n" +" mov.u64 %rd34, __cuda___cuda_local_var_32740_35_non_const_red_acc3320;\n" +" cvt.s64.s32 %rd35, %r1;\n" +" mul.wide.s32 %rd36, %r1, 4;\n" +" add.u64 %rd37, %rd34, %rd36;\n" +" mov.f32 %f103, %f29;\n" +" st.shared.f32 [%rd37+0], %f103;\n" +" .loc 16 359 0\n" +" mov.f32 %f104, %f28;\n" +" st.shared.f32 [%rd37+512], %f104;\n" +" .loc 16 360 0\n" +" mov.f32 %f105, %f27;\n" +" st.shared.f32 [%rd37+1024], %f105;\n" +" .loc 16 361 0\n" +" mov.f32 %f106, %f30;\n" +" st.shared.f32 [%rd37+1536], %f106;\n" +" .loc 16 363 0\n" +" shr.s32 %r57, %r3, 31;\n" +" mov.s32 %r58, 1;\n" +" and.b32 %r59, %r57, %r58;\n" +" add.s32 %r60, %r59, %r3;\n" +" shr.s32 %r61, %r60, 1;\n" +" mov.s32 %r62, 
%r61;\n" +" mov.u32 %r63, 0;\n" +" setp.ne.u32 %p10, %r61, %r63;\n" +" @!%p10 bra $Lt_1_23810;\n" +"$Lt_1_24322:\n" +" setp.ge.u32 %p11, %r7, %r62;\n" +" @%p11 bra $Lt_1_24578;\n" +" .loc 16 366 0\n" +" add.u32 %r64, %r1, %r62;\n" +" cvt.u64.u32 %rd38, %r64;\n" +" mul.wide.u32 %rd39, %r64, 4;\n" +" add.u64 %rd40, %rd34, %rd39;\n" +" ld.shared.f32 %f107, [%rd40+0];\n" +" add.ftz.f32 %f103, %f107, %f103;\n" +" st.shared.f32 [%rd37+0], %f103;\n" +" ld.shared.f32 %f108, [%rd40+512];\n" +" add.ftz.f32 %f104, %f108, %f104;\n" +" st.shared.f32 [%rd37+512], %f104;\n" +" ld.shared.f32 %f109, [%rd40+1024];\n" +" add.ftz.f32 %f105, %f109, %f105;\n" +" st.shared.f32 [%rd37+1024], %f105;\n" +" ld.shared.f32 %f110, [%rd40+1536];\n" +" add.ftz.f32 %f106, %f110, %f106;\n" +" st.shared.f32 [%rd37+1536], %f106;\n" +"$Lt_1_24578:\n" +" .loc 16 363 0\n" +" shr.u32 %r62, %r62, 1;\n" +" mov.u32 %r65, 0;\n" +" setp.ne.u32 %p12, %r62, %r65;\n" +" @%p12 bra $Lt_1_24322;\n" +"$Lt_1_23810:\n" +" .loc 16 370 0\n" +" mov.f32 %f29, %f103;\n" +" .loc 16 371 0\n" +" mov.f32 %f28, %f104;\n" +" .loc 16 372 0\n" +" mov.f32 %f27, %f105;\n" +" .loc 16 373 0\n" +" mov.f32 %f30, %f106;\n" +" ld.param.s32 %r66, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r67, 0;\n" +" setp.le.s32 %p13, %r66, %r67;\n" +" @%p13 bra $Lt_1_25346;\n" +" .loc 16 377 0\n" +" mov.f32 %f103, %f3;\n" +" st.shared.f32 [%rd37+0], %f103;\n" +" mov.f32 %f104, %f5;\n" +" st.shared.f32 [%rd37+512], %f104;\n" +" mov.f32 %f105, %f7;\n" +" st.shared.f32 [%rd37+1024], %f105;\n" +" mov.f32 %f106, %f9;\n" +" st.shared.f32 [%rd37+1536], %f106;\n" +" mov.f32 %f111, %f11;\n" +" st.shared.f32 [%rd37+2048], %f111;\n" +" mov.f32 %f112, %f13;\n" +" st.shared.f32 [%rd37+2560], %f112;\n" +" .loc 16 379 0\n" +" mov.s32 %r68, %r61;\n" +" @!%p10 bra $Lt_1_25858;\n" +"$Lt_1_26370:\n" +" setp.ge.u32 %p14, %r7, %r68;\n" +" @%p14 bra $Lt_1_26626;\n" +" .loc 16 382 0\n" +" add.u32 %r69, %r1, %r68;\n" +" cvt.u64.u32 %rd41, %r69;\n" +" mul.wide.u32 %rd42, %r69, 4;\n" +" add.u64 %rd43, %rd34, %rd42;\n" +" ld.shared.f32 %f113, [%rd43+0];\n" +" add.ftz.f32 %f103, %f113, %f103;\n" +" st.shared.f32 [%rd37+0], %f103;\n" +" ld.shared.f32 %f114, [%rd43+512];\n" +" add.ftz.f32 %f104, %f114, %f104;\n" +" st.shared.f32 [%rd37+512], %f104;\n" +" ld.shared.f32 %f115, [%rd43+1024];\n" +" add.ftz.f32 %f105, %f115, %f105;\n" +" st.shared.f32 [%rd37+1024], %f105;\n" +" ld.shared.f32 %f116, [%rd43+1536];\n" +" add.ftz.f32 %f106, %f116, %f106;\n" +" st.shared.f32 [%rd37+1536], %f106;\n" +" ld.shared.f32 %f117, [%rd43+2048];\n" +" add.ftz.f32 %f111, %f117, %f111;\n" +" st.shared.f32 [%rd37+2048], %f111;\n" +" ld.shared.f32 %f118, [%rd43+2560];\n" +" add.ftz.f32 %f112, %f118, %f112;\n" +" st.shared.f32 [%rd37+2560], %f112;\n" +"$Lt_1_26626:\n" +" .loc 16 379 0\n" +" shr.u32 %r68, %r68, 1;\n" +" mov.u32 %r70, 0;\n" +" setp.ne.u32 %p15, %r68, %r70;\n" +" @%p15 bra $Lt_1_26370;\n" +"$Lt_1_25858:\n" +" .loc 16 387 0\n" +" mov.f32 %f3, %f103;\n" +" mov.f32 %f5, %f104;\n" +" mov.f32 %f7, %f105;\n" +" mov.f32 %f9, %f106;\n" +" mov.f32 %f11, %f111;\n" +" mov.f32 %f13, %f112;\n" +"$Lt_1_25346:\n" +"$Lt_1_23298:\n" +" selp.s32 %r71, 1, 0, %p2;\n" +" mov.s32 %r72, 0;\n" +" set.eq.u32.s32 %r73, %r7, %r72;\n" +" neg.s32 %r74, %r73;\n" +" and.b32 %r75, %r71, %r74;\n" +" mov.u32 %r76, 0;\n" +" setp.eq.s32 %p16, %r75, %r76;\n" +" @%p16 bra $Lt_1_27394;\n" +" .loc 16 393 0\n" +" cvt.s64.s32 %rd44, %r10;\n" +" ld.param.u64 %rd45, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd46, %r10, 4;\n" +" add.u64 
%rd47, %rd45, %rd46;\n" +" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r78, 0;\n" +" setp.le.s32 %p17, %r77, %r78;\n" +" @%p17 bra $Lt_1_27906;\n" +" .loc 16 395 0\n" +" mov.f32 %f119, 0f00000000; \n" +" st.global.f32 [%rd47+0], %f119;\n" +" .loc 16 396 0\n" +" cvt.s64.s32 %rd48, %r11;\n" +" mul.wide.s32 %rd49, %r11, 4;\n" +" add.u64 %rd50, %rd49, %rd47;\n" +" .loc 16 397 0\n" +" st.global.f32 [%rd50+0], %f30;\n" +" .loc 16 398 0\n" +" add.u64 %rd47, %rd49, %rd50;\n" +"$Lt_1_27906:\n" +" ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r80, 0;\n" +" setp.le.s32 %p18, %r79, %r80;\n" +" @%p18 bra $Lt_1_28418;\n" +" .loc 16 402 0\n" +" mov.f32 %f120, %f3;\n" +" st.global.f32 [%rd47+0], %f120;\n" +" .loc 16 403 0\n" +" cvt.s64.s32 %rd51, %r11;\n" +" mul.wide.s32 %rd52, %r11, 4;\n" +" add.u64 %rd53, %rd52, %rd47;\n" +" .loc 16 402 0\n" +" mov.f32 %f121, %f5;\n" +" st.global.f32 [%rd53+0], %f121;\n" +" .loc 16 403 0\n" +" add.u64 %rd54, %rd52, %rd53;\n" +" .loc 16 402 0\n" +" mov.f32 %f122, %f7;\n" +" st.global.f32 [%rd54+0], %f122;\n" +" .loc 16 403 0\n" +" add.u64 %rd55, %rd52, %rd54;\n" +" .loc 16 402 0\n" +" mov.f32 %f123, %f9;\n" +" st.global.f32 [%rd55+0], %f123;\n" +" .loc 16 403 0\n" +" add.u64 %rd47, %rd52, %rd55;\n" +" .loc 16 402 0\n" +" mov.f32 %f124, %f11;\n" +" st.global.f32 [%rd47+0], %f124;\n" +" mov.f32 %f125, %f13;\n" +" add.u64 %rd56, %rd52, %rd47;\n" +" st.global.f32 [%rd56+0], %f125;\n" +"$Lt_1_28418:\n" +" .loc 16 406 0\n" +" ld.param.u64 %rd57, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd58, %rd44, 16;\n" +" add.u64 %rd59, %rd57, %rd58;\n" +" mov.f32 %f126, %f127;\n" +" st.global.v4.f32 [%rd59+0], {%f29,%f28,%f27,%f126};\n" +"$Lt_1_27394:\n" +" .loc 16 408 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/crml_gpu_kernel.ptx b/lib/gpu/crml_gpu_kernel.ptx new file mode 100644 index 000000000..dee2ead87 --- /dev/null +++ b/lib/gpu/crml_gpu_kernel.ptx @@ -0,0 +1,1288 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bfe5_00000000-9_crml_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.xCE2Bc) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bfe5_00000000-8_crml_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "crml_gpu_kernel.cu" + .file 17 
"/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_q_, + .param .f32 __cudaparm_kernel_pair_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_qqrd2e, + .param .f32 __cudaparm_kernel_pair_g_ewald, + .param .f32 __cudaparm_kernel_pair_denom_lj, + .param .f32 __cudaparm_kernel_pair_cut_bothsq, + .param .f32 __cudaparm_kernel_pair_cut_ljsq, + .param .f32 __cudaparm_kernel_pair_cut_lj_innersq, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<91>; + .reg .u64 %rd<64>; + .reg .f32 %f<190>; + .reg .pred %p<23>; + .shared .align 16 .b8 __cuda___cuda_local_var_32500_33_non_const_sp_lj120[32]; + .shared .align 4 .b8 __cuda___cuda_local_var_32624_35_non_const_red_acc152[3072]; + // __cuda_local_var_32512_10_non_const_f = 64 + // __cuda_local_var_32516_9_non_const_virial = 16 + // __cuda_local_var_32564_43_non_const_r6inv = 40 + // __cuda_local_var_32564_50_non_const_prefactor = 52 + // __cuda_local_var_32564_61_non_const__erfc = 48 + // __cuda_local_var_32564_68_non_const_switch1 = 44 + .loc 16 110 0 +$LDWbegin_kernel_pair: + .loc 16 118 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 119 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 120 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 121 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32500_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4}; + .loc 16 122 0 + ld.global.f32 %f5, [%rd1+16]; + .loc 16 123 0 + ld.global.f32 %f6, [%rd1+20]; + .loc 16 124 0 + ld.global.f32 %f7, [%rd1+24]; + .loc 16 125 0 + ld.global.f32 %f8, [%rd1+28]; + st.shared.v4.f32 [__cuda___cuda_local_var_32500_33_non_const_sp_lj120+16], {%f5,%f6,%f7,%f8}; + .loc 16 135 0 + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + mov.f32 %f17, 0f00000000; // 0 + mov.f32 %f18, %f17; + mov.f32 %f19, 0f00000000; // 0 + mov.f32 %f20, %f19; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, 
%r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_23810; + .loc 16 139 0 + cvt.s64.s32 %rd2, %r9; + mul.wide.s32 %rd3, %r9, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd5, %rd3, %rd4; + ld.global.s32 %r11, [%rd5+0]; + .loc 16 141 0 + ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd6, %r12; + mul.wide.s32 %rd7, %r12, 4; + add.u64 %rd8, %rd7, %rd5; + ld.global.s32 %r13, [%rd8+0]; + add.u64 %rd9, %rd7, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd4; + @%p2 bra $Lt_0_24322; + .loc 16 147 0 + cvt.s32.s64 %r14, %rd6; + mul.lo.s32 %r15, %r14, %r13; + cvt.s64.s32 %rd11, %r15; + mul.wide.s32 %rd12, %r15, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 148 0 + mul.lo.s32 %r16, %r6, %r14; + cvt.s64.s32 %rd14, %r16; + mul.wide.s32 %rd15, %r16, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 149 0 + mul.lo.s32 %r17, %r14, %r1; + bra.uni $Lt_0_24066; +$Lt_0_24322: + .loc 16 151 0 + ld.global.s32 %r18, [%rd9+0]; + cvt.s64.s32 %rd17, %r18; + mul.wide.s32 %rd18, %r18, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 152 0 + cvt.s64.s32 %rd20, %r13; + mul.wide.s32 %rd21, %r13, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 153 0 + mov.s32 %r17, %r1; + .loc 16 154 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_24066: + .loc 16 157 0 + mov.u32 %r19, %r11; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f25, %f21; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + .loc 16 158 0 + mov.u32 %r26, %r11; + mov.s32 %r27, 0; + mov.u32 %r28, %r27; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}]; + mov.f32 %f33, %f29; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_34818; + cvt.s64.s32 %rd24, %r17; + ld.param.f32 %f34, [__cudaparm_kernel_pair_cut_bothsq]; + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.u64 %rd25, __cuda___cuda_local_var_32500_33_non_const_sp_lj120; +$Lt_0_25090: + //<loop> Loop body line 158, nesting depth: 1, estimated iterations: unknown + .loc 16 162 0 + ld.global.s32 %r33, [%rd16+0]; + .loc 16 165 0 + shr.s32 %r34, %r33, 30; + and.b32 %r35, %r34, 3; + cvt.s64.s32 %rd26, %r35; + mul.wide.s32 %rd27, %r35, 4; + add.u64 %rd28, %rd25, %rd27; + ld.shared.f32 %f40, [%rd28+0]; + .loc 16 166 0 + mov.f32 %f41, 0f3f800000; // 1 + ld.shared.f32 %f42, [%rd28+16]; + sub.ftz.f32 %f43, %f41, %f42; + .loc 16 169 0 + and.b32 %r36, %r33, 1073741823; + mov.u32 %r37, %r36; + mov.s32 %r38, 0; + mov.u32 %r39, %r38; + mov.s32 %r40, 0; + mov.u32 %r41, %r40; + mov.s32 %r42, 0; + mov.u32 %r43, %r42; + tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r37,%r39,%r41,%r43}]; + mov.f32 %f48, %f44; + mov.f32 %f49, %f45; + mov.f32 %f50, %f46; + mov.f32 %f51, %f47; + sub.ftz.f32 %f52, %f26, %f49; + sub.ftz.f32 %f53, %f25, %f48; + sub.ftz.f32 %f54, %f27, %f50; + mul.ftz.f32 %f55, %f52, %f52; + fma.rn.ftz.f32 %f56, %f53, %f53, %f55; + fma.rn.ftz.f32 %f57, %f54, %f54, %f56; + setp.lt.ftz.f32 %p4, %f57, %f34; + @!%p4 bra $Lt_0_28418; + ld.param.f32 %f58, [__cudaparm_kernel_pair_cut_ljsq]; + setp.lt.ftz.f32 %p5, %f57, 
%f58; + rcp.approx.ftz.f32 %f59, %f57; + @!%p5 bra $Lt_0_26114; + .loc 16 184 0 + mul.ftz.f32 %f60, %f59, %f59; + mul.ftz.f32 %f61, %f59, %f60; + mov.f32 %f62, %f61; + .loc 16 185 0 + cvt.rzi.ftz.s32.f32 %r44, %f51; + cvt.rzi.ftz.s32.f32 %r45, %f28; + ld.param.u64 %rd29, [__cudaparm_kernel_pair_lj1]; + ld.param.s32 %r46, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r47, %r46, %r45; + add.s32 %r48, %r44, %r47; + cvt.s64.s32 %rd30, %r48; + mul.wide.s32 %rd31, %r48, 16; + add.u64 %rd32, %rd29, %rd31; + mul.ftz.f32 %f63, %f61, %f40; + ld.global.v2.f32 {%f64,%f65}, [%rd32+0]; + mul.ftz.f32 %f66, %f64, %f61; + sub.ftz.f32 %f67, %f66, %f65; + mul.ftz.f32 %f68, %f63, %f67; + ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_lj_innersq]; + setp.gt.ftz.f32 %p6, %f57, %f69; + @!%p6 bra $Lt_0_25858; + .loc 16 191 0 + add.ftz.f32 %f70, %f57, %f57; + sub.ftz.f32 %f71, %f58, %f57; + add.ftz.f32 %f72, %f70, %f58; + mul.ftz.f32 %f73, %f71, %f71; + mov.f32 %f74, 0f40400000; // 3 + mul.ftz.f32 %f75, %f74, %f69; + sub.ftz.f32 %f76, %f72, %f75; + ld.param.f32 %f77, [__cudaparm_kernel_pair_denom_lj]; + div.approx.ftz.f32 %f78, %f76, %f77; + mul.ftz.f32 %f79, %f73, %f78; + mov.f32 %f80, %f79; + .loc 16 194 0 + mov.f32 %f81, 0f41400000; // 12 + mul.ftz.f32 %f82, %f57, %f81; + mul.ftz.f32 %f83, %f71, %f82; + sub.ftz.f32 %f84, %f57, %f69; + mul.ftz.f32 %f85, %f83, %f84; + div.approx.ftz.f32 %f86, %f85, %f77; + ld.global.v2.f32 {%f87,%f88}, [%rd32+8]; + mul.ftz.f32 %f89, %f87, %f61; + sub.ftz.f32 %f90, %f89, %f88; + mul.ftz.f32 %f91, %f61, %f90; + mul.ftz.f32 %f92, %f86, %f91; + fma.rn.ftz.f32 %f68, %f68, %f79, %f92; + bra.uni $Lt_0_25858; +$Lt_0_26114: + .loc 16 197 0 + mov.f32 %f68, 0f00000000; // 0 +$Lt_0_25858: + ld.param.f32 %f93, [__cudaparm_kernel_pair_cut_coulsq]; + setp.gt.ftz.f32 %p7, %f93, %f57; + @!%p7 bra $Lt_0_27138; + .loc 16 204 0 + sqrt.approx.ftz.f32 %f94, %f57; + ld.param.f32 %f95, [__cudaparm_kernel_pair_g_ewald]; + mul.ftz.f32 %f96, %f95, %f94; + mul.ftz.f32 %f97, %f96, %f96; + mov.f32 %f98, 0f3f800000; // 1 + mov.f32 %f99, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f100, %f99, %f96, %f98; + neg.ftz.f32 %f101, %f97; + rcp.approx.ftz.f32 %f102, %f100; + mov.f32 %f103, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f104, %f101, %f103; + ex2.approx.ftz.f32 %f105, %f104; + mov.f32 %f106, 0f3e827906; // 0.25483 + mov.f32 %f107, 0fbe91a98e; // -0.284497 + mov.f32 %f108, 0f3fb5f0e3; // 1.42141 + mov.f32 %f109, 0fbfba00e3; // -1.45315 + mov.f32 %f110, 0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f111, %f110, %f102, %f109; + fma.rn.ftz.f32 %f112, %f102, %f111, %f108; + fma.rn.ftz.f32 %f113, %f102, %f112, %f107; + fma.rn.ftz.f32 %f114, %f102, %f113, %f106; + mul.ftz.f32 %f115, %f102, %f114; + mul.ftz.f32 %f116, %f105, %f115; + mov.f32 %f117, %f116; + .loc 16 205 0 + mov.u32 %r49, %r36; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f118,%f119,%f120,%f121},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f122, %f118; + ld.param.f32 %f123, [__cudaparm_kernel_pair_qqrd2e]; + mul.ftz.f32 %f124, %f123, %f33; + mul.ftz.f32 %f125, %f124, %f122; + div.approx.ftz.f32 %f126, %f125, %f94; + mov.f32 %f127, %f126; + .loc 16 206 0 + mov.f32 %f128, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f129, %f96, %f128; + fma.rn.ftz.f32 %f130, %f105, %f129, %f116; + sub.ftz.f32 %f131, %f130, %f43; + mul.ftz.f32 %f132, %f126, %f131; + bra.uni $Lt_0_26882; +$Lt_0_27138: + .loc 16 208 0 + mov.f32 %f132, 0f00000000; // 0 +$Lt_0_26882: + .loc 16 212 0 + add.ftz.f32 
%f133, %f132, %f68; + mul.ftz.f32 %f134, %f133, %f59; + fma.rn.ftz.f32 %f37, %f53, %f134, %f37; + .loc 16 213 0 + fma.rn.ftz.f32 %f36, %f52, %f134, %f36; + .loc 16 214 0 + fma.rn.ftz.f32 %f35, %f54, %f134, %f35; + ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r57, 0; + setp.le.s32 %p8, %r56, %r57; + @%p8 bra $Lt_0_27906; + .loc 16 217 0 + mov.f32 %f135, %f127; + mov.f32 %f136, %f117; + sub.ftz.f32 %f137, %f136, %f43; + fma.rn.ftz.f32 %f138, %f135, %f137, %f38; + selp.f32 %f38, %f138, %f38, %p7; + @!%p5 bra $Lt_0_27906; + .loc 16 220 0 + cvt.rzi.ftz.s32.f32 %r58, %f51; + cvt.rzi.ftz.s32.f32 %r59, %f28; + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj1]; + ld.param.s32 %r60, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r61, %r60, %r59; + add.s32 %r62, %r58, %r61; + cvt.s64.s32 %rd34, %r62; + mul.wide.s32 %rd35, %r62, 16; + add.u64 %rd32, %rd33, %rd35; + mov.f32 %f139, %f62; + ld.global.v2.f32 {%f140,%f141}, [%rd32+8]; + mul.ftz.f32 %f142, %f140, %f139; + sub.ftz.f32 %f143, %f142, %f141; + mul.ftz.f32 %f144, %f139, %f143; + mov.f32 %f145, %f80; + mul.ftz.f32 %f146, %f145, %f144; + ld.param.f32 %f147, [__cudaparm_kernel_pair_cut_lj_innersq]; + setp.lt.ftz.f32 %p9, %f147, %f57; + selp.f32 %f148, %f146, %f144, %p9; + .loc 16 223 0 + fma.rn.ftz.f32 %f39, %f40, %f148, %f39; +$Lt_0_27906: +$Lt_0_27394: + ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r64, 0; + setp.le.s32 %p10, %r63, %r64; + @%p10 bra $Lt_0_28418; + .loc 16 227 0 + mov.f32 %f149, %f10; + mul.ftz.f32 %f150, %f53, %f53; + fma.rn.ftz.f32 %f151, %f134, %f150, %f149; + mov.f32 %f10, %f151; + .loc 16 228 0 + mov.f32 %f152, %f12; + fma.rn.ftz.f32 %f153, %f134, %f55, %f152; + mov.f32 %f12, %f153; + .loc 16 229 0 + mov.f32 %f154, %f14; + mul.ftz.f32 %f155, %f54, %f54; + fma.rn.ftz.f32 %f156, %f134, %f155, %f154; + mov.f32 %f14, %f156; + .loc 16 230 0 + mov.f32 %f157, %f16; + mul.ftz.f32 %f158, %f52, %f53; + fma.rn.ftz.f32 %f159, %f134, %f158, %f157; + mov.f32 %f16, %f159; + .loc 16 231 0 + mov.f32 %f160, %f18; + mul.ftz.f32 %f161, %f53, %f54; + fma.rn.ftz.f32 %f162, %f134, %f161, %f160; + mov.f32 %f18, %f162; + .loc 16 232 0 + mul.ftz.f32 %f163, %f52, %f54; + fma.rn.ftz.f32 %f19, %f134, %f163, %f19; + mov.f32 %f20, %f19; +$Lt_0_28418: +$Lt_0_25346: + .loc 16 161 0 + mul.lo.u64 %rd36, %rd24, 4; + add.u64 %rd16, %rd16, %rd36; + setp.lt.u64 %p11, %rd16, %rd13; + @%p11 bra $Lt_0_25090; + bra.uni $Lt_0_23554; +$Lt_0_34818: + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + bra.uni $Lt_0_23554; +$Lt_0_23810: + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 +$Lt_0_23554: + mov.u32 %r65, 1; + setp.le.s32 %p12, %r1, %r65; + @%p12 bra $Lt_0_31234; + .loc 16 243 0 + mov.u64 %rd37, __cuda___cuda_local_var_32624_35_non_const_red_acc152; + cvt.s64.s32 %rd38, %r2; + mul.wide.s32 %rd39, %r2, 4; + add.u64 %rd40, %rd37, %rd39; + mov.f32 %f164, %f37; + st.shared.f32 [%rd40+0], %f164; + .loc 16 244 0 + mov.f32 %f165, %f36; + st.shared.f32 [%rd40+512], %f165; + .loc 16 245 0 + mov.f32 %f166, %f35; + st.shared.f32 [%rd40+1024], %f166; + .loc 16 246 0 + mov.f32 %f167, %f39; + st.shared.f32 [%rd40+1536], %f167; + .loc 16 247 0 + mov.f32 %f168, %f38; + st.shared.f32 [%rd40+2048], %f168; + .loc 16 249 0 + shr.s32 %r66, %r1, 31; + mov.s32 %r67, 1; + and.b32 %r68, %r66, %r67; + add.s32 %r69, %r68, %r1; + 
shr.s32 %r70, %r69, 1; + mov.s32 %r71, %r70; + mov.u32 %r72, 0; + setp.ne.u32 %p13, %r70, %r72; + @!%p13 bra $Lt_0_29698; +$Lt_0_30210: + setp.ge.u32 %p14, %r6, %r71; + @%p14 bra $Lt_0_30466; + .loc 16 252 0 + add.u32 %r73, %r2, %r71; + cvt.u64.u32 %rd41, %r73; + mul.wide.u32 %rd42, %r73, 4; + add.u64 %rd43, %rd37, %rd42; + ld.shared.f32 %f169, [%rd43+0]; + add.ftz.f32 %f164, %f169, %f164; + st.shared.f32 [%rd40+0], %f164; + ld.shared.f32 %f170, [%rd43+512]; + add.ftz.f32 %f165, %f170, %f165; + st.shared.f32 [%rd40+512], %f165; + ld.shared.f32 %f171, [%rd43+1024]; + add.ftz.f32 %f166, %f171, %f166; + st.shared.f32 [%rd40+1024], %f166; + ld.shared.f32 %f172, [%rd43+1536]; + add.ftz.f32 %f167, %f172, %f167; + st.shared.f32 [%rd40+1536], %f167; + ld.shared.f32 %f173, [%rd43+2048]; + add.ftz.f32 %f168, %f173, %f168; + st.shared.f32 [%rd40+2048], %f168; +$Lt_0_30466: + .loc 16 249 0 + shr.u32 %r71, %r71, 1; + mov.u32 %r74, 0; + setp.ne.u32 %p15, %r71, %r74; + @%p15 bra $Lt_0_30210; +$Lt_0_29698: + .loc 16 256 0 + mov.f32 %f37, %f164; + .loc 16 257 0 + mov.f32 %f36, %f165; + .loc 16 258 0 + mov.f32 %f35, %f166; + .loc 16 259 0 + mov.f32 %f39, %f167; + .loc 16 260 0 + mov.f32 %f38, %f168; + ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r76, 0; + setp.le.s32 %p16, %r75, %r76; + @%p16 bra $Lt_0_31234; + .loc 16 264 0 + mov.f32 %f164, %f10; + st.shared.f32 [%rd40+0], %f164; + mov.f32 %f165, %f12; + st.shared.f32 [%rd40+512], %f165; + mov.f32 %f166, %f14; + st.shared.f32 [%rd40+1024], %f166; + mov.f32 %f167, %f16; + st.shared.f32 [%rd40+1536], %f167; + mov.f32 %f168, %f18; + st.shared.f32 [%rd40+2048], %f168; + mov.f32 %f174, %f20; + st.shared.f32 [%rd40+2560], %f174; + .loc 16 266 0 + mov.s32 %r77, %r70; + @!%p13 bra $Lt_0_31746; +$Lt_0_32258: + setp.ge.u32 %p17, %r6, %r77; + @%p17 bra $Lt_0_32514; + .loc 16 269 0 + add.u32 %r78, %r2, %r77; + cvt.u64.u32 %rd44, %r78; + mul.wide.u32 %rd45, %r78, 4; + add.u64 %rd46, %rd37, %rd45; + ld.shared.f32 %f175, [%rd46+0]; + add.ftz.f32 %f164, %f175, %f164; + st.shared.f32 [%rd40+0], %f164; + ld.shared.f32 %f176, [%rd46+512]; + add.ftz.f32 %f165, %f176, %f165; + st.shared.f32 [%rd40+512], %f165; + ld.shared.f32 %f177, [%rd46+1024]; + add.ftz.f32 %f166, %f177, %f166; + st.shared.f32 [%rd40+1024], %f166; + ld.shared.f32 %f178, [%rd46+1536]; + add.ftz.f32 %f167, %f178, %f167; + st.shared.f32 [%rd40+1536], %f167; + ld.shared.f32 %f179, [%rd46+2048]; + add.ftz.f32 %f168, %f179, %f168; + st.shared.f32 [%rd40+2048], %f168; + ld.shared.f32 %f180, [%rd46+2560]; + add.ftz.f32 %f174, %f180, %f174; + st.shared.f32 [%rd40+2560], %f174; +$Lt_0_32514: + .loc 16 266 0 + shr.u32 %r77, %r77, 1; + mov.u32 %r79, 0; + setp.ne.u32 %p18, %r77, %r79; + @%p18 bra $Lt_0_32258; +$Lt_0_31746: + .loc 16 274 0 + mov.f32 %f10, %f164; + mov.f32 %f12, %f165; + mov.f32 %f14, %f166; + mov.f32 %f16, %f167; + mov.f32 %f18, %f168; + mov.f32 %f20, %f174; +$Lt_0_31234: +$Lt_0_29186: + selp.s32 %r80, 1, 0, %p1; + mov.s32 %r81, 0; + set.eq.u32.s32 %r82, %r6, %r81; + neg.s32 %r83, %r82; + and.b32 %r84, %r80, %r83; + mov.u32 %r85, 0; + setp.eq.s32 %p19, %r84, %r85; + @%p19 bra $Lt_0_33282; + .loc 16 280 0 + cvt.s64.s32 %rd47, %r9; + ld.param.u64 %rd48, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd49, %r9, 4; + add.u64 %rd50, %rd48, %rd49; + ld.param.s32 %r86, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r87, 0; + setp.le.s32 %p20, %r86, %r87; + @%p20 bra $Lt_0_33794; + .loc 16 282 0 + st.global.f32 [%rd50+0], %f39; + .loc 16 283 0 + cvt.s64.s32 %rd51, %r10; + mul.wide.s32 %rd52, %r10, 
4; + add.u64 %rd53, %rd52, %rd50; + .loc 16 284 0 + st.global.f32 [%rd53+0], %f38; + .loc 16 285 0 + add.u64 %rd50, %rd52, %rd53; +$Lt_0_33794: + ld.param.s32 %r88, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r89, 0; + setp.le.s32 %p21, %r88, %r89; + @%p21 bra $Lt_0_34306; + .loc 16 289 0 + mov.f32 %f181, %f10; + st.global.f32 [%rd50+0], %f181; + .loc 16 290 0 + cvt.s64.s32 %rd54, %r10; + mul.wide.s32 %rd55, %r10, 4; + add.u64 %rd56, %rd55, %rd50; + .loc 16 289 0 + mov.f32 %f182, %f12; + st.global.f32 [%rd56+0], %f182; + .loc 16 290 0 + add.u64 %rd57, %rd55, %rd56; + .loc 16 289 0 + mov.f32 %f183, %f14; + st.global.f32 [%rd57+0], %f183; + .loc 16 290 0 + add.u64 %rd58, %rd55, %rd57; + .loc 16 289 0 + mov.f32 %f184, %f16; + st.global.f32 [%rd58+0], %f184; + .loc 16 290 0 + add.u64 %rd50, %rd55, %rd58; + .loc 16 289 0 + mov.f32 %f185, %f18; + st.global.f32 [%rd50+0], %f185; + mov.f32 %f186, %f20; + add.u64 %rd59, %rd55, %rd50; + st.global.f32 [%rd59+0], %f186; +$Lt_0_34306: + .loc 16 293 0 + ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd61, %rd47, 16; + add.u64 %rd62, %rd60, %rd61; + mov.f32 %f187, %f188; + st.global.v4.f32 [%rd62+0], {%f37,%f36,%f35,%f187}; +$Lt_0_33282: + .loc 16 295 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_ljd_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_fast_q_, + .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, + .param .f32 __cudaparm_kernel_pair_fast_g_ewald, + .param .f32 __cudaparm_kernel_pair_fast_denom_lj, + .param .f32 __cudaparm_kernel_pair_fast_cut_bothsq, + .param .f32 __cudaparm_kernel_pair_fast_cut_ljsq, + .param .f32 __cudaparm_kernel_pair_fast_cut_lj_innersq, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<86>; + .reg .u64 %rd<71>; + .reg .f32 %f<196>; + .reg .pred %p<25>; + .shared .align 4 .b8 __cuda___cuda_local_var_32697_33_non_const_sp_lj3336[32]; + .shared .align 8 .b8 __cuda___cuda_local_var_32696_34_non_const_ljd3368[1024]; + .shared .align 4 .b8 __cuda___cuda_local_var_32826_35_non_const_red_acc4392[3072]; + // __cuda_local_var_32706_10_non_const_f = 64 + // __cuda_local_var_32710_9_non_const_virial = 16 + // __cuda_local_var_32759_43_non_const_prefactor = 56 + // __cuda_local_var_32759_54_non_const__erfc = 52 + // __cuda_local_var_32759_61_non_const_switch1 = 48 + // __cuda_local_var_32760_15_non_const_lj3 = 44 + // __cuda_local_var_32760_20_non_const_lj4 = 40 + .loc 16 307 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + cvt.s64.s32 %rd1, %r1; + mov.u32 %r2, 7; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_25090; + .loc 16 316 0 + mov.u64 %rd2, __cuda___cuda_local_var_32697_33_non_const_sp_lj3336; + mul.lo.u64 %rd3, %rd1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd2; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_25090: + mov.u64 %rd2, 
__cuda___cuda_local_var_32697_33_non_const_sp_lj3336; + mov.u64 %rd7, __cuda___cuda_local_var_32696_34_non_const_ljd3368; + .loc 16 317 0 + mul.lo.u64 %rd8, %rd1, 8; + ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_ljd_in]; + add.u64 %rd10, %rd9, %rd8; + add.u64 %rd11, %rd8, %rd7; + ld.global.v2.f32 {%f2,%f3}, [%rd10+0]; + st.shared.v2.f32 [%rd11+0], {%f2,%f3}; + add.s32 %r3, %r1, 128; + mov.u32 %r4, 127; + setp.gt.s32 %p2, %r3, %r4; + @%p2 bra $Lt_1_25602; + ld.global.v2.f32 {%f4,%f5}, [%rd10+1024]; + st.shared.v2.f32 [%rd11+1024], {%f4,%f5}; +$Lt_1_25602: + .loc 16 329 0 + mov.f32 %f6, 0f00000000; // 0 + mov.f32 %f7, %f6; + mov.f32 %f8, 0f00000000; // 0 + mov.f32 %f9, %f8; + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + .loc 16 331 0 + bar.sync 0; + ld.param.s32 %r5, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r6, %r1, %r5; + cvt.s32.u32 %r7, %ntid.x; + div.s32 %r8, %r7, %r5; + rem.s32 %r9, %r1, %r5; + cvt.s32.u32 %r10, %ctaid.x; + mul.lo.s32 %r11, %r10, %r8; + add.s32 %r12, %r6, %r11; + ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p3, %r12, %r13; + @!%p3 bra $Lt_1_26370; + .loc 16 335 0 + cvt.s64.s32 %rd12, %r12; + mul.wide.s32 %rd13, %r12, 4; + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd15, %rd13, %rd14; + ld.global.s32 %r14, [%rd15+0]; + .loc 16 337 0 + ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd16, %r15; + mul.wide.s32 %rd17, %r15, 4; + add.u64 %rd18, %rd17, %rd15; + ld.global.s32 %r16, [%rd18+0]; + add.u64 %rd19, %rd17, %rd18; + ld.param.u64 %rd20, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p4, %rd20, %rd14; + @%p4 bra $Lt_1_26882; + .loc 16 343 0 + cvt.s32.s64 %r17, %rd16; + mul.lo.s32 %r18, %r17, %r16; + cvt.s64.s32 %rd21, %r18; + mul.wide.s32 %rd22, %r18, 4; + add.u64 %rd23, %rd19, %rd22; + .loc 16 344 0 + mul.lo.s32 %r19, %r9, %r17; + cvt.s64.s32 %rd24, %r19; + mul.wide.s32 %rd25, %r19, 4; + add.u64 %rd26, %rd19, %rd25; + .loc 16 345 0 + mul.lo.s32 %r20, %r17, %r5; + bra.uni $Lt_1_26626; +$Lt_1_26882: + .loc 16 347 0 + ld.global.s32 %r21, [%rd19+0]; + cvt.s64.s32 %rd27, %r21; + mul.wide.s32 %rd28, %r21, 4; + add.u64 %rd29, %rd20, %rd28; + .loc 16 348 0 + cvt.s64.s32 %rd30, %r16; + mul.wide.s32 %rd31, %r16, 4; + add.u64 %rd23, %rd29, %rd31; + .loc 16 349 0 + mov.s32 %r20, %r5; + .loc 16 350 0 + cvt.s64.s32 %rd32, %r9; + mul.wide.s32 %rd33, %r9, 4; + add.u64 %rd26, %rd29, %rd33; +$Lt_1_26626: + .loc 16 353 0 + mov.u32 %r22, %r14; + mov.s32 %r23, 0; + mov.u32 %r24, %r23; + mov.s32 %r25, 0; + mov.u32 %r26, %r25; + mov.s32 %r27, 0; + mov.u32 %r28, %r27; + tex.1d.v4.f32.s32 {%f18,%f19,%f20,%f21},[pos_tex,{%r22,%r24,%r26,%r28}]; + mov.f32 %f22, %f18; + mov.f32 %f23, %f19; + mov.f32 %f24, %f20; + mov.f32 %f25, %f21; + .loc 16 354 0 + mov.u32 %r29, %r14; + mov.s32 %r30, 0; + mov.u32 %r31, %r30; + mov.s32 %r32, 0; + mov.u32 %r33, %r32; + mov.s32 %r34, 0; + mov.u32 %r35, %r34; + tex.1d.v4.f32.s32 {%f26,%f27,%f28,%f29},[q_tex,{%r29,%r31,%r33,%r35}]; + mov.f32 %f30, %f26; + setp.ge.u64 %p5, %rd26, %rd23; + @%p5 bra $Lt_1_37378; + cvt.rzi.ftz.s32.f32 %r36, %f25; + cvt.s64.s32 %rd34, %r20; + ld.param.f32 %f31, [__cudaparm_kernel_pair_fast_cut_bothsq]; + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 
0 +$Lt_1_27650: + //<loop> Loop body line 354, nesting depth: 1, estimated iterations: unknown + .loc 16 358 0 + ld.global.s32 %r37, [%rd26+0]; + .loc 16 361 0 + shr.s32 %r38, %r37, 30; + and.b32 %r39, %r38, 3; + cvt.s64.s32 %rd35, %r39; + mul.wide.s32 %rd36, %r39, 4; + add.u64 %rd37, %rd2, %rd36; + ld.shared.f32 %f37, [%rd37+0]; + .loc 16 362 0 + mov.f32 %f38, 0f3f800000; // 1 + ld.shared.f32 %f39, [%rd37+16]; + sub.ftz.f32 %f40, %f38, %f39; + .loc 16 365 0 + and.b32 %r40, %r37, 1073741823; + mov.u32 %r41, %r40; + mov.s32 %r42, 0; + mov.u32 %r43, %r42; + mov.s32 %r44, 0; + mov.u32 %r45, %r44; + mov.s32 %r46, 0; + mov.u32 %r47, %r46; + tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r41,%r43,%r45,%r47}]; + mov.f32 %f45, %f41; + mov.f32 %f46, %f42; + mov.f32 %f47, %f43; + mov.f32 %f48, %f44; + sub.ftz.f32 %f49, %f23, %f46; + sub.ftz.f32 %f50, %f22, %f45; + sub.ftz.f32 %f51, %f24, %f47; + mul.ftz.f32 %f52, %f49, %f49; + fma.rn.ftz.f32 %f53, %f50, %f50, %f52; + fma.rn.ftz.f32 %f54, %f51, %f51, %f53; + setp.lt.ftz.f32 %p6, %f54, %f31; + @!%p6 bra $Lt_1_30978; + ld.param.f32 %f55, [__cudaparm_kernel_pair_fast_cut_ljsq]; + setp.lt.ftz.f32 %p7, %f54, %f55; + rcp.approx.ftz.f32 %f56, %f54; + @!%p7 bra $Lt_1_28674; + .loc 16 380 0 + cvt.rzi.ftz.s32.f32 %r48, %f48; + cvt.s64.s32 %rd38, %r36; + mul.wide.s32 %rd39, %r36, 8; + add.u64 %rd40, %rd7, %rd39; + cvt.s64.s32 %rd41, %r48; + mul.wide.s32 %rd42, %r48, 8; + add.u64 %rd43, %rd7, %rd42; + ld.shared.v2.f32 {%f57,%f58}, [%rd40+0]; + ld.shared.v2.f32 {%f59,%f60}, [%rd43+0]; + mul.ftz.f32 %f61, %f57, %f59; + .loc 16 381 0 + add.ftz.f32 %f62, %f58, %f60; + mov.f32 %f63, 0f3f000000; // 0.5 + mul.ftz.f32 %f64, %f62, %f63; + .loc 16 385 0 + mul.ftz.f32 %f65, %f64, %f64; + sqrt.approx.ftz.f32 %f66, %f61; + mov.f32 %f67, 0f40800000; // 4 + mul.ftz.f32 %f68, %f66, %f67; + mul.ftz.f32 %f69, %f65, %f56; + mul.ftz.f32 %f70, %f69, %f69; + mul.ftz.f32 %f71, %f69, %f70; + mul.ftz.f32 %f72, %f68, %f71; + mov.f32 %f73, %f72; + .loc 16 386 0 + mul.ftz.f32 %f74, %f71, %f72; + mov.f32 %f75, %f74; + .loc 16 387 0 + mov.f32 %f76, 0f40c00000; // 6 + mul.ftz.f32 %f77, %f72, %f76; + mov.f32 %f78, 0f41400000; // 12 + mul.ftz.f32 %f79, %f78, %f74; + sub.ftz.f32 %f80, %f79, %f77; + mul.ftz.f32 %f81, %f37, %f80; + ld.param.f32 %f82, [__cudaparm_kernel_pair_fast_cut_lj_innersq]; + setp.gt.ftz.f32 %p8, %f54, %f82; + @!%p8 bra $Lt_1_28418; + .loc 16 393 0 + add.ftz.f32 %f83, %f54, %f54; + sub.ftz.f32 %f84, %f55, %f54; + add.ftz.f32 %f85, %f83, %f55; + mul.ftz.f32 %f86, %f84, %f84; + mov.f32 %f87, 0f40400000; // 3 + mul.ftz.f32 %f88, %f87, %f82; + sub.ftz.f32 %f89, %f85, %f88; + ld.param.f32 %f90, [__cudaparm_kernel_pair_fast_denom_lj]; + div.approx.ftz.f32 %f91, %f89, %f90; + mul.ftz.f32 %f92, %f86, %f91; + mov.f32 %f93, %f92; + .loc 16 396 0 + mov.f32 %f94, 0f41400000; // 12 + mul.ftz.f32 %f95, %f54, %f94; + mul.ftz.f32 %f96, %f84, %f95; + sub.ftz.f32 %f97, %f54, %f82; + mul.ftz.f32 %f98, %f96, %f97; + div.approx.ftz.f32 %f99, %f98, %f90; + sub.ftz.f32 %f100, %f74, %f72; + mul.ftz.f32 %f101, %f99, %f100; + fma.rn.ftz.f32 %f81, %f81, %f92, %f101; + bra.uni $Lt_1_28418; +$Lt_1_28674: + .loc 16 399 0 + mov.f32 %f81, 0f00000000; // 0 +$Lt_1_28418: + ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_cut_coulsq]; + setp.gt.ftz.f32 %p9, %f102, %f54; + @!%p9 bra $Lt_1_29698; + .loc 16 406 0 + sqrt.approx.ftz.f32 %f103, %f54; + ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_g_ewald]; + mul.ftz.f32 %f105, %f104, %f103; + mul.ftz.f32 %f106, %f105, %f105; + mov.f32 %f107, 0f3f800000; // 1 
+ mov.f32 %f108, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f109, %f108, %f105, %f107; + neg.ftz.f32 %f110, %f106; + rcp.approx.ftz.f32 %f111, %f109; + mov.f32 %f112, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f113, %f110, %f112; + ex2.approx.ftz.f32 %f114, %f113; + mov.f32 %f115, 0f3e827906; // 0.25483 + mov.f32 %f116, 0fbe91a98e; // -0.284497 + mov.f32 %f117, 0f3fb5f0e3; // 1.42141 + mov.f32 %f118, 0fbfba00e3; // -1.45315 + mov.f32 %f119, 0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f120, %f119, %f111, %f118; + fma.rn.ftz.f32 %f121, %f111, %f120, %f117; + fma.rn.ftz.f32 %f122, %f111, %f121, %f116; + fma.rn.ftz.f32 %f123, %f111, %f122, %f115; + mul.ftz.f32 %f124, %f111, %f123; + mul.ftz.f32 %f125, %f114, %f124; + mov.f32 %f126, %f125; + .loc 16 407 0 + mov.u32 %r49, %r40; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f127,%f128,%f129,%f130},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f131, %f127; + ld.param.f32 %f132, [__cudaparm_kernel_pair_fast_qqrd2e]; + mul.ftz.f32 %f133, %f132, %f30; + mul.ftz.f32 %f134, %f133, %f131; + div.approx.ftz.f32 %f135, %f134, %f103; + mov.f32 %f136, %f135; + .loc 16 408 0 + mov.f32 %f137, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f138, %f105, %f137; + fma.rn.ftz.f32 %f139, %f114, %f138, %f125; + sub.ftz.f32 %f140, %f139, %f40; + mul.ftz.f32 %f141, %f135, %f140; + bra.uni $Lt_1_29442; +$Lt_1_29698: + .loc 16 410 0 + mov.f32 %f141, 0f00000000; // 0 +$Lt_1_29442: + .loc 16 414 0 + add.ftz.f32 %f142, %f141, %f81; + mul.ftz.f32 %f143, %f142, %f56; + fma.rn.ftz.f32 %f34, %f50, %f143, %f34; + .loc 16 415 0 + fma.rn.ftz.f32 %f33, %f49, %f143, %f33; + .loc 16 416 0 + fma.rn.ftz.f32 %f32, %f51, %f143, %f32; + ld.param.s32 %r56, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r57, 0; + setp.le.s32 %p10, %r56, %r57; + @%p10 bra $Lt_1_30466; + .loc 16 419 0 + mov.f32 %f144, %f136; + mov.f32 %f145, %f126; + sub.ftz.f32 %f146, %f145, %f40; + fma.rn.ftz.f32 %f147, %f144, %f146, %f35; + selp.f32 %f35, %f147, %f35, %p9; + @!%p7 bra $Lt_1_30466; + .loc 16 425 0 + mov.f32 %f148, %f75; + mov.f32 %f149, %f73; + sub.ftz.f32 %f150, %f148, %f149; + mov.f32 %f151, %f93; + mul.ftz.f32 %f152, %f151, %f150; + ld.param.f32 %f153, [__cudaparm_kernel_pair_fast_cut_lj_innersq]; + setp.lt.ftz.f32 %p11, %f153, %f54; + selp.f32 %f154, %f152, %f150, %p11; + fma.rn.ftz.f32 %f36, %f37, %f154, %f36; +$Lt_1_30466: +$Lt_1_29954: + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p12, %r58, %r59; + @%p12 bra $Lt_1_30978; + .loc 16 429 0 + mov.f32 %f155, %f7; + mul.ftz.f32 %f156, %f50, %f50; + fma.rn.ftz.f32 %f157, %f143, %f156, %f155; + mov.f32 %f7, %f157; + .loc 16 430 0 + mov.f32 %f158, %f9; + fma.rn.ftz.f32 %f159, %f143, %f52, %f158; + mov.f32 %f9, %f159; + .loc 16 431 0 + mov.f32 %f160, %f11; + mul.ftz.f32 %f161, %f51, %f51; + fma.rn.ftz.f32 %f162, %f143, %f161, %f160; + mov.f32 %f11, %f162; + .loc 16 432 0 + mov.f32 %f163, %f13; + mul.ftz.f32 %f164, %f49, %f50; + fma.rn.ftz.f32 %f165, %f143, %f164, %f163; + mov.f32 %f13, %f165; + .loc 16 433 0 + mov.f32 %f166, %f15; + mul.ftz.f32 %f167, %f50, %f51; + fma.rn.ftz.f32 %f168, %f143, %f167, %f166; + mov.f32 %f15, %f168; + .loc 16 434 0 + mul.ftz.f32 %f169, %f49, %f51; + fma.rn.ftz.f32 %f16, %f143, %f169, %f16; + mov.f32 %f17, %f16; +$Lt_1_30978: +$Lt_1_27906: + .loc 16 357 0 + mul.lo.u64 %rd44, %rd34, 4; + add.u64 %rd26, %rd26, %rd44; + setp.lt.u64 %p13, %rd26, %rd23; + @%p13 bra $Lt_1_27650; + bra.uni $Lt_1_26114; +$Lt_1_37378: 
+ mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + bra.uni $Lt_1_26114; +$Lt_1_26370: + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 +$Lt_1_26114: + mov.u32 %r60, 1; + setp.le.s32 %p14, %r5, %r60; + @%p14 bra $Lt_1_33794; + .loc 16 445 0 + mov.u64 %rd45, __cuda___cuda_local_var_32826_35_non_const_red_acc4392; + mul.lo.u64 %rd46, %rd1, 4; + add.u64 %rd47, %rd45, %rd46; + mov.f32 %f170, %f34; + st.shared.f32 [%rd47+0], %f170; + .loc 16 446 0 + mov.f32 %f171, %f33; + st.shared.f32 [%rd47+512], %f171; + .loc 16 447 0 + mov.f32 %f172, %f32; + st.shared.f32 [%rd47+1024], %f172; + .loc 16 448 0 + mov.f32 %f173, %f36; + st.shared.f32 [%rd47+1536], %f173; + .loc 16 449 0 + mov.f32 %f174, %f35; + st.shared.f32 [%rd47+2048], %f174; + .loc 16 451 0 + shr.s32 %r61, %r5, 31; + mov.s32 %r62, 1; + and.b32 %r63, %r61, %r62; + add.s32 %r64, %r63, %r5; + shr.s32 %r65, %r64, 1; + mov.s32 %r66, %r65; + mov.u32 %r67, 0; + setp.ne.u32 %p15, %r65, %r67; + @!%p15 bra $Lt_1_32258; +$Lt_1_32770: + setp.ge.u32 %p16, %r9, %r66; + @%p16 bra $Lt_1_33026; + .loc 16 454 0 + add.u32 %r68, %r1, %r66; + cvt.u64.u32 %rd48, %r68; + mul.wide.u32 %rd49, %r68, 4; + add.u64 %rd50, %rd45, %rd49; + ld.shared.f32 %f175, [%rd50+0]; + add.ftz.f32 %f170, %f175, %f170; + st.shared.f32 [%rd47+0], %f170; + ld.shared.f32 %f176, [%rd50+512]; + add.ftz.f32 %f171, %f176, %f171; + st.shared.f32 [%rd47+512], %f171; + ld.shared.f32 %f177, [%rd50+1024]; + add.ftz.f32 %f172, %f177, %f172; + st.shared.f32 [%rd47+1024], %f172; + ld.shared.f32 %f178, [%rd50+1536]; + add.ftz.f32 %f173, %f178, %f173; + st.shared.f32 [%rd47+1536], %f173; + ld.shared.f32 %f179, [%rd50+2048]; + add.ftz.f32 %f174, %f179, %f174; + st.shared.f32 [%rd47+2048], %f174; +$Lt_1_33026: + .loc 16 451 0 + shr.u32 %r66, %r66, 1; + mov.u32 %r69, 0; + setp.ne.u32 %p17, %r66, %r69; + @%p17 bra $Lt_1_32770; +$Lt_1_32258: + .loc 16 458 0 + mov.f32 %f34, %f170; + .loc 16 459 0 + mov.f32 %f33, %f171; + .loc 16 460 0 + mov.f32 %f32, %f172; + .loc 16 461 0 + mov.f32 %f36, %f173; + .loc 16 462 0 + mov.f32 %f35, %f174; + ld.param.s32 %r70, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r71, 0; + setp.le.s32 %p18, %r70, %r71; + @%p18 bra $Lt_1_33794; + .loc 16 466 0 + mov.f32 %f170, %f7; + st.shared.f32 [%rd47+0], %f170; + mov.f32 %f171, %f9; + st.shared.f32 [%rd47+512], %f171; + mov.f32 %f172, %f11; + st.shared.f32 [%rd47+1024], %f172; + mov.f32 %f173, %f13; + st.shared.f32 [%rd47+1536], %f173; + mov.f32 %f174, %f15; + st.shared.f32 [%rd47+2048], %f174; + mov.f32 %f180, %f17; + st.shared.f32 [%rd47+2560], %f180; + .loc 16 468 0 + mov.s32 %r72, %r65; + @!%p15 bra $Lt_1_34306; +$Lt_1_34818: + setp.ge.u32 %p19, %r9, %r72; + @%p19 bra $Lt_1_35074; + .loc 16 471 0 + add.u32 %r73, %r1, %r72; + cvt.u64.u32 %rd51, %r73; + mul.wide.u32 %rd52, %r73, 4; + add.u64 %rd53, %rd45, %rd52; + ld.shared.f32 %f181, [%rd53+0]; + add.ftz.f32 %f170, %f181, %f170; + st.shared.f32 [%rd47+0], %f170; + ld.shared.f32 %f182, [%rd53+512]; + add.ftz.f32 %f171, %f182, %f171; + st.shared.f32 [%rd47+512], %f171; + ld.shared.f32 %f183, [%rd53+1024]; + add.ftz.f32 %f172, %f183, %f172; + st.shared.f32 [%rd47+1024], %f172; + ld.shared.f32 %f184, [%rd53+1536]; + add.ftz.f32 %f173, %f184, %f173; + st.shared.f32 [%rd47+1536], %f173; + ld.shared.f32 %f185, [%rd53+2048]; + add.ftz.f32 %f174, %f185, %f174; + 
st.shared.f32 [%rd47+2048], %f174; + ld.shared.f32 %f186, [%rd53+2560]; + add.ftz.f32 %f180, %f186, %f180; + st.shared.f32 [%rd47+2560], %f180; +$Lt_1_35074: + .loc 16 468 0 + shr.u32 %r72, %r72, 1; + mov.u32 %r74, 0; + setp.ne.u32 %p20, %r72, %r74; + @%p20 bra $Lt_1_34818; +$Lt_1_34306: + .loc 16 476 0 + mov.f32 %f7, %f170; + mov.f32 %f9, %f171; + mov.f32 %f11, %f172; + mov.f32 %f13, %f173; + mov.f32 %f15, %f174; + mov.f32 %f17, %f180; +$Lt_1_33794: +$Lt_1_31746: + selp.s32 %r75, 1, 0, %p3; + mov.s32 %r76, 0; + set.eq.u32.s32 %r77, %r9, %r76; + neg.s32 %r78, %r77; + and.b32 %r79, %r75, %r78; + mov.u32 %r80, 0; + setp.eq.s32 %p21, %r79, %r80; + @%p21 bra $Lt_1_35842; + .loc 16 482 0 + cvt.s64.s32 %rd54, %r12; + ld.param.u64 %rd55, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd56, %r12, 4; + add.u64 %rd57, %rd55, %rd56; + ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r82, 0; + setp.le.s32 %p22, %r81, %r82; + @%p22 bra $Lt_1_36354; + .loc 16 484 0 + st.global.f32 [%rd57+0], %f36; + .loc 16 485 0 + cvt.s64.s32 %rd58, %r13; + mul.wide.s32 %rd59, %r13, 4; + add.u64 %rd60, %rd59, %rd57; + .loc 16 486 0 + st.global.f32 [%rd60+0], %f35; + .loc 16 487 0 + add.u64 %rd57, %rd59, %rd60; +$Lt_1_36354: + ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r84, 0; + setp.le.s32 %p23, %r83, %r84; + @%p23 bra $Lt_1_36866; + .loc 16 491 0 + mov.f32 %f187, %f7; + st.global.f32 [%rd57+0], %f187; + .loc 16 492 0 + cvt.s64.s32 %rd61, %r13; + mul.wide.s32 %rd62, %r13, 4; + add.u64 %rd63, %rd62, %rd57; + .loc 16 491 0 + mov.f32 %f188, %f9; + st.global.f32 [%rd63+0], %f188; + .loc 16 492 0 + add.u64 %rd64, %rd62, %rd63; + .loc 16 491 0 + mov.f32 %f189, %f11; + st.global.f32 [%rd64+0], %f189; + .loc 16 492 0 + add.u64 %rd65, %rd62, %rd64; + .loc 16 491 0 + mov.f32 %f190, %f13; + st.global.f32 [%rd65+0], %f190; + .loc 16 492 0 + add.u64 %rd57, %rd62, %rd65; + .loc 16 491 0 + mov.f32 %f191, %f15; + st.global.f32 [%rd57+0], %f191; + mov.f32 %f192, %f17; + add.u64 %rd66, %rd62, %rd57; + st.global.f32 [%rd66+0], %f192; +$Lt_1_36866: + .loc 16 495 0 + ld.param.u64 %rd67, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd68, %rd54, 16; + add.u64 %rd69, %rd67, %rd68; + mov.f32 %f193, %f194; + st.global.v4.f32 [%rd69+0], {%f34,%f33,%f32,%f193}; +$Lt_1_35842: + .loc 16 497 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/crml_gpu_ptx.h b/lib/gpu/crml_gpu_ptx.h new file mode 100644 index 000000000..a81d95df0 --- /dev/null +++ b/lib/gpu/crml_gpu_ptx.h @@ -0,0 +1,1227 @@ +const char * crml_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_q_,\n" +" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" +" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" +" .param .f32 
__cudaparm_kernel_pair_denom_lj,\n" +" .param .f32 __cudaparm_kernel_pair_cut_bothsq,\n" +" .param .f32 __cudaparm_kernel_pair_cut_ljsq,\n" +" .param .f32 __cudaparm_kernel_pair_cut_lj_innersq,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<91>;\n" +" .reg .u64 %rd<64>;\n" +" .reg .f32 %f<190>;\n" +" .reg .pred %p<23>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32500_33_non_const_sp_lj120[32];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32624_35_non_const_red_acc152[3072];\n" +" .loc 16 110 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 118 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 119 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 120 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 121 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32500_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 122 0\n" +" ld.global.f32 %f5, [%rd1+16];\n" +" .loc 16 123 0\n" +" ld.global.f32 %f6, [%rd1+20];\n" +" .loc 16 124 0\n" +" ld.global.f32 %f7, [%rd1+24];\n" +" .loc 16 125 0\n" +" ld.global.f32 %f8, [%rd1+28];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32500_33_non_const_sp_lj120+16], {%f5,%f6,%f7,%f8};\n" +" .loc 16 135 0\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" mov.f32 %f17, 0f00000000; \n" +" mov.f32 %f18, %f17;\n" +" mov.f32 %f19, 0f00000000; \n" +" mov.f32 %f20, %f19;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_23810;\n" +" .loc 16 139 0\n" +" cvt.s64.s32 %rd2, %r9;\n" +" mul.wide.s32 %rd3, %r9, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd5, %rd3, %rd4;\n" +" ld.global.s32 %r11, [%rd5+0];\n" +" .loc 16 141 0\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd6, %r12;\n" +" mul.wide.s32 %rd7, %r12, 4;\n" +" add.u64 %rd8, %rd7, %rd5;\n" +" ld.global.s32 %r13, [%rd8+0];\n" +" add.u64 %rd9, %rd7, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd4;\n" +" @%p2 bra $Lt_0_24322;\n" +" .loc 16 147 0\n" +" cvt.s32.s64 %r14, %rd6;\n" +" mul.lo.s32 %r15, %r14, %r13;\n" +" cvt.s64.s32 %rd11, %r15;\n" +" mul.wide.s32 %rd12, %r15, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 148 0\n" +" mul.lo.s32 %r16, %r6, %r14;\n" +" cvt.s64.s32 %rd14, %r16;\n" +" mul.wide.s32 %rd15, %r16, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 149 0\n" +" mul.lo.s32 %r17, %r14, %r1;\n" +" bra.uni $Lt_0_24066;\n" +"$Lt_0_24322:\n" +" .loc 16 151 0\n" +" ld.global.s32 %r18, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r18;\n" +" mul.wide.s32 %rd18, %r18, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 152 0\n" +" cvt.s64.s32 %rd20, %r13;\n" +" mul.wide.s32 %rd21, %r13, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 153 0\n" +" mov.s32 %r17, %r1;\n" +" .loc 16 154 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" 
+"$Lt_0_24066:\n" +" .loc 16 157 0\n" +" mov.u32 %r19, %r11;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f25, %f21;\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" .loc 16 158 0\n" +" mov.u32 %r26, %r11;\n" +" mov.s32 %r27, 0;\n" +" mov.u32 %r28, %r27;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}];\n" +" mov.f32 %f33, %f29;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_34818;\n" +" cvt.s64.s32 %rd24, %r17;\n" +" ld.param.f32 %f34, [__cudaparm_kernel_pair_cut_bothsq];\n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.u64 %rd25, __cuda___cuda_local_var_32500_33_non_const_sp_lj120;\n" +"$Lt_0_25090:\n" +" .loc 16 162 0\n" +" ld.global.s32 %r33, [%rd16+0];\n" +" .loc 16 165 0\n" +" shr.s32 %r34, %r33, 30;\n" +" and.b32 %r35, %r34, 3;\n" +" cvt.s64.s32 %rd26, %r35;\n" +" mul.wide.s32 %rd27, %r35, 4;\n" +" add.u64 %rd28, %rd25, %rd27;\n" +" ld.shared.f32 %f40, [%rd28+0];\n" +" .loc 16 166 0\n" +" mov.f32 %f41, 0f3f800000; \n" +" ld.shared.f32 %f42, [%rd28+16];\n" +" sub.ftz.f32 %f43, %f41, %f42;\n" +" .loc 16 169 0\n" +" and.b32 %r36, %r33, 1073741823;\n" +" mov.u32 %r37, %r36;\n" +" mov.s32 %r38, 0;\n" +" mov.u32 %r39, %r38;\n" +" mov.s32 %r40, 0;\n" +" mov.u32 %r41, %r40;\n" +" mov.s32 %r42, 0;\n" +" mov.u32 %r43, %r42;\n" +" tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r37,%r39,%r41,%r43}];\n" +" mov.f32 %f48, %f44;\n" +" mov.f32 %f49, %f45;\n" +" mov.f32 %f50, %f46;\n" +" mov.f32 %f51, %f47;\n" +" sub.ftz.f32 %f52, %f26, %f49;\n" +" sub.ftz.f32 %f53, %f25, %f48;\n" +" sub.ftz.f32 %f54, %f27, %f50;\n" +" mul.ftz.f32 %f55, %f52, %f52;\n" +" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" +" fma.rn.ftz.f32 %f57, %f54, %f54, %f56;\n" +" setp.lt.ftz.f32 %p4, %f57, %f34;\n" +" @!%p4 bra $Lt_0_28418;\n" +" ld.param.f32 %f58, [__cudaparm_kernel_pair_cut_ljsq];\n" +" setp.lt.ftz.f32 %p5, %f57, %f58;\n" +" rcp.approx.ftz.f32 %f59, %f57;\n" +" @!%p5 bra $Lt_0_26114;\n" +" .loc 16 184 0\n" +" mul.ftz.f32 %f60, %f59, %f59;\n" +" mul.ftz.f32 %f61, %f59, %f60;\n" +" mov.f32 %f62, %f61;\n" +" .loc 16 185 0\n" +" cvt.rzi.ftz.s32.f32 %r44, %f51;\n" +" cvt.rzi.ftz.s32.f32 %r45, %f28;\n" +" ld.param.u64 %rd29, [__cudaparm_kernel_pair_lj1];\n" +" ld.param.s32 %r46, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r47, %r46, %r45;\n" +" add.s32 %r48, %r44, %r47;\n" +" cvt.s64.s32 %rd30, %r48;\n" +" mul.wide.s32 %rd31, %r48, 16;\n" +" add.u64 %rd32, %rd29, %rd31;\n" +" mul.ftz.f32 %f63, %f61, %f40;\n" +" ld.global.v2.f32 {%f64,%f65}, [%rd32+0];\n" +" mul.ftz.f32 %f66, %f64, %f61;\n" +" sub.ftz.f32 %f67, %f66, %f65;\n" +" mul.ftz.f32 %f68, %f63, %f67;\n" +" ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_lj_innersq];\n" +" setp.gt.ftz.f32 %p6, %f57, %f69;\n" +" @!%p6 bra $Lt_0_25858;\n" +" .loc 16 191 0\n" +" add.ftz.f32 %f70, %f57, %f57;\n" +" sub.ftz.f32 %f71, %f58, %f57;\n" +" add.ftz.f32 %f72, %f70, %f58;\n" +" mul.ftz.f32 %f73, %f71, %f71;\n" +" mov.f32 %f74, 0f40400000; \n" +" mul.ftz.f32 %f75, %f74, %f69;\n" +" sub.ftz.f32 %f76, %f72, %f75;\n" +" ld.param.f32 %f77, [__cudaparm_kernel_pair_denom_lj];\n" +" 
div.approx.ftz.f32 %f78, %f76, %f77;\n" +" mul.ftz.f32 %f79, %f73, %f78;\n" +" mov.f32 %f80, %f79;\n" +" .loc 16 194 0\n" +" mov.f32 %f81, 0f41400000; \n" +" mul.ftz.f32 %f82, %f57, %f81;\n" +" mul.ftz.f32 %f83, %f71, %f82;\n" +" sub.ftz.f32 %f84, %f57, %f69;\n" +" mul.ftz.f32 %f85, %f83, %f84;\n" +" div.approx.ftz.f32 %f86, %f85, %f77;\n" +" ld.global.v2.f32 {%f87,%f88}, [%rd32+8];\n" +" mul.ftz.f32 %f89, %f87, %f61;\n" +" sub.ftz.f32 %f90, %f89, %f88;\n" +" mul.ftz.f32 %f91, %f61, %f90;\n" +" mul.ftz.f32 %f92, %f86, %f91;\n" +" fma.rn.ftz.f32 %f68, %f68, %f79, %f92;\n" +" bra.uni $Lt_0_25858;\n" +"$Lt_0_26114:\n" +" .loc 16 197 0\n" +" mov.f32 %f68, 0f00000000; \n" +"$Lt_0_25858:\n" +" ld.param.f32 %f93, [__cudaparm_kernel_pair_cut_coulsq];\n" +" setp.gt.ftz.f32 %p7, %f93, %f57;\n" +" @!%p7 bra $Lt_0_27138;\n" +" .loc 16 204 0\n" +" sqrt.approx.ftz.f32 %f94, %f57;\n" +" ld.param.f32 %f95, [__cudaparm_kernel_pair_g_ewald];\n" +" mul.ftz.f32 %f96, %f95, %f94;\n" +" mul.ftz.f32 %f97, %f96, %f96;\n" +" mov.f32 %f98, 0f3f800000; \n" +" mov.f32 %f99, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f100, %f99, %f96, %f98;\n" +" neg.ftz.f32 %f101, %f97;\n" +" rcp.approx.ftz.f32 %f102, %f100;\n" +" mov.f32 %f103, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f104, %f101, %f103;\n" +" ex2.approx.ftz.f32 %f105, %f104;\n" +" mov.f32 %f106, 0f3e827906; \n" +" mov.f32 %f107, 0fbe91a98e; \n" +" mov.f32 %f108, 0f3fb5f0e3; \n" +" mov.f32 %f109, 0fbfba00e3; \n" +" mov.f32 %f110, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f111, %f110, %f102, %f109;\n" +" fma.rn.ftz.f32 %f112, %f102, %f111, %f108;\n" +" fma.rn.ftz.f32 %f113, %f102, %f112, %f107;\n" +" fma.rn.ftz.f32 %f114, %f102, %f113, %f106;\n" +" mul.ftz.f32 %f115, %f102, %f114;\n" +" mul.ftz.f32 %f116, %f105, %f115;\n" +" mov.f32 %f117, %f116;\n" +" .loc 16 205 0\n" +" mov.u32 %r49, %r36;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" tex.1d.v4.f32.s32 {%f118,%f119,%f120,%f121},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f122, %f118;\n" +" ld.param.f32 %f123, [__cudaparm_kernel_pair_qqrd2e];\n" +" mul.ftz.f32 %f124, %f123, %f33;\n" +" mul.ftz.f32 %f125, %f124, %f122;\n" +" div.approx.ftz.f32 %f126, %f125, %f94;\n" +" mov.f32 %f127, %f126;\n" +" .loc 16 206 0\n" +" mov.f32 %f128, 0f3f906ebb; \n" +" mul.ftz.f32 %f129, %f96, %f128;\n" +" fma.rn.ftz.f32 %f130, %f105, %f129, %f116;\n" +" sub.ftz.f32 %f131, %f130, %f43;\n" +" mul.ftz.f32 %f132, %f126, %f131;\n" +" bra.uni $Lt_0_26882;\n" +"$Lt_0_27138:\n" +" .loc 16 208 0\n" +" mov.f32 %f132, 0f00000000; \n" +"$Lt_0_26882:\n" +" .loc 16 212 0\n" +" add.ftz.f32 %f133, %f132, %f68;\n" +" mul.ftz.f32 %f134, %f133, %f59;\n" +" fma.rn.ftz.f32 %f37, %f53, %f134, %f37;\n" +" .loc 16 213 0\n" +" fma.rn.ftz.f32 %f36, %f52, %f134, %f36;\n" +" .loc 16 214 0\n" +" fma.rn.ftz.f32 %f35, %f54, %f134, %f35;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p8, %r56, %r57;\n" +" @%p8 bra $Lt_0_27906;\n" +" .loc 16 217 0\n" +" mov.f32 %f135, %f127;\n" +" mov.f32 %f136, %f117;\n" +" sub.ftz.f32 %f137, %f136, %f43;\n" +" fma.rn.ftz.f32 %f138, %f135, %f137, %f38;\n" +" selp.f32 %f38, %f138, %f38, %p7;\n" +" @!%p5 bra $Lt_0_27906;\n" +" .loc 16 220 0\n" +" cvt.rzi.ftz.s32.f32 %r58, %f51;\n" +" cvt.rzi.ftz.s32.f32 %r59, %f28;\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj1];\n" +" ld.param.s32 %r60, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r61, %r60, %r59;\n" +" add.s32 %r62, %r58, %r61;\n" +" 
cvt.s64.s32 %rd34, %r62;\n" +" mul.wide.s32 %rd35, %r62, 16;\n" +" add.u64 %rd32, %rd33, %rd35;\n" +" mov.f32 %f139, %f62;\n" +" ld.global.v2.f32 {%f140,%f141}, [%rd32+8];\n" +" mul.ftz.f32 %f142, %f140, %f139;\n" +" sub.ftz.f32 %f143, %f142, %f141;\n" +" mul.ftz.f32 %f144, %f139, %f143;\n" +" mov.f32 %f145, %f80;\n" +" mul.ftz.f32 %f146, %f145, %f144;\n" +" ld.param.f32 %f147, [__cudaparm_kernel_pair_cut_lj_innersq];\n" +" setp.lt.ftz.f32 %p9, %f147, %f57;\n" +" selp.f32 %f148, %f146, %f144, %p9;\n" +" .loc 16 223 0\n" +" fma.rn.ftz.f32 %f39, %f40, %f148, %f39;\n" +"$Lt_0_27906:\n" +"$Lt_0_27394:\n" +" ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r64, 0;\n" +" setp.le.s32 %p10, %r63, %r64;\n" +" @%p10 bra $Lt_0_28418;\n" +" .loc 16 227 0\n" +" mov.f32 %f149, %f10;\n" +" mul.ftz.f32 %f150, %f53, %f53;\n" +" fma.rn.ftz.f32 %f151, %f134, %f150, %f149;\n" +" mov.f32 %f10, %f151;\n" +" .loc 16 228 0\n" +" mov.f32 %f152, %f12;\n" +" fma.rn.ftz.f32 %f153, %f134, %f55, %f152;\n" +" mov.f32 %f12, %f153;\n" +" .loc 16 229 0\n" +" mov.f32 %f154, %f14;\n" +" mul.ftz.f32 %f155, %f54, %f54;\n" +" fma.rn.ftz.f32 %f156, %f134, %f155, %f154;\n" +" mov.f32 %f14, %f156;\n" +" .loc 16 230 0\n" +" mov.f32 %f157, %f16;\n" +" mul.ftz.f32 %f158, %f52, %f53;\n" +" fma.rn.ftz.f32 %f159, %f134, %f158, %f157;\n" +" mov.f32 %f16, %f159;\n" +" .loc 16 231 0\n" +" mov.f32 %f160, %f18;\n" +" mul.ftz.f32 %f161, %f53, %f54;\n" +" fma.rn.ftz.f32 %f162, %f134, %f161, %f160;\n" +" mov.f32 %f18, %f162;\n" +" .loc 16 232 0\n" +" mul.ftz.f32 %f163, %f52, %f54;\n" +" fma.rn.ftz.f32 %f19, %f134, %f163, %f19;\n" +" mov.f32 %f20, %f19;\n" +"$Lt_0_28418:\n" +"$Lt_0_25346:\n" +" .loc 16 161 0\n" +" mul.lo.u64 %rd36, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd36;\n" +" setp.lt.u64 %p11, %rd16, %rd13;\n" +" @%p11 bra $Lt_0_25090;\n" +" bra.uni $Lt_0_23554;\n" +"$Lt_0_34818:\n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" bra.uni $Lt_0_23554;\n" +"$Lt_0_23810:\n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +"$Lt_0_23554:\n" +" mov.u32 %r65, 1;\n" +" setp.le.s32 %p12, %r1, %r65;\n" +" @%p12 bra $Lt_0_31234;\n" +" .loc 16 243 0\n" +" mov.u64 %rd37, __cuda___cuda_local_var_32624_35_non_const_red_acc152;\n" +" cvt.s64.s32 %rd38, %r2;\n" +" mul.wide.s32 %rd39, %r2, 4;\n" +" add.u64 %rd40, %rd37, %rd39;\n" +" mov.f32 %f164, %f37;\n" +" st.shared.f32 [%rd40+0], %f164;\n" +" .loc 16 244 0\n" +" mov.f32 %f165, %f36;\n" +" st.shared.f32 [%rd40+512], %f165;\n" +" .loc 16 245 0\n" +" mov.f32 %f166, %f35;\n" +" st.shared.f32 [%rd40+1024], %f166;\n" +" .loc 16 246 0\n" +" mov.f32 %f167, %f39;\n" +" st.shared.f32 [%rd40+1536], %f167;\n" +" .loc 16 247 0\n" +" mov.f32 %f168, %f38;\n" +" st.shared.f32 [%rd40+2048], %f168;\n" +" .loc 16 249 0\n" +" shr.s32 %r66, %r1, 31;\n" +" mov.s32 %r67, 1;\n" +" and.b32 %r68, %r66, %r67;\n" +" add.s32 %r69, %r68, %r1;\n" +" shr.s32 %r70, %r69, 1;\n" +" mov.s32 %r71, %r70;\n" +" mov.u32 %r72, 0;\n" +" setp.ne.u32 %p13, %r70, %r72;\n" +" @!%p13 bra $Lt_0_29698;\n" +"$Lt_0_30210:\n" +" setp.ge.u32 %p14, %r6, %r71;\n" +" @%p14 bra $Lt_0_30466;\n" +" .loc 16 252 0\n" +" add.u32 %r73, %r2, %r71;\n" +" cvt.u64.u32 %rd41, %r73;\n" +" mul.wide.u32 %rd42, %r73, 4;\n" +" add.u64 %rd43, %rd37, %rd42;\n" +" ld.shared.f32 %f169, [%rd43+0];\n" +" add.ftz.f32 %f164, %f169, 
%f164;\n" +" st.shared.f32 [%rd40+0], %f164;\n" +" ld.shared.f32 %f170, [%rd43+512];\n" +" add.ftz.f32 %f165, %f170, %f165;\n" +" st.shared.f32 [%rd40+512], %f165;\n" +" ld.shared.f32 %f171, [%rd43+1024];\n" +" add.ftz.f32 %f166, %f171, %f166;\n" +" st.shared.f32 [%rd40+1024], %f166;\n" +" ld.shared.f32 %f172, [%rd43+1536];\n" +" add.ftz.f32 %f167, %f172, %f167;\n" +" st.shared.f32 [%rd40+1536], %f167;\n" +" ld.shared.f32 %f173, [%rd43+2048];\n" +" add.ftz.f32 %f168, %f173, %f168;\n" +" st.shared.f32 [%rd40+2048], %f168;\n" +"$Lt_0_30466:\n" +" .loc 16 249 0\n" +" shr.u32 %r71, %r71, 1;\n" +" mov.u32 %r74, 0;\n" +" setp.ne.u32 %p15, %r71, %r74;\n" +" @%p15 bra $Lt_0_30210;\n" +"$Lt_0_29698:\n" +" .loc 16 256 0\n" +" mov.f32 %f37, %f164;\n" +" .loc 16 257 0\n" +" mov.f32 %f36, %f165;\n" +" .loc 16 258 0\n" +" mov.f32 %f35, %f166;\n" +" .loc 16 259 0\n" +" mov.f32 %f39, %f167;\n" +" .loc 16 260 0\n" +" mov.f32 %f38, %f168;\n" +" ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r76, 0;\n" +" setp.le.s32 %p16, %r75, %r76;\n" +" @%p16 bra $Lt_0_31234;\n" +" .loc 16 264 0\n" +" mov.f32 %f164, %f10;\n" +" st.shared.f32 [%rd40+0], %f164;\n" +" mov.f32 %f165, %f12;\n" +" st.shared.f32 [%rd40+512], %f165;\n" +" mov.f32 %f166, %f14;\n" +" st.shared.f32 [%rd40+1024], %f166;\n" +" mov.f32 %f167, %f16;\n" +" st.shared.f32 [%rd40+1536], %f167;\n" +" mov.f32 %f168, %f18;\n" +" st.shared.f32 [%rd40+2048], %f168;\n" +" mov.f32 %f174, %f20;\n" +" st.shared.f32 [%rd40+2560], %f174;\n" +" .loc 16 266 0\n" +" mov.s32 %r77, %r70;\n" +" @!%p13 bra $Lt_0_31746;\n" +"$Lt_0_32258:\n" +" setp.ge.u32 %p17, %r6, %r77;\n" +" @%p17 bra $Lt_0_32514;\n" +" .loc 16 269 0\n" +" add.u32 %r78, %r2, %r77;\n" +" cvt.u64.u32 %rd44, %r78;\n" +" mul.wide.u32 %rd45, %r78, 4;\n" +" add.u64 %rd46, %rd37, %rd45;\n" +" ld.shared.f32 %f175, [%rd46+0];\n" +" add.ftz.f32 %f164, %f175, %f164;\n" +" st.shared.f32 [%rd40+0], %f164;\n" +" ld.shared.f32 %f176, [%rd46+512];\n" +" add.ftz.f32 %f165, %f176, %f165;\n" +" st.shared.f32 [%rd40+512], %f165;\n" +" ld.shared.f32 %f177, [%rd46+1024];\n" +" add.ftz.f32 %f166, %f177, %f166;\n" +" st.shared.f32 [%rd40+1024], %f166;\n" +" ld.shared.f32 %f178, [%rd46+1536];\n" +" add.ftz.f32 %f167, %f178, %f167;\n" +" st.shared.f32 [%rd40+1536], %f167;\n" +" ld.shared.f32 %f179, [%rd46+2048];\n" +" add.ftz.f32 %f168, %f179, %f168;\n" +" st.shared.f32 [%rd40+2048], %f168;\n" +" ld.shared.f32 %f180, [%rd46+2560];\n" +" add.ftz.f32 %f174, %f180, %f174;\n" +" st.shared.f32 [%rd40+2560], %f174;\n" +"$Lt_0_32514:\n" +" .loc 16 266 0\n" +" shr.u32 %r77, %r77, 1;\n" +" mov.u32 %r79, 0;\n" +" setp.ne.u32 %p18, %r77, %r79;\n" +" @%p18 bra $Lt_0_32258;\n" +"$Lt_0_31746:\n" +" .loc 16 274 0\n" +" mov.f32 %f10, %f164;\n" +" mov.f32 %f12, %f165;\n" +" mov.f32 %f14, %f166;\n" +" mov.f32 %f16, %f167;\n" +" mov.f32 %f18, %f168;\n" +" mov.f32 %f20, %f174;\n" +"$Lt_0_31234:\n" +"$Lt_0_29186:\n" +" selp.s32 %r80, 1, 0, %p1;\n" +" mov.s32 %r81, 0;\n" +" set.eq.u32.s32 %r82, %r6, %r81;\n" +" neg.s32 %r83, %r82;\n" +" and.b32 %r84, %r80, %r83;\n" +" mov.u32 %r85, 0;\n" +" setp.eq.s32 %p19, %r84, %r85;\n" +" @%p19 bra $Lt_0_33282;\n" +" .loc 16 280 0\n" +" cvt.s64.s32 %rd47, %r9;\n" +" ld.param.u64 %rd48, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd49, %r9, 4;\n" +" add.u64 %rd50, %rd48, %rd49;\n" +" ld.param.s32 %r86, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r87, 0;\n" +" setp.le.s32 %p20, %r86, %r87;\n" +" @%p20 bra $Lt_0_33794;\n" +" .loc 16 282 0\n" +" st.global.f32 [%rd50+0], %f39;\n" +" .loc 16 
283 0\n" +" cvt.s64.s32 %rd51, %r10;\n" +" mul.wide.s32 %rd52, %r10, 4;\n" +" add.u64 %rd53, %rd52, %rd50;\n" +" .loc 16 284 0\n" +" st.global.f32 [%rd53+0], %f38;\n" +" .loc 16 285 0\n" +" add.u64 %rd50, %rd52, %rd53;\n" +"$Lt_0_33794:\n" +" ld.param.s32 %r88, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r89, 0;\n" +" setp.le.s32 %p21, %r88, %r89;\n" +" @%p21 bra $Lt_0_34306;\n" +" .loc 16 289 0\n" +" mov.f32 %f181, %f10;\n" +" st.global.f32 [%rd50+0], %f181;\n" +" .loc 16 290 0\n" +" cvt.s64.s32 %rd54, %r10;\n" +" mul.wide.s32 %rd55, %r10, 4;\n" +" add.u64 %rd56, %rd55, %rd50;\n" +" .loc 16 289 0\n" +" mov.f32 %f182, %f12;\n" +" st.global.f32 [%rd56+0], %f182;\n" +" .loc 16 290 0\n" +" add.u64 %rd57, %rd55, %rd56;\n" +" .loc 16 289 0\n" +" mov.f32 %f183, %f14;\n" +" st.global.f32 [%rd57+0], %f183;\n" +" .loc 16 290 0\n" +" add.u64 %rd58, %rd55, %rd57;\n" +" .loc 16 289 0\n" +" mov.f32 %f184, %f16;\n" +" st.global.f32 [%rd58+0], %f184;\n" +" .loc 16 290 0\n" +" add.u64 %rd50, %rd55, %rd58;\n" +" .loc 16 289 0\n" +" mov.f32 %f185, %f18;\n" +" st.global.f32 [%rd50+0], %f185;\n" +" mov.f32 %f186, %f20;\n" +" add.u64 %rd59, %rd55, %rd50;\n" +" st.global.f32 [%rd59+0], %f186;\n" +"$Lt_0_34306:\n" +" .loc 16 293 0\n" +" ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd61, %rd47, 16;\n" +" add.u64 %rd62, %rd60, %rd61;\n" +" mov.f32 %f187, %f188;\n" +" st.global.v4.f32 [%rd62+0], {%f37,%f36,%f35,%f187};\n" +"$Lt_0_33282:\n" +" .loc 16 295 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ljd_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" +" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" +" .param .f32 __cudaparm_kernel_pair_fast_denom_lj,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_bothsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_ljsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_lj_innersq,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<86>;\n" +" .reg .u64 %rd<71>;\n" +" .reg .f32 %f<196>;\n" +" .reg .pred %p<25>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32697_33_non_const_sp_lj3336[32];\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32696_34_non_const_ljd3368[1024];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32826_35_non_const_red_acc4392[3072];\n" +" .loc 16 307 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" cvt.s64.s32 %rd1, %r1;\n" +" mov.u32 %r2, 7;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_25090;\n" +" .loc 16 316 0\n" +" mov.u64 %rd2, __cuda___cuda_local_var_32697_33_non_const_sp_lj3336;\n" +" mul.lo.u64 %rd3, %rd1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd2;\n" +" st.shared.f32 [%rd6+0], %f1;\n" 
+"$Lt_1_25090:\n" +" mov.u64 %rd2, __cuda___cuda_local_var_32697_33_non_const_sp_lj3336;\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32696_34_non_const_ljd3368;\n" +" .loc 16 317 0\n" +" mul.lo.u64 %rd8, %rd1, 8;\n" +" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_ljd_in];\n" +" add.u64 %rd10, %rd9, %rd8;\n" +" add.u64 %rd11, %rd8, %rd7;\n" +" ld.global.v2.f32 {%f2,%f3}, [%rd10+0];\n" +" st.shared.v2.f32 [%rd11+0], {%f2,%f3};\n" +" add.s32 %r3, %r1, 128;\n" +" mov.u32 %r4, 127;\n" +" setp.gt.s32 %p2, %r3, %r4;\n" +" @%p2 bra $Lt_1_25602;\n" +" ld.global.v2.f32 {%f4,%f5}, [%rd10+1024];\n" +" st.shared.v2.f32 [%rd11+1024], {%f4,%f5};\n" +"$Lt_1_25602:\n" +" .loc 16 329 0\n" +" mov.f32 %f6, 0f00000000; \n" +" mov.f32 %f7, %f6;\n" +" mov.f32 %f8, 0f00000000; \n" +" mov.f32 %f9, %f8;\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" .loc 16 331 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r5, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r6, %r1, %r5;\n" +" cvt.s32.u32 %r7, %ntid.x;\n" +" div.s32 %r8, %r7, %r5;\n" +" rem.s32 %r9, %r1, %r5;\n" +" cvt.s32.u32 %r10, %ctaid.x;\n" +" mul.lo.s32 %r11, %r10, %r8;\n" +" add.s32 %r12, %r6, %r11;\n" +" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p3, %r12, %r13;\n" +" @!%p3 bra $Lt_1_26370;\n" +" .loc 16 335 0\n" +" cvt.s64.s32 %rd12, %r12;\n" +" mul.wide.s32 %rd13, %r12, 4;\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd15, %rd13, %rd14;\n" +" ld.global.s32 %r14, [%rd15+0];\n" +" .loc 16 337 0\n" +" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd16, %r15;\n" +" mul.wide.s32 %rd17, %r15, 4;\n" +" add.u64 %rd18, %rd17, %rd15;\n" +" ld.global.s32 %r16, [%rd18+0];\n" +" add.u64 %rd19, %rd17, %rd18;\n" +" ld.param.u64 %rd20, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p4, %rd20, %rd14;\n" +" @%p4 bra $Lt_1_26882;\n" +" .loc 16 343 0\n" +" cvt.s32.s64 %r17, %rd16;\n" +" mul.lo.s32 %r18, %r17, %r16;\n" +" cvt.s64.s32 %rd21, %r18;\n" +" mul.wide.s32 %rd22, %r18, 4;\n" +" add.u64 %rd23, %rd19, %rd22;\n" +" .loc 16 344 0\n" +" mul.lo.s32 %r19, %r9, %r17;\n" +" cvt.s64.s32 %rd24, %r19;\n" +" mul.wide.s32 %rd25, %r19, 4;\n" +" add.u64 %rd26, %rd19, %rd25;\n" +" .loc 16 345 0\n" +" mul.lo.s32 %r20, %r17, %r5;\n" +" bra.uni $Lt_1_26626;\n" +"$Lt_1_26882:\n" +" .loc 16 347 0\n" +" ld.global.s32 %r21, [%rd19+0];\n" +" cvt.s64.s32 %rd27, %r21;\n" +" mul.wide.s32 %rd28, %r21, 4;\n" +" add.u64 %rd29, %rd20, %rd28;\n" +" .loc 16 348 0\n" +" cvt.s64.s32 %rd30, %r16;\n" +" mul.wide.s32 %rd31, %r16, 4;\n" +" add.u64 %rd23, %rd29, %rd31;\n" +" .loc 16 349 0\n" +" mov.s32 %r20, %r5;\n" +" .loc 16 350 0\n" +" cvt.s64.s32 %rd32, %r9;\n" +" mul.wide.s32 %rd33, %r9, 4;\n" +" add.u64 %rd26, %rd29, %rd33;\n" +"$Lt_1_26626:\n" +" .loc 16 353 0\n" +" mov.u32 %r22, %r14;\n" +" mov.s32 %r23, 0;\n" +" mov.u32 %r24, %r23;\n" +" mov.s32 %r25, 0;\n" +" mov.u32 %r26, %r25;\n" +" mov.s32 %r27, 0;\n" +" mov.u32 %r28, %r27;\n" +" tex.1d.v4.f32.s32 {%f18,%f19,%f20,%f21},[pos_tex,{%r22,%r24,%r26,%r28}];\n" +" mov.f32 %f22, %f18;\n" +" mov.f32 %f23, %f19;\n" +" mov.f32 %f24, %f20;\n" +" mov.f32 %f25, %f21;\n" +" .loc 16 354 0\n" +" mov.u32 %r29, %r14;\n" +" mov.s32 %r30, 0;\n" +" mov.u32 %r31, %r30;\n" +" mov.s32 %r32, 0;\n" +" mov.u32 %r33, %r32;\n" +" mov.s32 %r34, 0;\n" +" mov.u32 
%r35, %r34;\n" +" tex.1d.v4.f32.s32 {%f26,%f27,%f28,%f29},[q_tex,{%r29,%r31,%r33,%r35}];\n" +" mov.f32 %f30, %f26;\n" +" setp.ge.u64 %p5, %rd26, %rd23;\n" +" @%p5 bra $Lt_1_37378;\n" +" cvt.rzi.ftz.s32.f32 %r36, %f25;\n" +" cvt.s64.s32 %rd34, %r20;\n" +" ld.param.f32 %f31, [__cudaparm_kernel_pair_fast_cut_bothsq];\n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +"$Lt_1_27650:\n" +" .loc 16 358 0\n" +" ld.global.s32 %r37, [%rd26+0];\n" +" .loc 16 361 0\n" +" shr.s32 %r38, %r37, 30;\n" +" and.b32 %r39, %r38, 3;\n" +" cvt.s64.s32 %rd35, %r39;\n" +" mul.wide.s32 %rd36, %r39, 4;\n" +" add.u64 %rd37, %rd2, %rd36;\n" +" ld.shared.f32 %f37, [%rd37+0];\n" +" .loc 16 362 0\n" +" mov.f32 %f38, 0f3f800000; \n" +" ld.shared.f32 %f39, [%rd37+16];\n" +" sub.ftz.f32 %f40, %f38, %f39;\n" +" .loc 16 365 0\n" +" and.b32 %r40, %r37, 1073741823;\n" +" mov.u32 %r41, %r40;\n" +" mov.s32 %r42, 0;\n" +" mov.u32 %r43, %r42;\n" +" mov.s32 %r44, 0;\n" +" mov.u32 %r45, %r44;\n" +" mov.s32 %r46, 0;\n" +" mov.u32 %r47, %r46;\n" +" tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r41,%r43,%r45,%r47}];\n" +" mov.f32 %f45, %f41;\n" +" mov.f32 %f46, %f42;\n" +" mov.f32 %f47, %f43;\n" +" mov.f32 %f48, %f44;\n" +" sub.ftz.f32 %f49, %f23, %f46;\n" +" sub.ftz.f32 %f50, %f22, %f45;\n" +" sub.ftz.f32 %f51, %f24, %f47;\n" +" mul.ftz.f32 %f52, %f49, %f49;\n" +" fma.rn.ftz.f32 %f53, %f50, %f50, %f52;\n" +" fma.rn.ftz.f32 %f54, %f51, %f51, %f53;\n" +" setp.lt.ftz.f32 %p6, %f54, %f31;\n" +" @!%p6 bra $Lt_1_30978;\n" +" ld.param.f32 %f55, [__cudaparm_kernel_pair_fast_cut_ljsq];\n" +" setp.lt.ftz.f32 %p7, %f54, %f55;\n" +" rcp.approx.ftz.f32 %f56, %f54;\n" +" @!%p7 bra $Lt_1_28674;\n" +" .loc 16 380 0\n" +" cvt.rzi.ftz.s32.f32 %r48, %f48;\n" +" cvt.s64.s32 %rd38, %r36;\n" +" mul.wide.s32 %rd39, %r36, 8;\n" +" add.u64 %rd40, %rd7, %rd39;\n" +" cvt.s64.s32 %rd41, %r48;\n" +" mul.wide.s32 %rd42, %r48, 8;\n" +" add.u64 %rd43, %rd7, %rd42;\n" +" ld.shared.v2.f32 {%f57,%f58}, [%rd40+0];\n" +" ld.shared.v2.f32 {%f59,%f60}, [%rd43+0];\n" +" mul.ftz.f32 %f61, %f57, %f59;\n" +" .loc 16 381 0\n" +" add.ftz.f32 %f62, %f58, %f60;\n" +" mov.f32 %f63, 0f3f000000; \n" +" mul.ftz.f32 %f64, %f62, %f63;\n" +" .loc 16 385 0\n" +" mul.ftz.f32 %f65, %f64, %f64;\n" +" sqrt.approx.ftz.f32 %f66, %f61;\n" +" mov.f32 %f67, 0f40800000; \n" +" mul.ftz.f32 %f68, %f66, %f67;\n" +" mul.ftz.f32 %f69, %f65, %f56;\n" +" mul.ftz.f32 %f70, %f69, %f69;\n" +" mul.ftz.f32 %f71, %f69, %f70;\n" +" mul.ftz.f32 %f72, %f68, %f71;\n" +" mov.f32 %f73, %f72;\n" +" .loc 16 386 0\n" +" mul.ftz.f32 %f74, %f71, %f72;\n" +" mov.f32 %f75, %f74;\n" +" .loc 16 387 0\n" +" mov.f32 %f76, 0f40c00000; \n" +" mul.ftz.f32 %f77, %f72, %f76;\n" +" mov.f32 %f78, 0f41400000; \n" +" mul.ftz.f32 %f79, %f78, %f74;\n" +" sub.ftz.f32 %f80, %f79, %f77;\n" +" mul.ftz.f32 %f81, %f37, %f80;\n" +" ld.param.f32 %f82, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n" +" setp.gt.ftz.f32 %p8, %f54, %f82;\n" +" @!%p8 bra $Lt_1_28418;\n" +" .loc 16 393 0\n" +" add.ftz.f32 %f83, %f54, %f54;\n" +" sub.ftz.f32 %f84, %f55, %f54;\n" +" add.ftz.f32 %f85, %f83, %f55;\n" +" mul.ftz.f32 %f86, %f84, %f84;\n" +" mov.f32 %f87, 0f40400000; \n" +" mul.ftz.f32 %f88, %f87, %f82;\n" +" sub.ftz.f32 %f89, %f85, %f88;\n" +" ld.param.f32 %f90, [__cudaparm_kernel_pair_fast_denom_lj];\n" +" div.approx.ftz.f32 %f91, %f89, %f90;\n" +" mul.ftz.f32 %f92, %f86, %f91;\n" +" mov.f32 %f93, %f92;\n" +" .loc 16 396 0\n" +" mov.f32 
%f94, 0f41400000; \n" +" mul.ftz.f32 %f95, %f54, %f94;\n" +" mul.ftz.f32 %f96, %f84, %f95;\n" +" sub.ftz.f32 %f97, %f54, %f82;\n" +" mul.ftz.f32 %f98, %f96, %f97;\n" +" div.approx.ftz.f32 %f99, %f98, %f90;\n" +" sub.ftz.f32 %f100, %f74, %f72;\n" +" mul.ftz.f32 %f101, %f99, %f100;\n" +" fma.rn.ftz.f32 %f81, %f81, %f92, %f101;\n" +" bra.uni $Lt_1_28418;\n" +"$Lt_1_28674:\n" +" .loc 16 399 0\n" +" mov.f32 %f81, 0f00000000; \n" +"$Lt_1_28418:\n" +" ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" +" setp.gt.ftz.f32 %p9, %f102, %f54;\n" +" @!%p9 bra $Lt_1_29698;\n" +" .loc 16 406 0\n" +" sqrt.approx.ftz.f32 %f103, %f54;\n" +" ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_g_ewald];\n" +" mul.ftz.f32 %f105, %f104, %f103;\n" +" mul.ftz.f32 %f106, %f105, %f105;\n" +" mov.f32 %f107, 0f3f800000; \n" +" mov.f32 %f108, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f109, %f108, %f105, %f107;\n" +" neg.ftz.f32 %f110, %f106;\n" +" rcp.approx.ftz.f32 %f111, %f109;\n" +" mov.f32 %f112, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f113, %f110, %f112;\n" +" ex2.approx.ftz.f32 %f114, %f113;\n" +" mov.f32 %f115, 0f3e827906; \n" +" mov.f32 %f116, 0fbe91a98e; \n" +" mov.f32 %f117, 0f3fb5f0e3; \n" +" mov.f32 %f118, 0fbfba00e3; \n" +" mov.f32 %f119, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f120, %f119, %f111, %f118;\n" +" fma.rn.ftz.f32 %f121, %f111, %f120, %f117;\n" +" fma.rn.ftz.f32 %f122, %f111, %f121, %f116;\n" +" fma.rn.ftz.f32 %f123, %f111, %f122, %f115;\n" +" mul.ftz.f32 %f124, %f111, %f123;\n" +" mul.ftz.f32 %f125, %f114, %f124;\n" +" mov.f32 %f126, %f125;\n" +" .loc 16 407 0\n" +" mov.u32 %r49, %r40;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" tex.1d.v4.f32.s32 {%f127,%f128,%f129,%f130},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f131, %f127;\n" +" ld.param.f32 %f132, [__cudaparm_kernel_pair_fast_qqrd2e];\n" +" mul.ftz.f32 %f133, %f132, %f30;\n" +" mul.ftz.f32 %f134, %f133, %f131;\n" +" div.approx.ftz.f32 %f135, %f134, %f103;\n" +" mov.f32 %f136, %f135;\n" +" .loc 16 408 0\n" +" mov.f32 %f137, 0f3f906ebb; \n" +" mul.ftz.f32 %f138, %f105, %f137;\n" +" fma.rn.ftz.f32 %f139, %f114, %f138, %f125;\n" +" sub.ftz.f32 %f140, %f139, %f40;\n" +" mul.ftz.f32 %f141, %f135, %f140;\n" +" bra.uni $Lt_1_29442;\n" +"$Lt_1_29698:\n" +" .loc 16 410 0\n" +" mov.f32 %f141, 0f00000000; \n" +"$Lt_1_29442:\n" +" .loc 16 414 0\n" +" add.ftz.f32 %f142, %f141, %f81;\n" +" mul.ftz.f32 %f143, %f142, %f56;\n" +" fma.rn.ftz.f32 %f34, %f50, %f143, %f34;\n" +" .loc 16 415 0\n" +" fma.rn.ftz.f32 %f33, %f49, %f143, %f33;\n" +" .loc 16 416 0\n" +" fma.rn.ftz.f32 %f32, %f51, %f143, %f32;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p10, %r56, %r57;\n" +" @%p10 bra $Lt_1_30466;\n" +" .loc 16 419 0\n" +" mov.f32 %f144, %f136;\n" +" mov.f32 %f145, %f126;\n" +" sub.ftz.f32 %f146, %f145, %f40;\n" +" fma.rn.ftz.f32 %f147, %f144, %f146, %f35;\n" +" selp.f32 %f35, %f147, %f35, %p9;\n" +" @!%p7 bra $Lt_1_30466;\n" +" .loc 16 425 0\n" +" mov.f32 %f148, %f75;\n" +" mov.f32 %f149, %f73;\n" +" sub.ftz.f32 %f150, %f148, %f149;\n" +" mov.f32 %f151, %f93;\n" +" mul.ftz.f32 %f152, %f151, %f150;\n" +" ld.param.f32 %f153, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n" +" setp.lt.ftz.f32 %p11, %f153, %f54;\n" +" selp.f32 %f154, %f152, %f150, %p11;\n" +" fma.rn.ftz.f32 %f36, %f37, %f154, %f36;\n" +"$Lt_1_30466:\n" +"$Lt_1_29954:\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 
%r59, 0;\n" +" setp.le.s32 %p12, %r58, %r59;\n" +" @%p12 bra $Lt_1_30978;\n" +" .loc 16 429 0\n" +" mov.f32 %f155, %f7;\n" +" mul.ftz.f32 %f156, %f50, %f50;\n" +" fma.rn.ftz.f32 %f157, %f143, %f156, %f155;\n" +" mov.f32 %f7, %f157;\n" +" .loc 16 430 0\n" +" mov.f32 %f158, %f9;\n" +" fma.rn.ftz.f32 %f159, %f143, %f52, %f158;\n" +" mov.f32 %f9, %f159;\n" +" .loc 16 431 0\n" +" mov.f32 %f160, %f11;\n" +" mul.ftz.f32 %f161, %f51, %f51;\n" +" fma.rn.ftz.f32 %f162, %f143, %f161, %f160;\n" +" mov.f32 %f11, %f162;\n" +" .loc 16 432 0\n" +" mov.f32 %f163, %f13;\n" +" mul.ftz.f32 %f164, %f49, %f50;\n" +" fma.rn.ftz.f32 %f165, %f143, %f164, %f163;\n" +" mov.f32 %f13, %f165;\n" +" .loc 16 433 0\n" +" mov.f32 %f166, %f15;\n" +" mul.ftz.f32 %f167, %f50, %f51;\n" +" fma.rn.ftz.f32 %f168, %f143, %f167, %f166;\n" +" mov.f32 %f15, %f168;\n" +" .loc 16 434 0\n" +" mul.ftz.f32 %f169, %f49, %f51;\n" +" fma.rn.ftz.f32 %f16, %f143, %f169, %f16;\n" +" mov.f32 %f17, %f16;\n" +"$Lt_1_30978:\n" +"$Lt_1_27906:\n" +" .loc 16 357 0\n" +" mul.lo.u64 %rd44, %rd34, 4;\n" +" add.u64 %rd26, %rd26, %rd44;\n" +" setp.lt.u64 %p13, %rd26, %rd23;\n" +" @%p13 bra $Lt_1_27650;\n" +" bra.uni $Lt_1_26114;\n" +"$Lt_1_37378:\n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" bra.uni $Lt_1_26114;\n" +"$Lt_1_26370:\n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +"$Lt_1_26114:\n" +" mov.u32 %r60, 1;\n" +" setp.le.s32 %p14, %r5, %r60;\n" +" @%p14 bra $Lt_1_33794;\n" +" .loc 16 445 0\n" +" mov.u64 %rd45, __cuda___cuda_local_var_32826_35_non_const_red_acc4392;\n" +" mul.lo.u64 %rd46, %rd1, 4;\n" +" add.u64 %rd47, %rd45, %rd46;\n" +" mov.f32 %f170, %f34;\n" +" st.shared.f32 [%rd47+0], %f170;\n" +" .loc 16 446 0\n" +" mov.f32 %f171, %f33;\n" +" st.shared.f32 [%rd47+512], %f171;\n" +" .loc 16 447 0\n" +" mov.f32 %f172, %f32;\n" +" st.shared.f32 [%rd47+1024], %f172;\n" +" .loc 16 448 0\n" +" mov.f32 %f173, %f36;\n" +" st.shared.f32 [%rd47+1536], %f173;\n" +" .loc 16 449 0\n" +" mov.f32 %f174, %f35;\n" +" st.shared.f32 [%rd47+2048], %f174;\n" +" .loc 16 451 0\n" +" shr.s32 %r61, %r5, 31;\n" +" mov.s32 %r62, 1;\n" +" and.b32 %r63, %r61, %r62;\n" +" add.s32 %r64, %r63, %r5;\n" +" shr.s32 %r65, %r64, 1;\n" +" mov.s32 %r66, %r65;\n" +" mov.u32 %r67, 0;\n" +" setp.ne.u32 %p15, %r65, %r67;\n" +" @!%p15 bra $Lt_1_32258;\n" +"$Lt_1_32770:\n" +" setp.ge.u32 %p16, %r9, %r66;\n" +" @%p16 bra $Lt_1_33026;\n" +" .loc 16 454 0\n" +" add.u32 %r68, %r1, %r66;\n" +" cvt.u64.u32 %rd48, %r68;\n" +" mul.wide.u32 %rd49, %r68, 4;\n" +" add.u64 %rd50, %rd45, %rd49;\n" +" ld.shared.f32 %f175, [%rd50+0];\n" +" add.ftz.f32 %f170, %f175, %f170;\n" +" st.shared.f32 [%rd47+0], %f170;\n" +" ld.shared.f32 %f176, [%rd50+512];\n" +" add.ftz.f32 %f171, %f176, %f171;\n" +" st.shared.f32 [%rd47+512], %f171;\n" +" ld.shared.f32 %f177, [%rd50+1024];\n" +" add.ftz.f32 %f172, %f177, %f172;\n" +" st.shared.f32 [%rd47+1024], %f172;\n" +" ld.shared.f32 %f178, [%rd50+1536];\n" +" add.ftz.f32 %f173, %f178, %f173;\n" +" st.shared.f32 [%rd47+1536], %f173;\n" +" ld.shared.f32 %f179, [%rd50+2048];\n" +" add.ftz.f32 %f174, %f179, %f174;\n" +" st.shared.f32 [%rd47+2048], %f174;\n" +"$Lt_1_33026:\n" +" .loc 16 451 0\n" +" shr.u32 %r66, %r66, 1;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p17, %r66, %r69;\n" +" @%p17 bra $Lt_1_32770;\n" +"$Lt_1_32258:\n" +" .loc 
16 458 0\n" +" mov.f32 %f34, %f170;\n" +" .loc 16 459 0\n" +" mov.f32 %f33, %f171;\n" +" .loc 16 460 0\n" +" mov.f32 %f32, %f172;\n" +" .loc 16 461 0\n" +" mov.f32 %f36, %f173;\n" +" .loc 16 462 0\n" +" mov.f32 %f35, %f174;\n" +" ld.param.s32 %r70, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r71, 0;\n" +" setp.le.s32 %p18, %r70, %r71;\n" +" @%p18 bra $Lt_1_33794;\n" +" .loc 16 466 0\n" +" mov.f32 %f170, %f7;\n" +" st.shared.f32 [%rd47+0], %f170;\n" +" mov.f32 %f171, %f9;\n" +" st.shared.f32 [%rd47+512], %f171;\n" +" mov.f32 %f172, %f11;\n" +" st.shared.f32 [%rd47+1024], %f172;\n" +" mov.f32 %f173, %f13;\n" +" st.shared.f32 [%rd47+1536], %f173;\n" +" mov.f32 %f174, %f15;\n" +" st.shared.f32 [%rd47+2048], %f174;\n" +" mov.f32 %f180, %f17;\n" +" st.shared.f32 [%rd47+2560], %f180;\n" +" .loc 16 468 0\n" +" mov.s32 %r72, %r65;\n" +" @!%p15 bra $Lt_1_34306;\n" +"$Lt_1_34818:\n" +" setp.ge.u32 %p19, %r9, %r72;\n" +" @%p19 bra $Lt_1_35074;\n" +" .loc 16 471 0\n" +" add.u32 %r73, %r1, %r72;\n" +" cvt.u64.u32 %rd51, %r73;\n" +" mul.wide.u32 %rd52, %r73, 4;\n" +" add.u64 %rd53, %rd45, %rd52;\n" +" ld.shared.f32 %f181, [%rd53+0];\n" +" add.ftz.f32 %f170, %f181, %f170;\n" +" st.shared.f32 [%rd47+0], %f170;\n" +" ld.shared.f32 %f182, [%rd53+512];\n" +" add.ftz.f32 %f171, %f182, %f171;\n" +" st.shared.f32 [%rd47+512], %f171;\n" +" ld.shared.f32 %f183, [%rd53+1024];\n" +" add.ftz.f32 %f172, %f183, %f172;\n" +" st.shared.f32 [%rd47+1024], %f172;\n" +" ld.shared.f32 %f184, [%rd53+1536];\n" +" add.ftz.f32 %f173, %f184, %f173;\n" +" st.shared.f32 [%rd47+1536], %f173;\n" +" ld.shared.f32 %f185, [%rd53+2048];\n" +" add.ftz.f32 %f174, %f185, %f174;\n" +" st.shared.f32 [%rd47+2048], %f174;\n" +" ld.shared.f32 %f186, [%rd53+2560];\n" +" add.ftz.f32 %f180, %f186, %f180;\n" +" st.shared.f32 [%rd47+2560], %f180;\n" +"$Lt_1_35074:\n" +" .loc 16 468 0\n" +" shr.u32 %r72, %r72, 1;\n" +" mov.u32 %r74, 0;\n" +" setp.ne.u32 %p20, %r72, %r74;\n" +" @%p20 bra $Lt_1_34818;\n" +"$Lt_1_34306:\n" +" .loc 16 476 0\n" +" mov.f32 %f7, %f170;\n" +" mov.f32 %f9, %f171;\n" +" mov.f32 %f11, %f172;\n" +" mov.f32 %f13, %f173;\n" +" mov.f32 %f15, %f174;\n" +" mov.f32 %f17, %f180;\n" +"$Lt_1_33794:\n" +"$Lt_1_31746:\n" +" selp.s32 %r75, 1, 0, %p3;\n" +" mov.s32 %r76, 0;\n" +" set.eq.u32.s32 %r77, %r9, %r76;\n" +" neg.s32 %r78, %r77;\n" +" and.b32 %r79, %r75, %r78;\n" +" mov.u32 %r80, 0;\n" +" setp.eq.s32 %p21, %r79, %r80;\n" +" @%p21 bra $Lt_1_35842;\n" +" .loc 16 482 0\n" +" cvt.s64.s32 %rd54, %r12;\n" +" ld.param.u64 %rd55, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd56, %r12, 4;\n" +" add.u64 %rd57, %rd55, %rd56;\n" +" ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r82, 0;\n" +" setp.le.s32 %p22, %r81, %r82;\n" +" @%p22 bra $Lt_1_36354;\n" +" .loc 16 484 0\n" +" st.global.f32 [%rd57+0], %f36;\n" +" .loc 16 485 0\n" +" cvt.s64.s32 %rd58, %r13;\n" +" mul.wide.s32 %rd59, %r13, 4;\n" +" add.u64 %rd60, %rd59, %rd57;\n" +" .loc 16 486 0\n" +" st.global.f32 [%rd60+0], %f35;\n" +" .loc 16 487 0\n" +" add.u64 %rd57, %rd59, %rd60;\n" +"$Lt_1_36354:\n" +" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r84, 0;\n" +" setp.le.s32 %p23, %r83, %r84;\n" +" @%p23 bra $Lt_1_36866;\n" +" .loc 16 491 0\n" +" mov.f32 %f187, %f7;\n" +" st.global.f32 [%rd57+0], %f187;\n" +" .loc 16 492 0\n" +" cvt.s64.s32 %rd61, %r13;\n" +" mul.wide.s32 %rd62, %r13, 4;\n" +" add.u64 %rd63, %rd62, %rd57;\n" +" .loc 16 491 0\n" +" mov.f32 %f188, %f9;\n" +" st.global.f32 [%rd63+0], %f188;\n" +" .loc 16 492 0\n" 
+" add.u64 %rd64, %rd62, %rd63;\n" +" .loc 16 491 0\n" +" mov.f32 %f189, %f11;\n" +" st.global.f32 [%rd64+0], %f189;\n" +" .loc 16 492 0\n" +" add.u64 %rd65, %rd62, %rd64;\n" +" .loc 16 491 0\n" +" mov.f32 %f190, %f13;\n" +" st.global.f32 [%rd65+0], %f190;\n" +" .loc 16 492 0\n" +" add.u64 %rd57, %rd62, %rd65;\n" +" .loc 16 491 0\n" +" mov.f32 %f191, %f15;\n" +" st.global.f32 [%rd57+0], %f191;\n" +" mov.f32 %f192, %f17;\n" +" add.u64 %rd66, %rd62, %rd57;\n" +" st.global.f32 [%rd66+0], %f192;\n" +"$Lt_1_36866:\n" +" .loc 16 495 0\n" +" ld.param.u64 %rd67, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd68, %rd54, 16;\n" +" add.u64 %rd69, %rd67, %rd68;\n" +" mov.f32 %f193, %f194;\n" +" st.global.v4.f32 [%rd69+0], {%f34,%f33,%f32,%f193};\n" +"$Lt_1_35842:\n" +" .loc 16 497 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp new file mode 100644 index 000000000..4ead77760 --- /dev/null +++ b/lib/gpu/lal_answer.cpp @@ -0,0 +1,406 @@ +/*************************************************************************** + answer.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for data management of forces, torques, energies, and virials + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include "lal_answer.h" + +using namespace LAMMPS_AL; +#define AnswerT Answer<numtyp,acctyp> + +template <class numtyp, class acctyp> +AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false), + _inum(0),_ilist(NULL),_newton(false) { +} + +template <class numtyp, class acctyp> +int AnswerT::bytes_per_atom() const { + int bytes=11*sizeof(acctyp); + if (_rot) + bytes+=4*sizeof(acctyp); + if (_charge) + bytes+=sizeof(acctyp); + return bytes; +} + +template <class numtyp, class acctyp> +bool AnswerT::alloc(const int inum) { + _max_local=static_cast<int>(static_cast<double>(inum)*1.10); + + bool success=true; + + int ans_elements=4; + if (_rot) + ans_elements+=4; + + // Ignore host/device transfers? 
+ bool cpuview=false; + if (dev->device_type()==UCL_CPU) + cpuview=true; + + // -------------------------- Host allocations + success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); + success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); + + // --------------------------- Device allocations + if (cpuview) { + dev_engv.view(host_engv); + dev_ans.view(host_ans); + } else { + success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (dev_ans.alloc(ans_elements*_max_local, + *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); + } + _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes(); + + _allocated=true; + return success; +} + +template <class numtyp, class acctyp> +bool AnswerT::init(const int inum, const bool charge, const bool rot, + UCL_Device &devi) { + clear(); + + bool success=true; + _charge=charge; + _rot=rot; + _other=_charge || _rot; + dev=&devi; + + _e_fields=1; + if (_charge) + _e_fields++; + _ev_fields=6+_e_fields; + + // Initialize atom and nbor data + int ef_inum=inum; + if (ef_inum==0) + ef_inum=1000; + + // Initialize timers for the selected device + time_answer.init(*dev); + time_answer.zero(); + _time_cast=0.0; + _time_cpu_idle=0.0; + + return success && alloc(ef_inum); +} + +template <class numtyp, class acctyp> +bool AnswerT::add_fields(const bool charge, const bool rot) { + bool realloc=false; + if (charge && _charge==false) { + _charge=true; + _e_fields++; + _ev_fields++; + realloc=true; + } + if (rot && _rot==false) { + _rot=true; + realloc=true; + } + if (realloc) { + _other=_charge || _rot; + int inum=_max_local; + clear_resize(); + return alloc(inum); + } + return true; +} + +template <class numtyp, class acctyp> +void AnswerT::clear_resize() { + if (!_allocated) + return; + _allocated=false; + + dev_ans.clear(); + dev_engv.clear(); + host_ans.clear(); + host_engv.clear(); +} + +template <class numtyp, class acctyp> +void AnswerT::clear() { + _gpu_bytes=0; + if (!_allocated) + return; + + time_answer.clear(); + clear_resize(); + _inum=0; + _ilist=NULL; + _eflag=false; + _vflag=false; +} + +template <class numtyp, class acctyp> +double AnswerT::host_memory_usage() const { + int atom_bytes=4; + if (_charge) + atom_bytes+=1; + if (_rot) + atom_bytes+=4; + int ans_bytes=atom_bytes+_ev_fields; + return ans_bytes*(_max_local)*sizeof(acctyp)+ + sizeof(Answer<numtyp,acctyp>); +} + +template <class numtyp, class acctyp> +void AnswerT::copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom) { + time_answer.start(); + _eflag=eflag; + _vflag=vflag; + _ef_atom=ef_atom; + _vf_atom=vf_atom; + + int csize=_ev_fields; + if (!eflag) + csize-=_e_fields; + if (!vflag) + csize-=6; + + if (csize>0) + ucl_copy(host_engv,dev_engv,_inum*csize,true); + if (_rot) + ucl_copy(host_ans,dev_ans,_inum*4*2,true); + else + ucl_copy(host_ans,dev_ans,_inum*4,true); + time_answer.stop(); +} + +template <class numtyp, class acctyp> +void AnswerT::copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom, + int *ilist) { + _ilist=ilist; + copy_answers(eflag,vflag,ef_atom,vf_atom); +} + +template <class numtyp, class acctyp> +double AnswerT::energy_virial(double *eatom, double **vatom, + double *virial) { + if (_eflag==false && _vflag==false) + return 0.0; + + double evdwl=0.0; + double virial_acc[6]; + for (int i=0; i<6; i++) virial_acc[i]=0.0; + if (_ilist==NULL) { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + if (_eflag) { + 
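+        // host_engv is laid out field by field with stride _inum: energy
+        // term(s) first, then the six virial components, so advancing ap
+        // by _inum steps to atom i's next field.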
if (_ef_atom) { + evdwl+=*ap; + eatom[i]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[i][j]+=*ap*0.5; + virial_acc[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial_acc[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]+=virial_acc[j]*0.5; + } else { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + int ii=_ilist[i]; + if (_eflag) { + if (_ef_atom) { + evdwl+=*ap; + eatom[ii]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[ii][j]+=*ap*0.5; + virial_acc[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial_acc[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]+=virial_acc[j]*0.5; + } + + evdwl*=0.5; + return evdwl; +} + +template <class numtyp, class acctyp> +double AnswerT::energy_virial(double *eatom, double **vatom, + double *virial, double &ecoul) { + if (_eflag==false && _vflag==false) + return 0.0; + + if (_charge==false) + return energy_virial(eatom,vatom,virial); + + double evdwl=0.0; + double _ecoul=0.0; + double virial_acc[6]; + for (int i=0; i<6; i++) virial_acc[i]=0.0; + if (_ilist==NULL) { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + if (_eflag) { + if (_ef_atom) { + evdwl+=*ap; + eatom[i]+=*ap*0.5; + ap+=_inum; + _ecoul+=*ap; + eatom[i]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + _ecoul+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[i][j]+=*ap*0.5; + virial_acc[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial_acc[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]+=virial_acc[j]*0.5; + } else { + for (int i=0; i<_inum; i++) { + acctyp *ap=host_engv.begin()+i; + int ii=_ilist[i]; + if (_eflag) { + if (_ef_atom) { + evdwl+=*ap; + eatom[ii]+=*ap*0.5; + ap+=_inum; + _ecoul+=*ap; + eatom[ii]+=*ap*0.5; + ap+=_inum; + } else { + evdwl+=*ap; + ap+=_inum; + _ecoul+=*ap; + ap+=_inum; + } + } + if (_vflag) { + if (_vf_atom) { + for (int j=0; j<6; j++) { + vatom[ii][j]+=*ap*0.5; + virial_acc[j]+=*ap; + ap+=_inum; + } + } else { + for (int j=0; j<6; j++) { + virial_acc[j]+=*ap; + ap+=_inum; + } + } + } + } + for (int j=0; j<6; j++) + virial[j]+=virial_acc[j]*0.5; + } + + evdwl*=0.5; + ecoul+=_ecoul*0.5; + return evdwl; +} + +template <class numtyp, class acctyp> +void AnswerT::get_answers(double **f, double **tor) { + acctyp *ap=host_ans.begin(); + if (_ilist==NULL) { + for (int i=0; i<_inum; i++) { + f[i][0]+=*ap; + ap++; + f[i][1]+=*ap; + ap++; + f[i][2]+=*ap; + ap+=2; + } + if (_rot) { + for (int i=0; i<_inum; i++) { + tor[i][0]+=*ap; + ap++; + tor[i][1]+=*ap; + ap++; + tor[i][2]+=*ap; + ap+=2; + } + } + } else { + for (int i=0; i<_inum; i++) { + int ii=_ilist[i]; + f[ii][0]+=*ap; + ap++; + f[ii][1]+=*ap; + ap++; + f[ii][2]+=*ap; + ap+=2; + } + if (_rot) { + for (int i=0; i<_inum; i++) { + int ii=_ilist[i]; + tor[ii][0]+=*ap; + ap++; + tor[ii][1]+=*ap; + ap++; + tor[ii][2]+=*ap; + ap+=2; + } + } + } +} + +template class Answer<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_answer.h b/lib/gpu/lal_answer.h new file mode 100644 index 000000000..721e16cdd --- /dev/null +++ b/lib/gpu/lal_answer.h @@ -0,0 +1,169 @@ +/*************************************************************************** + answer.h + ------------------- + W. 
Michael Brown (ORNL) + + Class for data management of forces, torques, energies, and virials + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_ANSWER_H +#define LAL_ANSWER_H + +#include <math.h> +#include "mpi.h" + +#ifdef USE_OPENCL + +#include "geryon/ocl_timer.h" +#include "geryon/ocl_mat.h" +using namespace ucl_opencl; + +#else + +#include "geryon/nvd_timer.h" +#include "geryon/nvd_mat.h" +using namespace ucl_cudadr; + +#endif + +#include "lal_precision.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class Answer { + public: + Answer(); + ~Answer() { clear(); } + + /// Current number of local atoms stored + inline int inum() const { return _inum; } + /// Set number of local atoms for future copy operations + inline void inum(const int n) { _inum=n; } + + /// Memory usage per atom in this class + int bytes_per_atom() const; + + /// Clear any previous data and set up for a new LAMMPS run + /** \param rot True if atom storage needs quaternions **/ + bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev); + + /// Check if we have enough device storage and realloc if not + inline void resize(const int inum, bool &success) { + _inum=inum; + if (inum>_max_local) { + clear_resize(); + success = success && alloc(inum); + } + } + + /// If already initialized by another LAMMPS style, add fields as necessary + /** \param rot True if atom storage needs quaternions **/ + bool add_fields(const bool charge, const bool rot); + + /// Free all memory on host and device needed to realloc for more atoms + void clear_resize(); + + /// Free all memory on host and device + void clear(); + + /// Return the total amount of host memory used by class in bytes + double host_memory_usage() const; + + /// Add copy times to timers + inline void acc_timers() { + time_answer.add_to_total(); + } + + /// Add copy times to timers + inline void zero_timers() { + time_answer.zero(); + } + + /// Return the total time for host/device data transfer + inline double transfer_time() { + return time_answer.total_seconds(); + } + + /// Return the total time for data cast/pack + inline double cast_time() { return _time_cast; } + + /// Return number of bytes used on device + inline double gpu_bytes() { return _gpu_bytes; } + + // -------------------------COPY FROM GPU ------------------------------- + + /// Copy answers from device into read buffer asynchronously + void copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom); + + /// Copy answers from device into read buffer asynchronously + void copy_answers(const bool eflag, const bool vflag, + const bool ef_atom, const bool vf_atom, int *ilist); + + /// Copy energy and virial data into LAMMPS memory + double energy_virial(double *eatom, double **vatom, double *virial); + + /// Copy energy and virial data into LAMMPS memory + double energy_virial(double *eatom, double **vatom, double *virial, + double &ecoul); + + /// Add forces and torques from the GPU into a LAMMPS pointer + void get_answers(double **f, double **tor); + + inline double get_answers(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + double ta=MPI_Wtime(); + time_answer.sync_stop(); + 
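+    // sync_stop() waits for the asynchronous device-to-host copies queued
+    // by copy_answers() to finish; the wait is booked as CPU idle time.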
_time_cpu_idle+=MPI_Wtime()-ta; + double ts=MPI_Wtime(); + double evdw=energy_virial(eatom,vatom,virial,ecoul); + get_answers(f,tor); + _time_cast+=MPI_Wtime()-ts; + return evdw; + } + + /// Return the time the CPU was idle waiting for GPU + inline double cpu_idle_time() { return _time_cpu_idle; } + + // ------------------------------ DATA ---------------------------------- + + /// Force and possibly torque + UCL_D_Vec<acctyp> dev_ans; + /// Energy and virial per-atom storage + UCL_D_Vec<acctyp> dev_engv; + + /// Force and possibly torque data on host + UCL_H_Vec<acctyp> host_ans; + /// Energy/virial data on host + UCL_H_Vec<acctyp> host_engv; + + /// Device timers + UCL_Timer time_answer; + + /// Geryon device + UCL_Device *dev; + + private: + bool alloc(const int inum); + + bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; + int _max_local, _inum, _e_fields, _ev_fields; + int *_ilist; + double _time_cast, _time_cpu_idle; + + double _gpu_bytes; + + bool _newton; +}; + +} + +#endif diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp new file mode 100644 index 000000000..7a0b2f755 --- /dev/null +++ b/lib/gpu/lal_atom.cpp @@ -0,0 +1,317 @@ +/*************************************************************************** + atom.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for particle data management + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include "lal_atom.h" + +using namespace LAMMPS_AL; +#define AtomT Atom<numtyp,acctyp> + +template <class numtyp, class acctyp> +AtomT::Atom() : _compiled(false),_allocated(false), + _max_gpu_bytes(0) { + #ifndef USE_OPENCL + sort_config.op = CUDPP_ADD; + sort_config.datatype = CUDPP_UINT; + sort_config.algorithm = CUDPP_SORT_RADIX; + sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS; + #endif +} + +template <class numtyp, class acctyp> +int AtomT::bytes_per_atom() const { + int id_space=0; + if (_gpu_nbor==1) + id_space=2; + else if (_gpu_nbor==2) + id_space=4; + int bytes=4*sizeof(numtyp)+id_space*sizeof(int); + if (_rot) + bytes+=4*sizeof(numtyp); + if (_charge) + bytes+=sizeof(numtyp); + return bytes; +} + +template <class numtyp, class acctyp> +bool AtomT::alloc(const int nall) { + _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10); + + bool success=true; + + // Ignore host/device transfers? 
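+  // As in Answer::alloc(), a CPU "device" gets view()s of the host buffers
+  // below rather than separate device allocations, so no transfers occur.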
+ bool cpuview=false; + if (dev->device_type()==UCL_CPU) + cpuview=true; + + // Allocate storage for CUDPP sort + #ifndef USE_OPENCL + if (_gpu_nbor==1) { + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + if (CUDPP_SUCCESS != result) + return false; + } + #endif + + // -------------------------- Host allocations + // Get a host write only buffer + #ifdef GPU_CAST + success=success && (host_x_cast.alloc(_max_atoms*3,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + success=success && (host_type_cast.alloc(_max_atoms,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + #else + success=success && (host_x.alloc(_max_atoms*4,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + #endif + // Buffer for casting only if different precisions + if (_charge) + success=success && (host_q.alloc(_max_atoms,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + // Buffer for casting only if different precisions + if (_rot) + success=success && (host_quat.alloc(_max_atoms*4,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + + + // --------------------------- Device allocations + int gpu_bytes=0; + if (cpuview) { + #ifdef GPU_CAST + assert(0==1); + #else + dev_x.view(host_x); + #endif + if (_rot) + dev_quat.view(host_quat); + if (_charge) + dev_q.view(host_q); + } else { + #ifdef GPU_CAST + success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev)); + success=success && (UCL_SUCCESS== + dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); + success=success && (UCL_SUCCESS== + dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); + gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); + #else + success=success && (UCL_SUCCESS== + dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); + #endif + if (_charge) { + success=success && (dev_q.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=dev_q.row_bytes(); + } + if (_rot) { + success=success && (dev_quat.alloc(_max_atoms*4,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=dev_quat.row_bytes(); + } + } + if (_gpu_nbor>0) { + success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + gpu_bytes+=dev_particle_id.row_bytes(); + if (_bonds) { + success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + gpu_bytes+=dev_tag.row_bytes(); + } + if (_gpu_nbor==1) { + success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + gpu_bytes+=dev_cell_id.row_bytes(); + } else { + success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && + (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); + } + } + + gpu_bytes+=dev_x.row_bytes(); + if (gpu_bytes>_max_gpu_bytes) + _max_gpu_bytes=gpu_bytes; + + _allocated=true; + return success; +} + +template <class numtyp, class acctyp> +bool AtomT::add_fields(const bool charge, const bool rot, + const int gpu_nbor, const bool bonds) { + bool realloc=false; + if (charge && _charge==false) { + _charge=true; + realloc=true; + } + if (rot && _rot==false) { + _rot=true; + realloc=true; + } + if (gpu_nbor>0 && _gpu_nbor==0) { + _gpu_nbor=gpu_nbor; + realloc=true; + } + if (bonds && _bonds==false) { + _bonds=true; + realloc=true; + } + if (realloc) { + _other=_charge || _rot; + int max_atoms=_max_atoms; + clear_resize(); + return alloc(max_atoms); + } + return true; +} + +template <class numtyp, class acctyp> +bool AtomT::init(const int nall, const bool charge, const bool rot, + UCL_Device &devi, const int gpu_nbor, const bool bonds) { + clear(); + + bool success=true; + _x_avail=false; + _q_avail=false; + _quat_avail=false; + _resized=false; 
+ _gpu_nbor=gpu_nbor; + _bonds=bonds; + _charge=charge; + _rot=rot; + _other=_charge || _rot; + dev=&devi; + + // Initialize atom and nbor data + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + // Initialize timers for the selected device + time_pos.init(*dev); + time_q.init(*dev); + time_quat.init(*dev); + time_pos.zero(); + time_q.zero(); + time_quat.zero(); + _time_cast=0.0; + + #ifdef GPU_CAST + compile_kernels(*dev); + #endif + + return success && alloc(ef_nall); +} + +template <class numtyp, class acctyp> +void AtomT::clear_resize() { + if (!_allocated) + return; + _allocated=false; + + dev_x.clear(); + if (_charge) { + dev_q.clear(); + host_q.clear(); + } + if (_rot) { + dev_quat.clear(); + host_quat.clear(); + } + #ifndef GPU_CAST + host_x.clear(); + #else + host_x_cast.clear(); + host_type_cast.clear(); + #endif + dev_cell_id.clear(); + dev_particle_id.clear(); + dev_tag.clear(); + #ifdef GPU_CAST + dev_x_cast.clear(); + dev_type_cast.clear(); + #endif + + #ifndef USE_OPENCL + if (_gpu_nbor==1) cudppDestroyPlan(sort_plan); + #endif + + if (_gpu_nbor==2) { + host_particle_id.clear(); + host_cell_id.clear(); + } +} + +template <class numtyp, class acctyp> +void AtomT::clear() { + _max_gpu_bytes=0; + if (!_allocated) + return; + + time_pos.clear(); + time_q.clear(); + time_quat.clear(); + clear_resize(); + + #ifdef GPU_CAST + if (_compiled) { + k_cast_x.clear(); + delete atom_program; + _compiled=false; + } + #endif +} + +template <class numtyp, class acctyp> +double AtomT::host_memory_usage() const { + int atom_bytes=4; + if (_charge) + atom_bytes+=1; + if (_rot) + atom_bytes+=4; + return _max_atoms*atom_bytes*sizeof(numtyp)+ + sizeof(Atom<numtyp,acctyp>); +} + +// Sort arrays for neighbor list calculation +template <class numtyp, class acctyp> +void AtomT::sort_neighbor(const int num_atoms) { + #ifndef USE_OPENCL + CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), + (int *)dev_particle_id.begin(), + 8*sizeof(unsigned), num_atoms); + if (CUDPP_SUCCESS != result) { + printf("Error in cudppSort\n"); + NVD_GERYON_EXIT; + } + #endif +} + +#ifdef GPU_CAST +#ifdef USE_OPENCL +#include "atom_cl.h" +#else +#include "atom_ptx.h" +#endif + +template <class numtyp, class acctyp> +void AtomT::compile_kernels(UCL_Device &dev) { + std::string flags = "-D"+std::string(OCL_VENDOR); + atom_program=new UCL_Program(dev); + atom_program->load_string(atom,flags); + k_cast_x.set_function(*atom_program,"kernel_cast_x"); + _compiled=true; +} + +#endif + +template class Atom<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu new file mode 100644 index 000000000..3446c1d4e --- /dev/null +++ b/lib/gpu/lal_atom.cu @@ -0,0 +1,33 @@ +// ************************************************************************** +// atom.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for atom data casting +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +#endif + +__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x, + __global int *type, const int nall) { + int ii=GLOBAL_ID_X; + + if (ii<nall) { + numtyp4 xt; + xt.w=type[ii]; + int i=ii*3; + xt.x=x[i]; + xt.y=x[i+1]; + xt.z=x[i+2]; + x_type[ii]=xt; + } // if ii +} diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h new file mode 100644 index 000000000..89dcd6765 --- /dev/null +++ b/lib/gpu/lal_atom.h @@ -0,0 +1,427 @@ +/*************************************************************************** + atom.h + ------------------- + W. Michael Brown (ORNL) + + Class for particle data management + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef PAIR_GPU_ATOM_H +#define PAIR_GPU_ATOM_H + +#include <math.h> +#include "mpi.h" + +#ifdef USE_OPENCL + +#include "geryon/ocl_timer.h" +#include "geryon/ocl_mat.h" +#include "geryon/ocl_kernel.h" +using namespace ucl_opencl; + +#else + +#include "cudpp.h" +#include "geryon/nvd_timer.h" +#include "geryon/nvd_mat.h" +#include "geryon/nvd_kernel.h" +using namespace ucl_cudadr; + +#endif + +#include "lal_precision.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class Atom { + public: + Atom(); + ~Atom() { clear(); } + + /// Maximum number of atoms that can be stored with current allocation + inline int max_atoms() const { return _max_atoms; } + /// Current number of local+ghost atoms stored + inline int nall() const { return _nall; } + + /// Set number of local+ghost atoms for future copy operations + inline void nall(const int n) { _nall=n; } + + /// Memory usage per atom in this class + int bytes_per_atom() const; + + /// Clear any previous data and set up for a new LAMMPS run + /** \param rot True if atom storage needs quaternions + * \param gpu_nbor 0 if neighboring will be performed on host + * gpu_nbor 1 if neighboring will be performed on device + * gpu_nbor 2 if binning on host and neighboring on device **/ + bool init(const int nall, const bool charge, const bool rot, + UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false); + + /// Check if we have enough device storage and realloc if not + /** Returns true if resized with any call during this timestep **/ + inline bool resize(const int nall, bool &success) { + _nall=nall; + if (nall>_max_atoms) { + clear_resize(); + success = success && alloc(nall); + _resized=true; + } + return _resized; + } + + /// If already initialized by another LAMMPS style, add fields as necessary + /** \param rot True if atom storage needs quaternions + * \param gpu_nbor 0 if neighboring will be performed on host + * gpu_nbor 1 if neighboring will be performed on device + * gpu_nbor 2 if binning on host and neighboring on device **/ + bool add_fields(const bool charge, const bool rot, const int gpu_nbor, + const bool 
bonds); + + /// Returns true if GPU is using charges + bool charge() { return _charge; } + + /// Returns true if GPU is using quaternions + bool quat() { return _rot; } + + /// Only free matrices of length inum or nall for resizing + void clear_resize(); + + /// Free all memory on host and device + void clear(); + + /// Return the total amount of host memory used by class in bytes + double host_memory_usage() const; + + /// Sort arrays for neighbor list calculation on device + void sort_neighbor(const int num_atoms); + + /// Add copy times to timers + inline void acc_timers() { + time_pos.add_to_total(); + if (_charge) + time_q.add_to_total(); + if (_rot) + time_quat.add_to_total(); + } + + /// Add copy times to timers + inline void zero_timers() { + time_pos.zero(); + if (_charge) + time_q.zero(); + if (_rot) + time_quat.zero(); + } + + /// Return the total time for host/device data transfer + /** Zeros the total so that the atom times are only included once **/ + inline double transfer_time() { + double total=time_pos.total_seconds(); + time_pos.zero_total(); + if (_charge) { + total+=time_q.total_seconds(); + time_q.zero_total(); + } + if (_rot) { + total+=time_q.total_seconds(); + time_quat.zero_total(); + } + + return total; + } + + /// Return the total time for data cast/pack + /** Zeros the time so that atom times are only included once **/ + inline double cast_time() + { double t=_time_cast; _time_cast=0.0; return t; } + + /// Pack LAMMPS atom type constants into matrix and copy to device + template <class dev_typ, class t1> + inline void type_pack1(const int n, const int m_size, + UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, + t1 **one) { + int ii=0; + for (int i=0; i<n; i++) { + for (int j=0; j<n; j++) { + buffer[ii]=static_cast<numtyp>(one[i][j]); + ii++; + } + ii+=m_size-n; + } + UCL_H_Vec<dev_typ> view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants into 2 vectors and copy to device + template <class dev_typ, class t1, class t2> + inline void type_pack2(const int n, const int m_size, + UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, + t1 **one, t2 **two) { + int ii=0; + for (int i=0; i<n; i++) { + for (int j=0; j<n; j++) { + buffer[ii*2]=static_cast<numtyp>(one[i][j]); + buffer[ii*2+1]=static_cast<numtyp>(two[i][j]); + ii++; + } + ii+=m_size-n; + } + UCL_H_Vec<dev_typ> view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device + template <class dev_typ, class t1, class t2, class t3> + inline void type_pack4(const int n, const int m_size, + UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, + t1 **one, t2 **two, t3 **three) { + int ii=0; + for (int i=0; i<n; i++) { + for (int j=0; j<n; j++) { + buffer[ii*4]=static_cast<numtyp>(one[i][j]); + buffer[ii*4+1]=static_cast<numtyp>(two[i][j]); + buffer[ii*4+2]=static_cast<numtyp>(three[i][j]); + ii++; + } + ii+=m_size-n; + } + UCL_H_Vec<dev_typ> view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device + template <class dev_typ, class t1, class t2, class t3, class t4> + inline void type_pack4(const int n, const int m_size, + UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, + t1 **one, t2 **two, t3 **three, t4 **four) { + int ii=0; + for (int i=0; i<n; i++) { + for (int j=0; j<n; j++) { + 
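+        // Pack the four coefficients for type pair (i,j) contiguously so a
+        // single vector load (e.g. a numtyp4) fetches all of them at once.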
buffer[ii*4]=static_cast<numtyp>(one[i][j]); + buffer[ii*4+1]=static_cast<numtyp>(two[i][j]); + buffer[ii*4+2]=static_cast<numtyp>(three[i][j]); + buffer[ii*4+3]=static_cast<numtyp>(four[i][j]); + ii++; + } + ii+=m_size-n; + } + UCL_H_Vec<dev_typ> view; + view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device + template <class dev_typ, class t1, class t2> + inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v, + UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) { + for (int i=0; i<n; i++) { + buffer[i*2]=static_cast<numtyp>(one[i][i]); + buffer[i*2+1]=static_cast<numtyp>(two[i][i]); + } + UCL_H_Vec<dev_typ> view; + view.view((dev_typ*)buffer.begin(),n,*dev); + ucl_copy(dev_v,view,false); + } + + // -------------------------COPY TO GPU ---------------------------------- + + /// Signal that we need to transfer atom data for next timestep + inline void data_unavail() + { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; } + + /// Cast positions and types to write buffer + inline void cast_x_data(double **host_ptr, const int *host_type) { + if (_x_avail==false) { + double t=MPI_Wtime(); + #ifdef GPU_CAST + memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); + #else + numtyp *_write_loc=host_x.begin(); + for (int i=0; i<_nall; i++) { + *_write_loc=host_ptr[i][0]; + _write_loc++; + *_write_loc=host_ptr[i][1]; + _write_loc++; + *_write_loc=host_ptr[i][2]; + _write_loc++; + *_write_loc=host_type[i]; + _write_loc++; + } + #endif + _time_cast+=MPI_Wtime()-t; + } + } + + /// Copy positions and types to device asynchronously + /** Copies nall() elements **/ + inline void add_x_data(double **host_ptr, int *host_type) { + time_pos.start(); + if (_x_avail==false) { + #ifdef GPU_CAST + ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); + ucl_copy(dev_type_cast,host_type_cast,_nall,true); + int block_size=64; + int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size)); + k_cast_x.set_size(GX,block_size); + k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), + &_nall); + #else + ucl_copy(dev_x,host_x,_nall*4,true); + #endif + _x_avail=true; + } + time_pos.stop(); + } + + /// Calls cast_x_data and add_x_data and times the routines + inline void cast_copy_x(double **host_ptr, int *host_type) { + cast_x_data(host_ptr,host_type); + add_x_data(host_ptr,host_type); + } + + // Cast charges to write buffer + template<class cpytyp> + inline void cast_q_data(cpytyp *host_ptr) { + if (_q_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_q.view((numtyp*)host_ptr,_nall,*dev); + dev_q.view(host_q); + } else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); + else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; + } + } + + // Copy charges to device asynchronously + inline void add_q_data() { + if (_q_avail==false) { + ucl_copy(dev_q,host_q,_nall,true); + _q_avail=true; + } + } + + // Cast quaternions to write buffer + template<class cpytyp> + inline void cast_quat_data(cpytyp *host_ptr) { + if (_quat_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_quat.view((numtyp*)host_ptr,_nall*4,*dev); + 
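+          // CPU device with matching precision: alias the LAMMPS quaternion
+          // array directly instead of casting it element by element.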
dev_quat.view(host_quat); + } else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); + else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; + } + } + + // Copy quaternions to device + /** Copies nall()*4 elements **/ + inline void add_quat_data() { + if (_quat_avail==false) { + ucl_copy(dev_quat,host_quat,_nall*4,true); + _quat_avail=true; + } + } + + /// Return number of bytes used on device + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } + + // ------------------------------ DATA ---------------------------------- + + /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type + UCL_D_Vec<numtyp> dev_x; + /// Charges + UCL_D_Vec<numtyp> dev_q; + /// Quaterions + UCL_D_Vec<numtyp> dev_quat; + + #ifdef GPU_CAST + UCL_D_Vec<double> dev_x_cast; + UCL_D_Vec<int> dev_type_cast; + UCL_H_Vec<double> host_x_cast; + UCL_H_Vec<int> host_type_cast; + #endif + + /// Buffer for moving positions to device + UCL_H_Vec<numtyp> host_x; + /// Buffer for moving charge data to GPU + UCL_H_Vec<numtyp> host_q; + /// Buffer for moving quat data to GPU + UCL_H_Vec<numtyp> host_quat; + + /// Cell list identifiers for device nbor builds + UCL_D_Vec<unsigned> dev_cell_id; + /// Cell list identifiers for device nbor builds + UCL_D_Vec<int> dev_particle_id; + /// Atom tag information for device nbor builds + UCL_D_Vec<int> dev_tag; + + /// Cell list identifiers for hybrid nbor builds + UCL_H_Vec<int> host_cell_id; + /// Cell list identifiers for hybrid nbor builds + UCL_H_Vec<int> host_particle_id; + + /// Device timers + UCL_Timer time_pos, time_q, time_quat; + + /// Geryon device + UCL_Device *dev; + + private: + #ifdef GPU_CAST + UCL_Program *atom_program; + UCL_Kernel k_cast_x; + void compile_kernels(UCL_Device &dev); + #endif + + bool _compiled; + + // True if data has been copied to device already + bool _x_avail, _q_avail, _quat_avail, _resized; + + bool alloc(const int nall); + + bool _allocated, _rot, _charge, _other; + int _max_atoms, _nall, _gpu_nbor; + bool _bonds; + double _time_cast; + + double _max_gpu_bytes; + + #ifndef USE_OPENCL + CUDPPConfiguration sort_config; + CUDPPHandle sort_plan; + #endif +}; + +} + +#endif diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h new file mode 100644 index 000000000..7eb7a3426 --- /dev/null +++ b/lib/gpu/lal_aux_fun1.h @@ -0,0 +1,139 @@ +// ************************************************************************** +// aux_fun1.h +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for pair style auxiliary functions +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : Sat Oct 22 2011 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +#endif + +#define atom_info(t_per_atom, ii, tid, offset) \ + tid=THREAD_ID_X; \ + offset=tid & (t_per_atom-1); \ + ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; + +#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, list_end, nbor) \ + nbor=nbor_mem+ii; \ + i=*nbor; \ + nbor+=nbor_stride; \ + numj=*nbor; \ + if (nbor_mem==packed_mem) { \ + nbor+=nbor_stride+fast_mul(ii,t_per_atom-1); \ + stride=fast_mul(t_per_atom,nbor_stride); \ + list_end=nbor+fast_mul(numj/t_per_atom,stride)+ (numj & (t_per_atom-1)); \ + nbor+=offset; \ + } else { \ + nbor+=nbor_stride; \ + nbor=packed_mem+*nbor; \ + list_end=nbor+numj; \ + stride=t_per_atom; \ + nbor+=offset; \ + } + +#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ + eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[6][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<4; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + energy=red_acc[3][tid]; \ + if (vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + +#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[6][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=energy; \ + red_acc[4][tid]=e_coul; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<5; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + energy=red_acc[3][tid]; \ + e_coul=red_acc[4][tid]; \ + if (vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + diff --git a/lib/gpu/lal_balance.h b/lib/gpu/lal_balance.h new file mode 100644 index 000000000..cf09cf86f 
--- /dev/null +++ b/lib/gpu/lal_balance.h @@ -0,0 +1,207 @@ +/*************************************************************************** + balance.h + ------------------- + W. Michael Brown (ORNL) + + Class for host-device load balancing + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_BALANCE_H +#define LAL_BALANCE_H + +#include "lal_device.h" +#include <math.h> + +#define _HD_BALANCE_EVERY 25 +#define _HD_BALANCE_WEIGHT 0.5 +#define _HD_BALANCE_GAP 1.10 + +namespace LAMMPS_AL { + +/// Host/device load balancer +template<class numtyp, class acctyp> +class Balance { + public: + inline Balance() : _init_done(false), _measure_this_step(false) {} + inline ~Balance() { clear(); } + + /// Clear any old data and setup for new LAMMPS run + inline void init(Device<numtyp, acctyp> *gpu, const int gpu_nbor, + const double split); + + /// Clear all host and device data + inline void clear() { + if (_init_done) { + _device_time.clear(); + _measure_this_step=false; + _init_done=false; + } + } + + /// Return the timestep since initialization + inline int timestep() { return _timestep; } + + /// Get a count of the number of particles host will handle for initial alloc + inline int first_host_count(const int nlocal, const double gpu_split, + const int gpu_nbor) const { + int host_nlocal=0; + if (gpu_nbor>0 && gpu_split!=1.0) { + if (gpu_split>0) + host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal)); + else + host_nlocal=static_cast<int>(ceil(0.05*nlocal)); + } + return host_nlocal; + } + + /// Return the number of particles the device will handle this timestep + inline int get_gpu_count(const int ago, const int inum_full); + + /// Return the average fraction of particles handled by device on all procs + inline double all_avg_split() { + if (_load_balance) { + double _all_avg_split=0.0; + MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0, + _device->replica()); + _all_avg_split/=_device->replica_size(); + return _all_avg_split/_avg_count; + } else + return _actual_split; + } + + /// If CPU neighboring, allow the device fraction to increase on 2nd timestep + inline int ago_first(int ago) const + { if (_avg_count==1 && _actual_split<_desired_split) ago=0; return ago; } + + /// Start the timer for asynchronous device execution + inline void start_timer() { + if (_measure_this_step) { + _device->gpu->sync(); + _device->gpu_barrier(); + _device->start_host_timer(); + _device_time.start(); + _device->gpu->sync(); + _device->gpu_barrier(); + } + } + + /// Stop the timer for asynchronous device execution + inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } } + + /// Calculate the new host/device split based on the cpu and device times + /** \note Only does calculation every _HD_BALANCE_EVERY timesteps + (and first 10) **/ + inline void balance(const double cpu_time); + + /// Calls balance() and then get_gpu_count() + inline int balance(const int ago,const int inum_full,const double cpu_time) { + balance(cpu_time); + return get_gpu_count(ago,inum_full); + } + + private: + Device<numtyp,acctyp> *_device; + UCL_Timer _device_time; + bool _init_done; + int _gpu_nbor; + + bool _load_balance; + double _actual_split, _avg_split, _desired_split, _max_split; + int _avg_count; + + 
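+  // _measure_this_step is true only on steps where the device time is
+  // actually sampled; _inum and _inum_full hold the device and total atom
+  // counts used for the most recent split calculation.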
bool _measure_this_step; + int _inum, _inum_full, _timestep; +}; + +#define BalanceT Balance<numtyp,acctyp> + +template <class numtyp, class acctyp> +void BalanceT::init(Device<numtyp, acctyp> *gpu, + const int gpu_nbor, const double split) { + clear(); + _gpu_nbor=gpu_nbor; + _init_done=true; + + _device=gpu; + _device_time.init(*gpu->gpu); + + if (split<0.0) { + _load_balance=true; + _desired_split=0.90; + } else { + _load_balance=false; + _desired_split=split; + } + _actual_split=_desired_split; + _avg_split=0.0; + _avg_count=0; + _timestep=0; +} + +template <class numtyp, class acctyp> +int BalanceT::get_gpu_count(const int ago, const int inum_full) { + _measure_this_step=false; + if (_load_balance) { + if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) { + _measure_this_step=true; + _inum_full=inum_full; + } + if (ago==0) { + _actual_split=_desired_split; + _max_split=_desired_split; + } + } + _inum=static_cast<int>(floor(_actual_split*inum_full)); + if (_inum==0) _inum++; + _timestep++; + return _inum; +} + +template <class numtyp, class acctyp> +void BalanceT::balance(const double cpu_time) { + if (_measure_this_step) { + _measure_this_step=false; + double gpu_time=_device_time.seconds(); + + double max_gpu_time; + MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX, + _device->gpu_comm()); + + if (_inum_full==_inum) { + _desired_split=1.0; + return; + } + + double cpu_time_per_atom=cpu_time/(_inum_full-_inum); + double cpu_other_time=_device->host_time()-cpu_time; + int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/ + cpu_time_per_atom); + + double split=static_cast<double>(_inum_full-host_inum)/_inum_full; + _desired_split=split*_HD_BALANCE_GAP; + if (_desired_split>1.0) + _desired_split=1.0; + if (_desired_split<0.0) + _desired_split=0.0; + + if (_gpu_nbor==0) { + if (_desired_split<_max_split) + _actual_split=_desired_split; + else + _actual_split=_max_split; + } + } + _avg_split+=_desired_split; + _avg_count++; +} + +} + +#endif diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp new file mode 100644 index 000000000..553352e84 --- /dev/null +++ b/lib/gpu/lal_base_atomic.cpp @@ -0,0 +1,287 @@ +/*************************************************************************** + base_atomic.cpp + ------------------- + W. 
Michael Brown (ORNL) + + Base class for pair styles with per-particle data for position and type + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include "lal_base_atomic.h" +using namespace LAMMPS_AL; +#define BaseAtomicT BaseAtomic<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> global_device; + +template <class numtyp, class acctyp> +BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer<numtyp,acctyp>(); + nbor=new Neighbor(); +} + +template <class numtyp, class acctyp> +BaseAtomicT::~BaseAtomic() { + delete ans; + delete nbor; +} + +template <class numtyp, class acctyp> +int BaseAtomicT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template <class numtyp, class acctyp> +int BaseAtomicT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_atom(); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,false, + _threads_per_atom); + if (success!=0) + return success; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + compile_kernels(*ucl_device,pair_program); + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->dev_x,4); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return 0; +} + +template <class numtyp, class acctyp> +void BaseAtomicT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +} + +template <class numtyp, class acctyp> +void BaseAtomicT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + if (_compiled) { + k_pair_fast.clear(); + k_pair.clear(); + delete pair_program; + _compiled=false; + } + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); + device->clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int * BaseAtomicT::reset_nbors(const int nall, const int 
inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return false; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, int *tag, + int **nspecial, int **special, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, + nspecial, special, success, mn); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void BaseAtomicT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success) { + acc_timers(); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int ** BaseAtomicT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success) { + acc_timers(); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return NULL; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if 
(ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return NULL; + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +template <class numtyp, class acctyp> +double BaseAtomicT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseAtomic<numtyp,acctyp>); +} + +template <class numtyp, class acctyp> +void BaseAtomicT::compile_kernels(UCL_Device &dev, const char *pair_str) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" -D"+ + std::string(OCL_VENDOR); + + pair_program=new UCL_Program(dev); + pair_program->load_string(pair_str,flags.c_str()); + k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); + k_pair.set_function(*pair_program,"kernel_pair"); + pos_tex.get_texture(*pair_program,"pos_tex"); + + _compiled=true; +} + +template class BaseAtomic<PRECISION,ACC_PRECISION>; + diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h new file mode 100644 index 000000000..7e9a91138 --- /dev/null +++ b/lib/gpu/lal_base_atomic.h @@ -0,0 +1,198 @@ +/*************************************************************************** + base_atomic.h + ------------------- + W. Michael Brown (ORNL) + + Base class for pair styles with per-particle data for position and type + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_BASE_ATOMIC_H +#define LAL_BASE_ATOMIC_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class BaseAtomic { + public: + BaseAtomic(); + virtual ~BaseAtomic(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const char *pair_program); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) + pos_tex.bind_float(atom->dev_x,4); + ans->resize(inum,success); + } + + 
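  /* A minimal usage sketch (illustrative only, not part of the commit):
     "MyStyle" stands for a hypothetical derived pair style that implements
     loop() -- the CGCMM class added later in this patch is a concrete case,
     and it passes its kernel string (cg_cmm) as the pair_program argument.
     Host code drives BaseAtomic roughly as follows:

       MyStyle<PRECISION,ACC_PRECISION> style;
       int err=style.init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                                 gpu_split,screen,kernel_source);
       if (err!=0) return err;      // see the return codes documented above

       // each timestep, with host (CPU) neighboring:
       style.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                     firstneigh,eflag,vflag,eatom,vatom,host_start,
                     cpu_time,success);

       style.clear_atomic();        // at the end of the run
  */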
/// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(); + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success); + + /// Pair loop with device neighboring + int * compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); + + /// Pair loop with device neighboring + int ** compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device<numtyp,acctyp> *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance<numtyp,acctyp> hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA 
-------------------------- + + /// Atom Data + Atom<numtyp,acctyp> *atom; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer<numtyp,acctyp> *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_pair_fast, k_pair; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + + protected: + bool _compiled; + int _block_size, _threads_per_atom; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec<int> *_nbor_data; + + void compile_kernels(UCL_Device &dev, const char *pair_string); + + virtual void loop(const bool _eflag, const bool _vflag) = 0; +}; + +} + +#endif diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp new file mode 100644 index 000000000..3ac63666b --- /dev/null +++ b/lib/gpu/lal_base_charge.cpp @@ -0,0 +1,304 @@ +/*************************************************************************** + base_charge.cpp + ------------------- + W. Michael Brown (ORNL) + + Base class for pair styles needing per-particle data for position, + charge, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include "lal_base_charge.h" +using namespace LAMMPS_AL; +#define BaseChargeT BaseCharge<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> global_device; + +template <class numtyp, class acctyp> +BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer<numtyp,acctyp>(); + nbor=new Neighbor(); +} + +template <class numtyp, class acctyp> +BaseChargeT::~BaseCharge() { + delete ans; + delete nbor; +} + +template <class numtyp, class acctyp> +int BaseChargeT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template <class numtyp, class acctyp> +int BaseChargeT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,false, + _threads_per_atom); + if (success!=0) + return success; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); + compile_kernels(*ucl_device,pair_program); + + // Initialize host-device load balancer + 
hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return success; +} + +template <class numtyp, class acctyp> +void BaseChargeT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +} + +template <class numtyp, class acctyp> +void BaseChargeT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + if (_compiled) { + k_pair_fast.clear(); + k_pair.clear(); + delete pair_program; + _compiled=false; + } + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); + device->clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int * BaseChargeT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return false; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, int *tag, + int **nspecial, int **special, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, + nspecial, special, success, mn); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. 
+// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void BaseChargeT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + acc_timers(); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_q_data(); + + device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, + boxlo, prd); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int** BaseChargeT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd) { + acc_timers(); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return NULL; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return NULL; + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +template <class numtyp, class acctyp> +double BaseChargeT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseCharge<numtyp,acctyp>); +} + +template <class numtyp, class acctyp> +void BaseChargeT::compile_kernels(UCL_Device &dev, const char *pair_str) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" -D"+ + 
std::string(OCL_VENDOR); + + pair_program=new UCL_Program(dev); + pair_program->load_string(pair_str,flags.c_str()); + k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); + k_pair.set_function(*pair_program,"kernel_pair"); + pos_tex.get_texture(*pair_program,"pos_tex"); + q_tex.get_texture(*pair_program,"q_tex"); + + _compiled=true; +} + +template class BaseCharge<PRECISION,ACC_PRECISION>; + diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h new file mode 100644 index 000000000..a0a42be67 --- /dev/null +++ b/lib/gpu/lal_base_charge.h @@ -0,0 +1,197 @@ +/*************************************************************************** + base_charge.h + ------------------- + W. Michael Brown (ORNL) + + Base class for pair styles needing per-particle data for position, + charge, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_BASE_CHARGE_H +#define LAL_BASE_CHARGE_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class BaseCharge { + public: + BaseCharge(); + virtual ~BaseCharge(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const char *pair_program); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int 
host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(); + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device<numtyp,acctyp> *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance<numtyp,acctyp> hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom<numtyp,acctyp> *atom; + + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer<numtyp,acctyp> *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_pair_fast, k_pair; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled; + int _block_size, _block_bio_size, _threads_per_atom; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec<int> *_nbor_data; + + void compile_kernels(UCL_Device &dev, const char *pair_string); + + virtual void loop(const bool _eflag, const bool _vflag) = 0; +}; + +} + +#endif diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp new file mode 100644 index 000000000..dce3d358c --- /dev/null +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -0,0 +1,469 @@ 
+/*************************************************************************** + base_ellipsoid.cpp + ------------------- + W. Michael Brown (ORNL) + + Base class for acceleration of ellipsoid potentials + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Thu May 5 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#include "lal_base_ellipsoid.h" +using namespace LAMMPS_AL; + +#ifdef USE_OPENCL +#include "ellipsoid_nbor_cl.h" +#else +#include "ellipsoid_nbor_ptx.h" +#endif + +#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp> +extern Device<PRECISION,ACC_PRECISION> global_device; + +template <class numtyp, class acctyp> +BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer<numtyp,acctyp>(); + nbor=new Neighbor(); +} + +template <class numtyp, class acctyp> +BaseEllipsoidT::~BaseEllipsoid() { + delete ans; + delete nbor; +} + +template <class numtyp, class acctyp> +int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template <class numtyp, class acctyp> +int BaseEllipsoidT::init_base(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const int ntypes, int **h_form, + const char *ellipsoid_program, + const char *lj_program, const bool ellip_sphere) { + screen=_screen; + _ellipsoid_sphere=ellip_sphere; + + int gpu_nbor=0; + if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_atom(); + + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,true, + 1); + if (success!=0) + return success; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere); + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_lj.init(*ucl_device); + time_nbor1.init(*ucl_device); + time_ellipsoid.init(*ucl_device); + time_nbor2.init(*ucl_device); + time_ellipsoid2.init(*ucl_device); + time_nbor3.init(*ucl_device); + time_ellipsoid3.init(*ucl_device); + time_lj.zero(); + time_nbor1.zero(); + time_ellipsoid.zero(); + time_nbor2.zero(); + time_ellipsoid2.zero(); + time_nbor3.zero(); + time_ellipsoid3.zero(); + + // See if we want fast GB-sphere or sphere-sphere calculations + _host_form=h_form; + _multiple_forms=false; + for (int i=1; i<ntypes; i++) + for (int j=i; j<ntypes; j++) + if (_host_form[i][j]!=ELLIPSE_ELLIPSE) + _multiple_forms=true; + if (_multiple_forms && host_nlocal>0) { + std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n"; + exit(1); + } + + if (_multiple_forms) + ans->dev_ans.zero(); + + // Memory for ilist ordered by particle type + if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS) + return 0; + else return -3; + + 
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return 0; +} + +template <class numtyp, class acctyp> +void BaseEllipsoidT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead); +} + +template <class numtyp, class acctyp> +void BaseEllipsoidT::clear_base() { + // Output any timing information + output_times(); + host_olist.clear(); + + if (_compiled) { + k_nbor_fast.clear(); + k_nbor.clear(); + k_ellipsoid.clear(); + k_ellipsoid_sphere.clear(); + k_sphere_ellipsoid.clear(); + k_lj_fast.clear(); + k_lj.clear(); + delete nbor_program; + delete ellipsoid_program; + delete lj_program; + _compiled=false; + } + + time_nbor1.clear(); + time_ellipsoid.clear(); + time_nbor2.clear(); + time_ellipsoid2.clear(); + time_nbor3.clear(); + time_ellipsoid3.clear(); + time_lj.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); + device->clear(); +} + +template <class numtyp, class acctyp> +void BaseEllipsoidT::output_times() { + // Output any timing information + acc_timers(); + double single[10], times[10]; + + single[0]=atom->transfer_time()+ans->transfer_time(); + single[1]=nbor->time_nbor.total_seconds()+nbor->time_hybrid1.total_seconds()+ + nbor->time_hybrid2.total_seconds(); + single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+ + time_nbor3.total_seconds()+nbor->time_nbor.total_seconds(); + single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+ + time_ellipsoid3.total_seconds(); + if (_multiple_forms) + single[4]=time_lj.total_seconds(); + else + single[4]=0; + single[5]=atom->cast_time()+ans->cast_time(); + single[6]=_gpu_overhead; + single[7]=_driver_overhead; + single[8]=ans->cpu_idle_time(); + single[9]=nbor->bin_time(); + + MPI_Reduce(single,times,10,MPI_DOUBLE,MPI_SUM,0,device->replica()); + double avg_split=hd_balancer.all_avg_split(); + + _max_bytes+=atom->max_gpu_bytes(); + double mpi_max_bytes; + MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, + device->replica()); + double max_mb=mpi_max_bytes/(1024*1024); + + if (device->replica_me()==0) + if (screen && times[5]>0.0) { + int replica_size=device->replica_size(); + + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (device->procs_per_gpu()==1) { + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); + if (nbor->gpu_nbor()>0) + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); + else + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); + fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); + } + if (nbor->gpu_nbor()==2) + fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size); + fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); + fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); + fprintf(screen,"-------------------------------------"); + 
fprintf(screen,"--------------------------------\n\n"); + } + _max_bytes=0.0; +} + +// --------------------------------------------------------------------------- +// Pack neighbors to limit thread divergence for lj-lj and ellipse +// --------------------------------------------------------------------------- +template<class numtyp, class acctyp> +void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, + const int inum, const int form_low, + const int form_high, const bool shared_types, + int ntypes) { + int stride=nbor->nbor_pitch(); + if (shared_types) { + k_nbor_fast.set_size(GX,BX); + k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(), + &nbor->dev_nbor.begin(), &stride, &start, &inum, + &nbor->dev_packed.begin(), &form_low, &form_high); + } else { + k_nbor.set_size(GX,BX); + k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes, + &nbor->dev_nbor.begin(), &stride, &start, &inum, + &nbor->dev_packed.begin(), &form_low, &form_high); + } +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void BaseEllipsoidT::reset_nbors(const int nall, const int inum, + const int osize, int *ilist, + int *numj, int *type, int **firstneigh, + bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(osize,numj,ilist); + resize_atom(nall,success); + resize_local(inum,0,mn,osize,success); + if (!success) + return; + + if (_multiple_forms) { + int p=0; + for (int i=0; i<osize; i++) { + int itype=type[ilist[i]]; + if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) { + host_olist[p]=ilist[i]; + p++; + } + } + _max_last_ellipse=p; + _last_ellipse=std::min(inum,_max_last_ellipse); + for (int i=0; i<osize; i++) { + int itype=type[ilist[i]]; + if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) { + host_olist[p]=ilist[i]; + p++; + } + } + nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size()); + nbor->copy_unpacked(inum,mn); + return; + } + _last_ellipse=inum; + _max_last_ellipse=inum; + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + nbor->copy_unpacked(inum,mn); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, int *tag, + int **nspecial, int **special, + bool &success) { + success=true; + resize_atom(nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),0,success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, + nspecial, special, success, mn); + nbor->copy_unpacked(inum,mn); + _last_ellipse=inum; + _max_last_ellipse=inum; + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. 
+// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double **host_quat) { + acc_timers(); + if (inum_full==0) { + host_start=0; + zero_timers(); + return NULL; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + _last_ellipse=std::min(inum,_max_last_ellipse); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh, + success); + if (!success) + return NULL; + } + int *list; + if (_multiple_forms) + list=host_olist.begin(); + else + list=ilist; + + atom->cast_x_data(host_x,host_type); + atom->cast_quat_data(host_quat[0]); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_quat_data(); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom,list); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + return list; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double **host_quat) { + acc_timers(); + if (inum_full==0) { + host_start=0; + zero_timers(); + return NULL; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + _last_ellipse=std::min(inum,_max_last_ellipse); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return NULL; + atom->cast_quat_data(host_quat[0]); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_quat_data(host_quat[0]); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + + atom->add_quat_data(); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + return nbor->host_jlist.begin()-host_start; +} + +template <class numtyp, class acctyp> +double BaseEllipsoidT::host_memory_usage_base() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>); +} + +template <class numtyp, class acctyp> +void BaseEllipsoidT::compile_kernels(UCL_Device &dev, + const char *ellipsoid_string, + const char *lj_string, const bool e_s) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" -D"+ + std::string(OCL_VENDOR); + + nbor_program=new UCL_Program(dev); + nbor_program->load_string(ellipsoid_nbor,flags.c_str()); + 
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); + k_nbor.set_function(*nbor_program,"kernel_nbor"); + + ellipsoid_program=new UCL_Program(dev); + ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); + k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid"); + + lj_program=new UCL_Program(dev); + lj_program->load_string(lj_string,flags.c_str()); + k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid"); + k_lj_fast.set_function(*lj_program,"kernel_lj_fast"); + k_lj.set_function(*lj_program,"kernel_lj"); + if (e_s) + k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere"); + + _compiled=true; +} + +template class BaseEllipsoid<PRECISION,ACC_PRECISION>; + diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h new file mode 100644 index 000000000..7ccf5691d --- /dev/null +++ b/lib/gpu/lal_base_ellipsoid.h @@ -0,0 +1,248 @@ +/*************************************************************************** + base_ellipsoid.h + ------------------- + W. Michael Brown (ORNL) + + Base class for acceleration of ellipsoid potentials + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Thu May 5 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_BASE_ELLIPSOID_H +#define LAL_BASE_ELLIPSOID_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class BaseEllipsoid { + public: + BaseEllipsoid(); + virtual ~BaseEllipsoid(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_base(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const int ntypes, + int **h_form, const char *ellipsoid_program, + const char *lj_program, const bool ellipsoid_sphere=false); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int nall, bool &success) { + atom->resize(nall, success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \param olist_size size of list of particles from CPU neighboring + * \note host_inum is 0 if the host is performing neighboring + * \note if GPU is neighboring nlocal+host_inum=total number local particles + * \note if CPU is neighboring olist_size=total number of local particles + * 
\note if GPU is neighboring olist_size=0 **/ + inline void resize_local(const int nlocal, const int host_inum, + const int max_nbors, const int olist_size, + bool &success) { + ans->resize(nlocal, success); + if (_multiple_forms) ans->dev_ans.zero(); + + if (olist_size>static_cast<int>(host_olist.numel())) { + host_olist.clear(); + int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10); + success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); + } + + nbor->resize(nlocal,host_inum,max_nbors,success); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_bytes) + _max_bytes=bytes; + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_base(); + + /// Output any timing information + void output_times(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_base() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(); + time_nbor1.add_to_total(); + time_ellipsoid.add_to_total(); + if (_multiple_forms) { + time_nbor2.add_to_total(); + time_ellipsoid2.add_to_total(); + if (_ellipsoid_sphere) { + time_nbor3.add_to_total(); + time_ellipsoid3.add_to_total(); + } + time_lj.add_to_total(); + } + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_nbor1.zero(); + time_ellipsoid.zero(); + if (_multiple_forms) { + time_nbor2.zero(); + time_ellipsoid2.zero(); + if (_ellipsoid_sphere) { + time_nbor3.zero(); + time_ellipsoid3.zero(); + } + time_lj.zero(); + } + atom->zero_timers(); + ans->zero_timers(); + } + + /// Pack neighbors to limit thread divergence for lj-lj and ellipse + void pack_nbors(const int GX, const int BX, const int start, const int inum, + const int form_low, const int form_high, + const bool shared_types, int ntypes); + + /// Copy neighbor list from host + void reset_nbors(const int nall, const int inum, const int osize, int *ilist, + int *numj, int *type, int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, bool &success); + + /// Pair loop with host neighboring + int* compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **quat); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **host_quat); + + /// Build neighbor list on accelerator + void build_nbor_list(const int inum, const int host_inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, bool &success); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device<numtyp,acctyp> *device; + + /// Geryon device + UCL_Device 
*ucl_device; + + /// Device Timers + UCL_Timer time_nbor1, time_ellipsoid, time_nbor2, time_ellipsoid2, time_lj; + UCL_Timer time_nbor3, time_ellipsoid3; + + /// Host device load balancer + Balance<numtyp,acctyp> hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom<numtyp,acctyp> *atom; + + // --------------------------- TYPE DATA -------------------------- + + /// cut_form.x = cutsq, cut_form.y = form + UCL_D_Vec<numtyp2> cut_form; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer<numtyp,acctyp> *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + /// ilist with particles sorted by type + UCL_H_Vec<int> host_olist; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *nbor_program, *ellipsoid_program, *lj_program; + UCL_Kernel k_nbor_fast, k_nbor; + UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid; + UCL_Kernel k_lj_fast, k_lj; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled, _ellipsoid_sphere; + int _block_size, _threads_per_atom; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec<int> *_nbor_data; + + // True if we want to use fast GB-sphere or sphere-sphere calculations + bool _multiple_forms; + int **_host_form; + int _last_ellipse, _max_last_ellipse; + + void compile_kernels(UCL_Device &dev, const char *ellipsoid_string, + const char *lj_string, const bool e_s); + + virtual void loop(const bool _eflag, const bool _vflag) = 0; +}; + +} + +#endif + diff --git a/lib/gpu/lal_cg_cmm.cpp b/lib/gpu/lal_cg_cmm.cpp new file mode 100644 index 000000000..0742bc823 --- /dev/null +++ b/lib/gpu/lal_cg_cmm.cpp @@ -0,0 +1,154 @@ +/*************************************************************************** + cg_cmm.cpp + ------------------- + W. 
Michael Brown (ORNL) + + Class for acceleration of the cg/cmm/cut pair style + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "cg_cmm_cl.h" +#else +#include "cg_cmm_ptx.h" +#endif + +#include "lal_cg_cmm.h" +#include <cassert> +using namespace LAMMPS_AL; +#define CGCMMT CGCMM<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +CGCMMT::CGCMM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { +} + +template <class numtyp, class acctyp> +CGCMMT::~CGCMM() { + clear(); +} + +template <class numtyp, class acctyp> +int CGCMMT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int CGCMMT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cg_cmm); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int cmm_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) { + cmm_types=max_shared_types; + shared_types=true; + } + _cmm_types=cmm_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<cmm_types*cmm_types; i++) + host_write[i]=0.0; + + lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, + host_cg_type,host_lj1,host_lj2); + + lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + UCL_H_Vec<double> dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void CGCMMT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double CGCMMT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CGCMM<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void CGCMMT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + 
else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CGCMM<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu new file mode 100644 index 000000000..90c7376be --- /dev/null +++ b/lib/gpu/lal_cg_cmm.cu @@ -0,0 +1,205 @@ +// ************************************************************************** +// cg_cmm.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for acceleration of the cg/cmm/cut pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int itype=ix.w; + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (r2inv<lj1[mtype].x) { + r2inv=ucl_recip(r2inv); + numtyp inv1,inv2; + + if (lj1[mtype].y == 2) { + inv1=r2inv*r2inv; + inv2=inv1*inv1; + } else if (lj1[mtype].y == 1) { + inv2=r2inv*ucl_sqrt(r2inv); + inv1=inv2*inv2; + } else { + inv1=r2inv*r2inv*r2inv; + inv2=inv1; + } + numtyp force = 
factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w); + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + if (eflag>0) + energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- + lj3[mtype].z; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in,__global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<lj1[mtype].x) { + r2inv=ucl_recip(r2inv); + numtyp inv1,inv2; + + if (lj1[mtype].y == (numtyp)2) { + inv1=r2inv*r2inv; + inv2=inv1*inv1; + } else if (lj1[mtype].y == (numtyp)1) { + inv2=r2inv*ucl_sqrt(r2inv); + inv1=inv2*inv2; + } else { + inv1=r2inv*r2inv*r2inv; + inv2=inv1; + } + numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w); + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + if (eflag>0) + energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- + lj3[mtype].z; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_cg_cmm.h b/lib/gpu/lal_cg_cmm.h new file mode 100644 index 000000000..c7d50b0b0 --- /dev/null +++ b/lib/gpu/lal_cg_cmm.h @@ -0,0 +1,79 @@ +/*************************************************************************** + cg_cmm.h + ------------------- + W. 
Michael Brown (ORNL) + + Class for acceleration of the cg/cmm/cut pair style + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_CG_CMM_H +#define LAL_CG_CMM_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class CGCMM : public BaseAtomic<numtyp, acctyp> { + public: + CGCMM(); + ~CGCMM(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, int **host_cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = cutsq, lj1.y=cg_type, lj1.z = lj1, lj1.w = lj2 + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _cmm_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp new file mode 100644 index 000000000..c9fafb789 --- /dev/null +++ b/lib/gpu/lal_cg_cmm_ext.cpp @@ -0,0 +1,121 @@ +/*************************************************************************** + cg_cmm.h + ------------------- + W. 
Michael Brown (ORNL) + + Functions for LAMMPS access to cg/cmm/cut pair acceleration routines + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_cg_cmm.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CGCMM<PRECISION,ACC_PRECISION> CMMMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + CMMMF.clear(); + gpu_mode=CMMMF.device->gpu_mode(); + double gpu_split=CMMMF.device->particle_split(); + int first_gpu=CMMMF.device->first_device(); + int last_gpu=CMMMF.device->last_device(); + int world_me=CMMMF.device->world_me(); + int gpu_rank=CMMMF.device->gpu_rank(); + int procs_per_gpu=CMMMF.device->procs_per_gpu(); + + CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu); + + bool message=false; + if (CMMMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + CMMMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + CMMMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CMMMF.estimate_gpu_overhead(); + return init_ok; +} + +void cmm_gpu_clear() { + CMMMF.clear(); +} + +int** cmm_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void cmm_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, 
bool &success) { + CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double cmm_gpu_bytes() { + return CMMMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_cg_cmm_long.cpp new file mode 100644 index 000000000..ffda7343c --- /dev/null +++ b/lib/gpu/lal_cg_cmm_long.cpp @@ -0,0 +1,169 @@ +/*************************************************************************** + cg_cmm_long.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the cg/cmm/coul/long pair style + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "cg_cmm_long_cl.h" +#else +#include "cg_cmm_long_ptx.h" +#endif + +#include "lal_cg_cmm_long.h" +#include <cassert> +using namespace LAMMPS_AL; +#define CGCMMLongT CGCMMLong<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +CGCMMLongT::CGCMMLong() : BaseCharge<numtyp,acctyp>(), + _allocated(false) { +} + +template <class numtyp, class acctyp> +CGCMMLongT::~CGCMMLong() { + clear(); +} + +template <class numtyp, class acctyp> +int CGCMMLongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int CGCMMLongT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, + const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cg_cmm_long); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq, + host_cut_ljsq,host_lj1,host_lj2); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3, + host_lj4,host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class 
numtyp, class acctyp> +void CGCMMLongT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double CGCMMLongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CGCMMLong<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, + &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CGCMMLong<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu new file mode 100644 index 000000000..45dbf2a86 --- /dev/null +++ b/lib/gpu/lal_cg_cmm_long.cu @@ -0,0 +1,265 @@ +// ************************************************************************** +// cg_cmm_long.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for acceleration of the cg/cmm/coul/long pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +texture<float> q_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +ucl_inline float fetch_q(const int& i, const float *q) + { return tex1Dfetch(q_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int itype=ix.w; + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (rsq<lj1[mtype].x) { + numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc; + numtyp r2inv=ucl_recip(rsq); + + if (rsq < lj1[mtype].y) { + if (lj3[mtype].x == (numtyp)2) { + inv1=r2inv*r2inv; + inv2=inv1*inv1; + } else if (lj3[mtype].x == (numtyp)1) { + inv2=r2inv*ucl_rsqrt(rsq); + inv1=inv2*inv2; + } else { + inv1=r2inv*r2inv*r2inv; + inv2=inv1; + } + force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w); + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].y) { + energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- + lj3[mtype].w; + } + } + if (vflag>0) { + 
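// accumulate the six virial tensor components (xx, yy, zz, xy, xz, yz) + 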
virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<lj1[mtype].x) { + numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc; + numtyp r2inv=ucl_recip(rsq); + + if (rsq < lj1[mtype].y) { + if (lj3[mtype].x == (numtyp)2) { + inv1=r2inv*r2inv; + inv2=inv1*inv1; + } else if (lj3[mtype].x == (numtyp)1) { + inv2=r2inv*ucl_rsqrt(rsq); + inv1=inv2*inv2; + } else { + inv1=r2inv*r2inv*r2inv; + inv2=inv1; + } + force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w); + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].y) { + energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- + lj3[mtype].w; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git 
a/lib/gpu/lal_cg_cmm_long.h b/lib/gpu/lal_cg_cmm_long.h new file mode 100644 index 000000000..1ffbc7c5b --- /dev/null +++ b/lib/gpu/lal_cg_cmm_long.h @@ -0,0 +1,83 @@ +/*************************************************************************** + cg_cmm_long.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the cg/cmm/coul/long pair style + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_CG_CMM_LONG_H +#define LAL_CG_CMM_LONG_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class CGCMMLong : public BaseCharge<numtyp, acctyp> { + public: + CGCMMLong(); + ~CGCMMLong(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, int ** cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp new file mode 100644 index 000000000..8da2d65af --- /dev/null +++ b/lib/gpu/lal_cg_cmm_long_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + cg_cmm_long.h + ------------------- + W. 
Michael Brown (ORNL) + + Functions for LAMMPS access to cg/cmm/coul/long acceleration functions + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_cg_cmm_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + CMMLMF.clear(); + gpu_mode=CMMLMF.device->gpu_mode(); + double gpu_split=CMMLMF.device->particle_split(); + int first_gpu=CMMLMF.device->first_device(); + int last_gpu=CMMLMF.device->last_device(); + int world_me=CMMLMF.device->world_me(); + int gpu_rank=CMMLMF.device->gpu_rank(); + int procs_per_gpu=CMMLMF.device->procs_per_gpu(); + + CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu); + + bool message=false; + if (CMMLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); + + CMMLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e, g_ewald); + CMMLMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CMMLMF.estimate_gpu_overhead(); + return init_ok; +} + +void cmml_gpu_clear() { + CMMLMF.clear(); +} + +int** cmml_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, 
special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q,boxlo,prd); +} + +void cmml_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double cmml_gpu_bytes() { + return CMMLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp new file mode 100644 index 000000000..d26023081 --- /dev/null +++ b/lib/gpu/lal_charmm_long.cpp @@ -0,0 +1,174 @@ +/*************************************************************************** + charmm_long.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "charmm_long_cl.h" +#else +#include "charmm_long_ptx.h" +#endif + +#include "lal_charmm_long.h" +#include <cassert> +using namespace LAMMPS_AL; +#define CHARMMLongT CHARMMLong<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +CHARMMLongT::CHARMMLong() : BaseCharge<numtyp,acctyp>(), + _allocated(false) { +} + +template <class numtyp, class acctyp> +CHARMMLongT::~CHARMMLong() { + clear(); +} + +template <class numtyp, class acctyp> +int CHARMMLongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int CHARMMLongT::init(const int ntypes, + double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald, const double cut_lj_innersq, + const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,charmm_long); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (this->_block_bio_size>=64 && mix_arithmetic) + shared_types=true; + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + int h_size=lj_types*lj_types; + int max_bio_shared_types=this->device->max_bio_shared_types(); + if (h_size<max_bio_shared_types) + h_size=max_bio_shared_types; + UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + for (int i=0; i<h_size*32; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + 
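// type_pack4 below packs the four per-type host tables into one numtyp4 + // per type pair: x = lj1, y = lj2, z = lj3, w = lj4 (the layout documented + // in lal_charmm_long.h and read back by the kernels in lal_charmm_long.cu). + 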
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_lj3,host_lj4); + + ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_bothsq = host_cut_bothsq; + _cut_coulsq = host_cut_coulsq; + _cut_ljsq = host_cut_ljsq; + _cut_lj_innersq = cut_lj_innersq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + _denom_lj=denom_lj; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void CHARMMLongT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + ljd.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double CHARMMLongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CHARMMLong<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->_block_bio_size; + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(), + &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CHARMMLong<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu new file mode 100644 index 000000000..aa88967d6 --- /dev/null +++ b/lib/gpu/lal_charmm_long.cu @@ -0,0 +1,278 @@ +// ************************************************************************** +// charmm_long.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for acceleration of the charmm/coul/long pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +texture<float> q_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +ucl_inline float fetch_q(const int& i, const float *q) + { return tex1Dfetch(q_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + const int lj_types, __global numtyp *sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp denom_lj, + const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_lj_innersq, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int itype=ix.w; + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (rsq<cut_bothsq) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc, switch1; + + if (rsq < cut_ljsq) { + r6inv = r2inv*r2inv*r2inv; + force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ + denom_lj; + switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w); + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + 
f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_ljsq) { + numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w); + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, + __global numtyp* sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp denom_lj, + const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + ljd[tid]=ljd_in[tid]; + if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES) + ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int itype=ix.w; + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<cut_bothsq) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, prefactor, _erfc, switch1; + numtyp lj3, lj4; + + if (rsq < cut_ljsq) { + numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x); + numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y); + + numtyp sig_r_6 = sig6*sig6*r2inv; + sig_r_6 = sig_r_6*sig_r_6*sig_r_6; + lj4 = (numtyp)4.0*eps*sig_r_6; + lj3 = lj4*sig_r_6; + force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ + denom_lj; + switch2 *= lj3-lj4; + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + 
f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < cut_ljsq) { + numtyp e=lj3-lj4; + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h new file mode 100644 index 000000000..201a5c369 --- /dev/null +++ b/lib/gpu/lal_charmm_long.h @@ -0,0 +1,87 @@ +/*************************************************************************** + charmm_long.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_CHARMM_LONG_H +#define LAL_CHARMM_LONG_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class CHARMMLong : public BaseCharge<numtyp, acctyp> { + public: + CHARMMLong(); + ~CHARMMLong(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double host_cut_bothsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, + const double cut_lj_innersq, const double denom_lj, + double **epsilon, double **sigma, const bool mix_arithmetic); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// x = lj1, y = lj2, z = lj3, w = lj4 + UCL_D_Vec<numtyp4> lj1; + /// x = epsilon, y = sigma + UCL_D_Vec<numtyp2> ljd; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e, _g_ewald, _denom_lj; + + numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git 
a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp new file mode 100644 index 000000000..5d544dc87 --- /dev/null +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -0,0 +1,135 @@ +/*************************************************************************** + charmm_long_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to charmm/coul/long acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_charmm_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CHARMMLong<PRECISION,ACC_PRECISION> CRMLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald, const double cut_lj_innersq, + const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { + CRMLMF.clear(); + gpu_mode=CRMLMF.device->gpu_mode(); + double gpu_split=CRMLMF.device->particle_split(); + int first_gpu=CRMLMF.device->first_device(); + int last_gpu=CRMLMF.device->last_device(); + int world_me=CRMLMF.device->world_me(); + int gpu_rank=CRMLMF.device->gpu_rank(); + int procs_per_gpu=CRMLMF.device->procs_per_gpu(); + + CRMLMF.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu); + + bool message=false; + if (CRMLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, + epsilon,sigma,mix_arithmetic); + + CRMLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, + sigma, mix_arithmetic); + + CRMLMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CRMLMF.estimate_gpu_overhead(); + return init_ok; +} + +void 
crml_gpu_clear() { + CRMLMF.clear(); +} + +int** crml_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void crml_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd) { + CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double crml_gpu_bytes() { + return CRMLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp new file mode 100644 index 000000000..90c07246f --- /dev/null +++ b/lib/gpu/lal_coul_long.cpp @@ -0,0 +1,156 @@ +/*************************************************************************** + coul_long.cpp + ------------------- + Axel Kohlmeyer (Temple) + + Class for acceleration of the coul/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : July 2011 + email : a.kohlmeyer@temple.edu + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "coul_long_cl.h" +#else +#include "coul_long_ptx.h" +#endif + +#include "lal_coul_long.h" +#include <cassert> +using namespace LAMMPS_AL; +#define CoulLongT CoulLong<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> pair_gpu_device; + +template <class numtyp, class acctyp> +CoulLongT::CoulLong() : BaseCharge<numtyp,acctyp>(), _allocated(false) { +} + +template <class numtyp, class acctyp> +CoulLongT::~CoulLong() { + clear(); +} + +template <class numtyp, class acctyp> +int CoulLongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int CoulLongT::init(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,coul_long); + if (success!=0) + return success; + + // we don't have atom types for coulomb only, + // but go with the minimum so that we can use + // the same infrastructure as lj/cut/coul/long/gpu. 
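+ // lj1 and lj3 below are allocated only as placeholders so that the kernel + // argument list stays identical to lj/cut/coul/long/gpu; they are not + // referenced by the kernels in lal_coul_long.cu (see the "lj1 dummy" and + // "lj3 dummy" notes in lal_coul_long.h).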
+ int lj_types=1; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_coul[i]; + } + ucl_copy(sp_cl,host_write,4,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void CoulLongT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_cl.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double CoulLongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CoulLong<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void CoulLongT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_cl.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CoulLong<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu new file mode 100644 index 000000000..88d740620 --- /dev/null +++ b/lib/gpu/lal_coul_long.cu @@ -0,0 +1,301 @@ +// ************************************************************************** +// coul_long.cu +// ------------------- +// Axel Kohlmeyer (Temple) +// +// Device code for acceleration of the coul/long pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// 
+// begin : July 2011 +// email : a.kohlmeyer@temple.edu +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +texture<float> q_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +ucl_inline float fetch_q(const int& i, const float *q) + { return tex1Dfetch(q_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_cl_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_cl[4]; + sp_cl[0]=sp_cl_in[0]; + sp_cl[1]=sp_cl_in[1]; + sp_cl[2]=sp_cl_in[2]; + sp_cl[3]=sp_cl_in[3]; + + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_coul; + factor_coul = (numtyp)1.0-sp_cl[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq < cut_coulsq) { + numtyp r2inv=ucl_recip(rsq); + numtyp force, prefactor, _erfc; + + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + e_coul += prefactor*(_erfc-factor_coul); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + e_coul=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (offset==0) { + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=(acctyp)0; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + 
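// store the accumulated per-atom force; the energy (zero vdW slot, then + // e_coul) and virial terms were written above into engv, strided by inum. + 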
ans[ii]=f; + } + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_cl_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_cl[4]; + if (tid<4) + sp_cl[tid]=sp_cl_in[tid]; + + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_coul; + factor_coul = (numtyp)1.0-sp_cl[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq < cut_coulsq) { + numtyp r2inv=ucl_recip(rsq); + numtyp force, prefactor, _erfc; + + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + e_coul += prefactor*(_erfc-factor_coul); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + e_coul=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (offset==0) { + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=(acctyp)0; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } + } // if ii +} + diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h new file mode 100644 index 000000000..62103e726 --- /dev/null +++ b/lib/gpu/lal_coul_long.h @@ -0,0 +1,80 @@ +/*************************************************************************** + coul_long.h + ------------------- + Axel Kohlmeyer (Temple) + + Class for acceleration of the coul/long pair style. 
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : July 2011
+    email                : a.kohlmeyer@temple.edu
+ ***************************************************************************/
+
+#ifndef LAL_Coul_Long_H
+#define LAL_Coul_Long_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class CoulLong : public BaseCharge<numtyp, acctyp> {
+ public:
+  CoulLong();
+  ~CoulLong();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init(const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// lj1 dummy
+  UCL_D_Vec<numtyp4> lj1;
+  /// lj3 dummy
+  UCL_D_Vec<numtyp4> lj3;
+  /// Special Coul values [0-3]
+  UCL_D_Vec<numtyp> sp_cl;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _cut_coulsq, _qqrd2e, _g_ewald;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp
new file mode 100644
index 000000000..f6ce0c1d7
--- /dev/null
+++ b/lib/gpu/lal_coul_long_ext.cpp
@@ -0,0 +1,123 @@
+/***************************************************************************
+                             coul_long_ext.cpp
+                             -------------------
+                          Axel Kohlmeyer (Temple)
+
+  Functions for LAMMPS access to coul/long acceleration routines.
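+
+  A rough sketch of the call order expected from the host pair style
+  (illustrative; the wrapper that actually calls these functions lives in
+  the LAMMPS host code, not in this file):
+
+    cl_gpu_init()      - once per run; copies cutoff, special_coul, qqrd2e
+                         and g_ewald to the device
+    cl_gpu_compute_n() - per timestep when neighboring is done on the device
+    cl_gpu_compute()   - per timestep when neighbor lists come from the host
+    cl_gpu_clear()     - teardown; cl_gpu_bytes() reports host memory use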
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : July 2011 + email : a.kohlmeyer@temple.edu + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_coul_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CoulLong<PRECISION,ACC_PRECISION> CLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int cl_gpu_init(const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald) { + CLMF.clear(); + gpu_mode=CLMF.device->gpu_mode(); + double gpu_split=CLMF.device->particle_split(); + int first_gpu=CLMF.device->first_device(); + int last_gpu=CLMF.device->last_device(); + int world_me=CLMF.device->world_me(); + int gpu_rank=CLMF.device->gpu_rank(); + int procs_per_gpu=CLMF.device->procs_per_gpu(); + + CLMF.device->init_message(screen,"coul/long",first_gpu,last_gpu); + + bool message=false; + if (CLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, + screen, host_cut_coulsq, host_special_coul, qqrd2e, + g_ewald); + + CLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, + screen, host_cut_coulsq, host_special_coul, + qqrd2e, g_ewald); + + CLMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CLMF.estimate_gpu_overhead(); + return init_ok; +} + +void cl_gpu_clear() { + CLMF.clear(); +} + +int** cl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void cl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + 
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double cl_gpu_bytes() { + return CLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp new file mode 100644 index 000000000..d4c2e2385 --- /dev/null +++ b/lib/gpu/lal_device.cpp @@ -0,0 +1,640 @@ +/*************************************************************************** + device.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for management of the device where the computations are performed + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include "lal_device.h" +#include "lal_precision.h" +#include <map> +#include <math.h> +#ifdef _OPENMP +#include <omp.h> +#endif + +#ifdef USE_OPENCL +#include "device_cl.h" +#else +#include "device_ptx.h" +#endif + +using namespace LAMMPS_AL; +#define DeviceT Device<numtyp, acctyp> + +template <class numtyp, class acctyp> +DeviceT::Device() : _init_count(0), _device_init(false), + _gpu_mode(GPU_FORCE), _first_device(0), + _last_device(0), _compiled(false) { +} + +template <class numtyp, class acctyp> +DeviceT::~Device() { + clear_device(); +} + +template <class numtyp, class acctyp> +int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double p_split, + const int nthreads, const int t_per_atom) { + _nthreads=nthreads; + #ifdef _OPENMP + omp_set_num_threads(nthreads); + #endif + _threads_per_atom=t_per_atom; + _threads_per_charge=t_per_atom; + + if (_device_init) + return 0; + _device_init=true; + _comm_world=world; + _comm_replica=replica; + _first_device=first_gpu; + _last_device=last_gpu; + _gpu_mode=gpu_mode; + _particle_split=p_split; + + // Get the rank/size within the world + MPI_Comm_rank(_comm_world,&_world_me); + MPI_Comm_size(_comm_world,&_world_size); + // Get the rank/size within the replica + MPI_Comm_rank(_comm_replica,&_replica_me); + MPI_Comm_size(_comm_replica,&_replica_size); + + // Get the names of all nodes + int name_length; + char node_name[MPI_MAX_PROCESSOR_NAME]; + char node_names[MPI_MAX_PROCESSOR_NAME*_world_size]; + MPI_Get_processor_name(node_name,&name_length); + MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, + MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); + std::string node_string=std::string(node_name); + + // Get the number of procs per node + std::map<std::string,int> name_map; + std::map<std::string,int>::iterator np; + for (int i=0; i<_world_size; i++) { + std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]); + np=name_map.find(i_string); + if (np==name_map.end()) + name_map[i_string]=1; + else + np->second++; + } + int procs_per_node=name_map.begin()->second; + + // Assign a unique id to each node + int split_num=0, split_id=0; + for (np=name_map.begin(); np!=name_map.end(); ++np) { + if (np->first==node_string) + split_id=split_num; + split_num++; + } + + // Set up a per node communicator and find rank within + MPI_Comm node_comm; + MPI_Comm_split(_comm_world, split_id, 0, &node_comm); + int node_rank; + MPI_Comm_rank(node_comm,&node_rank); + + // set the device ID + _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/ + 
(last_gpu-first_gpu+1))); + int my_gpu=node_rank/_procs_per_gpu+first_gpu; + + // Time on the device only if 1 proc per gpu + _time_device=true; + if (_procs_per_gpu>1) + _time_device=false; + + // Set up a per device communicator + MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); + MPI_Comm_rank(_comm_gpu,&_gpu_rank); + + gpu=new UCL_Device(); + if (my_gpu>=gpu->num_devices()) + return -2; + + if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) + return -7; + + if (gpu->set(my_gpu)!=UCL_SUCCESS) + return -6; + + _long_range_precompute=0; + + int flag=compile_kernels(); + + return flag; +} + +template <class numtyp, class acctyp> +int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge, + const bool rot, const int nlocal, + const int host_nlocal, const int nall, + Neighbor *nbor, const int maxspecial, + const int gpu_host, const int max_nbors, + const double cell_size, const bool pre_cut, + const int threads_per_atom) { + if (!_device_init) + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + // Counts of data transfers for timing overhead estimates + _data_in_estimate=0; + _data_out_estimate=1; + + // Initial number of local particles + int ef_nlocal=nlocal; + if (_particle_split<1.0 && _particle_split>0.0) + ef_nlocal=static_cast<int>(_particle_split*nlocal); + + int gpu_nbor=0; + if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH) + gpu_nbor=1; + else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH) + gpu_nbor=2; + #ifdef USE_OPENCL + if (gpu_nbor==1) + gpu_nbor=2; + #endif + + if (_init_count==0) { + // Initialize atom and nbor data + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0)) + return -3; + + _data_in_estimate++; + if (charge) + _data_in_estimate++; + if (rot) + _data_in_estimate++; + } else { + if (atom.charge()==false && charge) + _data_in_estimate++; + if (atom.quat()==false && rot) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial)) + return -3; + } + + if (!ans.init(ef_nlocal,charge,rot,*gpu)) + return -3; + + if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + _block_cell_id, _block_nbor_build, threads_per_atom, + _time_device)) + return -3; + nbor->cell_size(cell_size); + + _init_count++; + return 0; +} + +template <class numtyp, class acctyp> +int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal, + const int nall) { + if (!_device_init) + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + if (_init_count==0) { + // Initialize atom and nbor data + if (!atom.init(nall,true,false,*gpu,false,false)) + return -3; + } else + if (!atom.add_fields(true,false,false,false)) + return -3; + + if (!ans.init(nlocal,true,false,*gpu)) + return -3; + + _init_count++; + return 0; +} + +template <class numtyp, class acctyp> +void DeviceT::set_single_precompute + (PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) { + _long_range_precompute=1; + pppm_single=pppm; +} + +template <class numtyp, class acctyp> +void DeviceT::set_double_precompute + (PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm) { + _long_range_precompute=2; + pppm_double=pppm; +} + +template <class numtyp, class acctyp> +void DeviceT::init_message(FILE *screen, const char *name, + const int first_gpu, const int last_gpu) { + #ifdef USE_OPENCL + std::string fs=""; + #else + std::string fs=toa(gpu->free_gigabytes())+"/"; + #endif + + if (_replica_me == 0 && screen) { 
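+    // Example of the banner produced by the fprintf calls below (device name,
+    // core count, memory and clock are illustrative values only; at runtime
+    // they come from the UCL_Device queries):
+    //
+    //   ------------------------------------------------------------------
+    //   - Using GPGPU acceleration for coul/long:
+    //   -  with 4 proc(s) per device.
+    //   ------------------------------------------------------------------
+    //   GPU 0: Tesla C2050, 448 cores, 2.6/3.0 GB, 1.1 GHZ (Mixed Precision)
+    //   ------------------------------------------------------------------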
+ fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"-------------------------------------\n"); + fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); + fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); + #ifdef _OPENMP + fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); + #endif + #ifdef USE_OPENCL + fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR); + #endif + fprintf(screen,"-------------------------------------"); + fprintf(screen,"-------------------------------------\n"); + + int last=last_gpu+1; + if (last>gpu->num_devices()) + last=gpu->num_devices(); + for (int i=first_gpu; i<last; i++) { + std::string sname; + if (i==first_gpu) + sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; + else + sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + toa(gpu->clock_rate(i))+" GHZ ("; + if (sizeof(PRECISION)==4) { + if (sizeof(ACC_PRECISION)==4) + sname+="Single Precision)"; + else + sname+="Mixed Precision)"; + } else + sname+="Double Precision)"; + + fprintf(screen,"GPU %d: %s\n",i,sname.c_str()); + } + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"-------------------------------------\n\n"); + } +} + +template <class numtyp, class acctyp> +void DeviceT::estimate_gpu_overhead(const int kernel_calls, + double &gpu_overhead, + double &gpu_driver_overhead) { + UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL; + UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; + UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; + UCL_Timer over_timer(*gpu); + + if (_data_in_estimate>0) { + host_data_in=new UCL_H_Vec<int>[_data_in_estimate]; + dev_data_in=new UCL_D_Vec<int>[_data_in_estimate]; + timers_in=new UCL_Timer[_data_in_estimate]; + } + + if (_data_out_estimate>0) { + host_data_out=new UCL_H_Vec<int>[_data_out_estimate]; + dev_data_out=new UCL_D_Vec<int>[_data_out_estimate]; + timers_out=new UCL_Timer[_data_out_estimate]; + } + + if (kernel_calls>0) { + kernel_data=new UCL_D_Vec<int>[kernel_calls]; + timers_kernel=new UCL_Timer[kernel_calls]; + } + + for (int i=0; i<_data_in_estimate; i++) { + host_data_in[i].alloc(1,*gpu); + dev_data_in[i].alloc(1,*gpu); + timers_in[i].init(*gpu); + } + + for (int i=0; i<_data_out_estimate; i++) { + host_data_out[i].alloc(1,*gpu); + dev_data_out[i].alloc(1,*gpu); + timers_out[i].init(*gpu); + } + + for (int i=0; i<kernel_calls; i++) { + kernel_data[i].alloc(1,*gpu); + timers_kernel[i].init(*gpu); + } + + gpu_overhead=0.0; + gpu_driver_overhead=0.0; + + for (int i=0; i<10; i++) { + gpu->sync(); + gpu_barrier(); + over_timer.start(); + gpu->sync(); + gpu_barrier(); + + double driver_time=MPI_Wtime(); + for (int i=0; i<_data_in_estimate; i++) { + timers_in[i].start(); + ucl_copy(dev_data_in[i],host_data_in[i],true); + timers_in[i].stop(); + } + + for (int i=0; i<kernel_calls; i++) { + timers_kernel[i].start(); + zero(kernel_data[i],1); + timers_kernel[i].stop(); + } + + for (int i=0; i<_data_out_estimate; i++) { + timers_out[i].start(); + ucl_copy(host_data_out[i],dev_data_out[i],true); + timers_out[i].stop(); + } + over_timer.stop(); + + double time=over_timer.seconds(); + driver_time=MPI_Wtime()-driver_time; + + if (time_device()) { + for (int i=0; i<_data_in_estimate; i++) + timers_in[i].add_to_total(); + for (int i=0; i<kernel_calls; i++) + timers_kernel[i].add_to_total(); + for (int i=0; i<_data_out_estimate; i++) + 
timers_out[i].add_to_total(); + } + + double mpi_time, mpi_driver_time; + MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm()); + MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm()); + gpu_overhead+=mpi_time; + gpu_driver_overhead+=mpi_driver_time; + } + gpu_overhead/=10.0; + gpu_driver_overhead/=10.0; + + if (_data_in_estimate>0) { + delete [] host_data_in; + delete [] dev_data_in; + delete [] timers_in; + } + + if (_data_out_estimate>0) { + delete [] host_data_out; + delete [] dev_data_out; + delete [] timers_out; + } + + if (kernel_calls>0) { + delete [] kernel_data; + delete [] timers_kernel; + } +} + +template <class numtyp, class acctyp> +void DeviceT::output_times(UCL_Timer &time_pair, + Answer<numtyp,acctyp> &ans, + Neighbor &nbor, const double avg_split, + const double max_bytes, + const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen) { + double single[9], times[9]; + + single[0]=atom.transfer_time()+ans.transfer_time(); + single[1]=nbor.time_nbor.total_seconds()+nbor.time_hybrid1.total_seconds()+ + nbor.time_hybrid2.total_seconds(); + single[2]=nbor.time_kernel.total_seconds(); + single[3]=time_pair.total_seconds(); + single[4]=atom.cast_time()+ans.cast_time(); + single[5]=gpu_overhead; + single[6]=driver_overhead; + single[7]=ans.cpu_idle_time(); + single[8]=nbor.bin_time(); + + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); + double mpi_max_bytes; + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); + double max_mb=mpi_max_bytes/(1024.0*1024.0); + + if (replica_me()==0) + if (screen && times[5]>0.0) { + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (time_device()) { + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); + fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); + fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); + if (nbor.gpu_nbor()>0) + fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); + else + fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); + fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); + } + if (nbor.gpu_nbor()==2) + fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); + fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + } +} + +template <class numtyp, class acctyp> +void DeviceT::output_kspace_times(UCL_Timer &time_in, + UCL_Timer &time_out, + UCL_Timer &time_map, + UCL_Timer &time_rho, + UCL_Timer &time_interp, + Answer<numtyp,acctyp> &ans, + const double max_bytes, + const double cpu_time, + const double idle_time, FILE *screen) { + double single[8], times[8]; + + single[0]=time_out.total_seconds(); + single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); + 
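+  // Slot layout gathered here (matches the labels printed below):
+  //   0 data out, 1 data in + atom transfer/cast, 2 charge map kernel,
+  //   3 charge spreading (rho) kernel, 4 force interpolation,
+  //   5 answer transfer/cast, 6 CPU Poisson solve, 7 CPU idle time.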
single[2]=time_map.total_seconds(); + single[3]=time_rho.total_seconds(); + single[4]=time_interp.total_seconds(); + single[5]=ans.transfer_time()+ans.cast_time(); + single[6]=cpu_time; + single[7]=idle_time; + + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); + double mpi_max_bytes; + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); + double max_mb=mpi_max_bytes/(1024.0*1024.0); + + if (replica_me()==0) + if (screen && times[6]>0.0) { + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (time_device()) { + fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); + fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); + fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); + fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size); + fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size); + fprintf(screen,"Total rho: %.4f s.\n", + (times[0]+times[2]+times[3])/_replica_size); + fprintf(screen,"Total interp: %.4f s.\n", + (times[1]+times[4])/_replica_size); + fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Total: %.4f s.\n", + (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ + _replica_size); + } + fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + } +} + +template <class numtyp, class acctyp> +void DeviceT::clear() { + if (_init_count>0) { + _long_range_precompute=0; + _init_count--; + if (_init_count==0) { + atom.clear(); + _neighbor_shared.clear(); + if (_compiled) { + k_zero.clear(); + k_info.clear(); + delete dev_program; + _compiled=false; + } + } + } +} + +template <class numtyp, class acctyp> +void DeviceT::clear_device() { + while (_init_count>0) + clear(); + if (_device_init) { + delete gpu; + _device_init=false; + } +} + +template <class numtyp, class acctyp> +int DeviceT::compile_kernels() { + int flag=0; + + if (_compiled) + return flag; + + std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR); + dev_program=new UCL_Program(*gpu); + int success=dev_program->load_string(device,flags.c_str()); + if (success!=UCL_SUCCESS) + return -4; + k_zero.set_function(*dev_program,"kernel_zero"); + k_info.set_function(*dev_program,"kernel_info"); + _compiled=true; + + UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED); + UCL_D_Vec<int> d_gpu_lib_data(14,*gpu); + k_info.set_size(1,1); + k_info.run(&d_gpu_lib_data.begin()); + ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false); + + _ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0; + #ifndef USE_OPENCL + if (_ptx_arch>gpu->arch()) + return -4; + #endif + + _num_mem_threads=h_gpu_lib_data[1]; + _warp_size=h_gpu_lib_data[2]; + if (_threads_per_atom<1) + _threads_per_atom=h_gpu_lib_data[3]; + if (_threads_per_charge<1) + _threads_per_charge=h_gpu_lib_data[13]; + _pppm_max_spline=h_gpu_lib_data[4]; + _pppm_block=h_gpu_lib_data[5]; + _block_pair=h_gpu_lib_data[6]; + _max_shared_types=h_gpu_lib_data[7]; + _block_cell_2d=h_gpu_lib_data[8]; + 
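+  // (The h_gpu_lib_data indices used here mirror the order written by
+  //  kernel_info() in lal_device.cu: ARCH, MEM_THREADS, WARP_SIZE,
+  //  THREADS_PER_ATOM, PPPM_MAX_SPLINE, PPPM_BLOCK_1D, BLOCK_PAIR,
+  //  MAX_SHARED_TYPES, BLOCK_CELL_2D, BLOCK_CELL_ID, BLOCK_NBOR_BUILD,
+  //  BLOCK_BIO_PAIR, MAX_BIO_SHARED_TYPES, THREADS_PER_CHARGE.)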
_block_cell_id=h_gpu_lib_data[9]; + _block_nbor_build=h_gpu_lib_data[10]; + _block_bio_pair=h_gpu_lib_data[11]; + _max_bio_shared_types=h_gpu_lib_data[12]; + + if (static_cast<size_t>(_block_pair)>gpu->group_size()) + _block_pair=gpu->group_size(); + if (static_cast<size_t>(_block_bio_pair)>gpu->group_size()) + _block_bio_pair=gpu->group_size(); + if (_threads_per_atom>_warp_size) + _threads_per_atom=_warp_size; + if (_warp_size%_threads_per_atom!=0) + _threads_per_atom=1; + if (_threads_per_atom & (_threads_per_atom - 1)) + _threads_per_atom=1; + if (_threads_per_charge>_warp_size) + _threads_per_charge=_warp_size; + if (_warp_size%_threads_per_charge!=0) + _threads_per_charge=1; + if (_threads_per_charge & (_threads_per_charge - 1)) + _threads_per_charge=1; + + return flag; +} + +template <class numtyp, class acctyp> +double DeviceT::host_memory_usage() const { + return atom.host_memory_usage()+4*sizeof(numtyp)+ + sizeof(Device<numtyp,acctyp>); +} + +template class Device<PRECISION,ACC_PRECISION>; +Device<PRECISION,ACC_PRECISION> global_device; + +int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads, + const int t_per_atom) { + return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, + particle_split,nthreads,t_per_atom); +} + +void lmp_clear_device() { + global_device.clear_device(); +} + +double lmp_gpu_forces(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); +} diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu new file mode 100644 index 000000000..54a95417b --- /dev/null +++ b/lib/gpu/lal_device.cu @@ -0,0 +1,42 @@ +// ************************************************************************** +// device.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for device information +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +#endif + +__kernel void kernel_zero(__global int *mem, int numel) { + int ii=GLOBAL_ID_X; + + if (ii<numel) + mem[ii]=0; +} + +__kernel void kernel_info(__global int *info) { + info[0]=ARCH; + info[1]=MEM_THREADS; + info[2]=WARP_SIZE; + info[3]=THREADS_PER_ATOM; + info[4]=PPPM_MAX_SPLINE; + info[5]=PPPM_BLOCK_1D; + info[6]=BLOCK_PAIR; + info[7]=MAX_SHARED_TYPES; + info[8]=BLOCK_CELL_2D; + info[9]=BLOCK_CELL_ID; + info[10]=BLOCK_NBOR_BUILD; + info[11]=BLOCK_BIO_PAIR; + info[12]=MAX_BIO_SHARED_TYPES; + info[13]=THREADS_PER_CHARGE; +} diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h new file mode 100644 index 000000000..e71c22de8 --- /dev/null +++ b/lib/gpu/lal_device.h @@ -0,0 +1,317 @@ +/*************************************************************************** + device.h + ------------------- + W. 
Michael Brown (ORNL)
+
+  Class for management of the device where the computations are performed
+
+  __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+  __________________________________________________________________________
+
+    begin                :
+    email                : brownw@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_DEVICE_H
+#define LAL_DEVICE_H
+
+#include "lal_atom.h"
+#include "lal_answer.h"
+#include "lal_neighbor.h"
+#include "lal_pppm.h"
+#include "mpi.h"
+#include <sstream>
+#include "stdio.h"
+#include <string>
+#include <queue>
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp,
+          class grdtyp, class grdtyp4> class PPPM;
+
+template <class numtyp, class acctyp>
+class Device {
+ public:
+  Device();
+  ~Device();
+
+  /// Initialize the device for use by this process
+  /** Sets up a per-device MPI communicator for load balancing and initializes
+    * the device (>=first_gpu and <=last_gpu) that this proc will be using
+    * Returns:
+    * -  0 if successful
+    * - -2 if GPU not found
+    * - -4 if GPU library not compiled for GPU
+    * - -6 if GPU could not be initialized for use
+    * - -7 if accelerator sharing is not currently allowed on system **/
+  int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
+                  const int last_gpu, const int gpu_mode,
+                  const double particle_split, const int nthreads,
+                  const int t_per_atom);
+
+  /// Initialize the device for Atom and Neighbor storage
+  /** \param rot True if quaternions need to be stored
+    * \param nlocal Total number of local particles to allocate memory for
+    * \param host_nlocal Initial number of host particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    * \param gpu_host 0 if host will not perform force calculations,
+    *                 1 if gpu_nbor is true, and host needs a half nbor list,
+    *                 2 if gpu_nbor is true, and host needs a full nbor list
+    * \param max_nbors Initial number of rows in the neighbor matrix
+    * \param cell_size cutoff+skin
+    * \param pre_cut True if cutoff test will be performed in separate kernel
+    *                than the force kernel
+    * \param threads_per_atom value to be used by the neighbor list only
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
+           const int nlocal, const int host_nlocal, const int nall,
+           Neighbor *nbor, const int maxspecial, const int gpu_host,
+           const int max_nbors, const double cell_size, const bool pre_cut,
+           const int threads_per_atom);
+
+  /// Initialize the device for Atom storage only
+  /** \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
+
+  /// Output a message for pair_style acceleration with device stats
+  void init_message(FILE *screen, const char *name,
+                    const int first_gpu, const int last_gpu);
+
+  /// Perform charge assignment asynchronously for PPPM
+  void set_single_precompute(PPPM<numtyp,acctyp,
+                             float,_lgpu_float4> *pppm);
+
+  /// Perform charge assignment asynchronously for PPPM
+  void set_double_precompute(PPPM<numtyp,acctyp,
+                             double,_lgpu_double4> *pppm);
+
+  /// Estimate the overhead from GPU calls from multiple procs
+  /** \param kernel_calls Number of kernel calls/timestep for timing estimated
+    *                     overhead
+    * \param gpu_overhead Estimated gpu overhead per timestep (sec)
+    * \param driver_overhead Estimated overhead from driver per timestep (s) **/
+  void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
+                             double &gpu_driver_overhead);
+
+  /// Returns true if double precision is supported on card
+  inline bool double_precision() { return gpu->double_precision(); }
+
+  /// Output a message with timing information
+  void output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
+                    Neighbor &nbor, const double avg_split,
+                    const double max_bytes, const double gpu_overhead,
+                    const double driver_overhead,
+                    const int threads_per_atom, FILE *screen);
+
+  /// Output a message with timing information
+  void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
+                           UCL_Timer & time_map, UCL_Timer & time_rho,
+                           UCL_Timer &time_interp,
+                           Answer<numtyp,acctyp> &ans,
+                           const double max_bytes, const double cpu_time,
+                           const double cpu_idle_time, FILE *screen);
+
+  /// Clear all memory on host and device associated with atom and nbor data
+  void clear();
+
+  /// Clear all memory on host and device
+  void clear_device();
+
+  /// Add an answer object for putting forces, energies, etc. from GPU to LAMMPS
+  inline void add_ans_object(Answer<numtyp,acctyp> *ans)
+    { ans_queue.push(ans); }
+
+  /// Add "answers" (force,energies,etc.) into LAMMPS structures
+  inline double fix_gpu(double **f, double **tor, double *eatom,
+                        double **vatom, double *virial, double &ecoul) {
+    atom.data_unavail();
+    if (ans_queue.empty()==false) {
+      stop_host_timer();
+      double evdw=0.0;
+      while (ans_queue.empty()==false) {
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        ans_queue.pop();
+      }
+      return evdw;
+    }
+    return 0.0;
+  }
+
+  /// Start timer on host
+  inline void start_host_timer()
+    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }
+
+  /// Stop timer on host
+  inline void stop_host_timer() {
+    if (_host_timer_started) {
+      _cpu_full=MPI_Wtime()-_cpu_full;
+      _host_timer_started=false;
+    }
+  }
+
+  /// Return host time
+  inline double host_time() { return _cpu_full; }
+
+  /// Return host memory usage in bytes
+  double host_memory_usage() const;
+
+  /// Return the number of procs sharing a device (size of device communicator)
+  inline int procs_per_gpu() const { return _procs_per_gpu; }
+  /// Return the number of threads per proc
+  inline int num_threads() const { return _nthreads; }
+  /// My rank within all processes
+  inline int world_me() const { return _world_me; }
+  /// Total number of processes
+  inline int world_size() const { return _world_size; }
+  /// MPI Barrier for world
+  inline void world_barrier() { MPI_Barrier(_comm_world); }
+  /// Return the replica MPI communicator
+  inline MPI_Comm & replica() { return _comm_replica; }
+  /// My rank within replica communicator
+  inline int replica_me() const { return _replica_me; }
+  /// Number of procs in replica communicator
+  inline int replica_size() const { return _replica_size; }
+  /// Return the per-GPU MPI communicator
+  inline MPI_Comm & gpu_comm() { return _comm_gpu; }
+  /// Return my rank in the device communicator
+  inline int gpu_rank() const { return _gpu_rank; }
+  /// MPI Barrier for gpu
+  inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
+  /// Return the 'mode' for acceleration: GPU_FORCE, GPU_NEIGH or GPU_HYB_NEIGH
+  inline int gpu_mode() const { return _gpu_mode; }
+  /// Index of first device used by a node
+  inline int first_device() const { return _first_device; }
+  /// Index of last device used by a node
+  inline int last_device() const { return _last_device; }
+  /// Particle split defined in fix
+  inline double particle_split() const { return _particle_split; }
+  /// Return the initialization count for the device
+  inline int init_count() const { return _init_count; }
+  /// True if device is being timed
+  inline bool time_device() const { return _time_device; }
+
+  /// Return the number of threads accessing memory simultaneously
+  inline int num_mem_threads() const { return _num_mem_threads; }
+  /// Return the number of threads per atom for pair styles
+  inline int threads_per_atom() const { return _threads_per_atom; }
+  /// Return the number of threads per atom for pair styles using charge
+  inline int threads_per_charge() const { return _threads_per_charge; }
+  /// Return the min of the pair block size or the device max block size
+  inline int pair_block_size() const { return _block_pair; }
+  /// Return the maximum number of atom types that can be used with shared mem
+  inline int max_shared_types() const { return _max_shared_types; }
+  /// Return the maximum order for PPPM splines
+  inline int pppm_max_spline() const { return _pppm_max_spline; }
+  /// Return the block size for PPPM kernels
+  inline int pppm_block() const { return _pppm_block; }
+  /// Return the block size for neighbor binning
+  inline int block_cell_2d() const { return _block_cell_2d; }
+  /// Return the block size for atom mapping for neighbor builds
+  inline int block_cell_id() const { return _block_cell_id; }
+  /// Return the block size for neighbor build kernel
+  inline int block_nbor_build() const { return _block_nbor_build; }
+  /// Return the block size for "bio" pair styles
+  inline int block_bio_pair() const { return _block_bio_pair; }
+  /// Return the maximum number of atom types for shared mem with "bio" styles
+  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+  /// Architecture gpu code compiled for (returns 0 for OpenCL)
+  inline double ptx_arch() const { return _ptx_arch; }
+
+  // -------------------- SHARED DEVICE ROUTINES --------------------
+  // Perform asynchronous zero of integer array
+  void zero(UCL_D_Vec<int> &mem, const int numel) {
+    int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
+                                         _block_pair));
+    k_zero.set_size(num_blocks,_block_pair);
+    k_zero.run(&mem.begin(),&numel);
+  }
+
+  // -------------------------- DEVICE DATA -------------------------
+
+  /// Geryon Device
+  UCL_Device *gpu;
+
+  enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data
+  Atom<numtyp,acctyp> atom;
+
+  // --------------------------- NBOR DATA ----------------------------
+
+  /// Neighbor Data
+  NeighborShared _neighbor_shared;
+
+  // ------------------------ LONG RANGE DATA -------------------------
+
+  // Long Range Data
+  int _long_range_precompute;
+  PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
+  PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
+  /// Precomputations for long range charge assignment (asynchronously)
+  inline void precompute(const int ago, const int nlocal, const int nall,
+                         double **host_x, int *host_type, bool &success,
+                         double *charge, double 
*boxlo, double *prd) { + if (_long_range_precompute==1) + pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + else if (_long_range_precompute==2) + pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + } + + private: + std::queue<Answer<numtyp,acctyp> *> ans_queue; + int _init_count; + bool _device_init, _host_timer_started, _time_device; + MPI_Comm _comm_world, _comm_replica, _comm_gpu; + int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, + _replica_size; + int _gpu_mode, _first_device, _last_device, _nthreads; + double _particle_split; + double _cpu_full; + double _ptx_arch; + + int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; + int _pppm_max_spline, _pppm_block; + int _block_pair, _max_shared_types; + int _block_cell_2d, _block_cell_id, _block_nbor_build; + int _block_bio_pair, _max_bio_shared_types; + + UCL_Program *dev_program; + UCL_Kernel k_zero, k_info; + bool _compiled; + int compile_kernels(); + + int _data_in_estimate, _data_out_estimate; + + template <class t> + inline std::string toa(const t& in) { + std::ostringstream o; + o.precision(2); + o << in; + return o.str(); + } + +}; + +} + +#endif diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h new file mode 100644 index 000000000..48ca32d4f --- /dev/null +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -0,0 +1,539 @@ +// ************************************************************************** +// ellipsoid_extra.h +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for Ellipsoid math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifndef LAL_ELLIPSOID_EXTRA_H +#define LAL_ELLIPSOID_EXTRA_H + +enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +#endif + +#define atom_info(t_per_atom, ii, tid, offset) \ + tid=THREAD_ID_X; \ + offset=tid & (t_per_atom-1); \ + ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; + +#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, list_end, nbor) \ + nbor=nbor_mem+ii; \ + i=*nbor; \ + nbor+=nbor_stride; \ + numj=*nbor; \ + nbor+=nbor_stride; \ + list_end=nbor+fast_mul(nbor_stride,numj); \ + nbor+=fast_mul(offset,nbor_stride); \ + stride=fast_mul(t_per_atom,nbor_stride); + +#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ + eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[6][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<4; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + energy=red_acc[3][tid]; \ + if (vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + } \ + if 
(offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + +#define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[7][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=tor.x; \ + red_acc[4][tid]=tor.y; \ + red_acc[5][tid]=tor.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + tor.x=red_acc[3][tid]; \ + tor.y=red_acc[4][tid]; \ + tor.z=red_acc[5][tid]; \ + if (eflag>0 || vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + red_acc[6][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<7; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + energy=red_acc[6][tid]; \ + } \ + } \ + if (offset==0) { \ + __global acctyp *ap1=engv+ii; \ + if (eflag>0) { \ + *ap1=energy; \ + ap1+=astride; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *ap1=virial[i]; \ + ap1+=astride; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+astride]=tor; \ + } + +#define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ + eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[6][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<4; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + energy=red_acc[3][tid]; \ + if (vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv+=energy; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv+=virial[i]; \ + engv+=inum; \ + } \ + } \ + acctyp4 old=ans[ii]; \ + old.x+=f.x; \ + old.y+=f.y; \ + old.z+=f.z; \ + ans[ii]=old; \ + } + +/* ---------------------------------------------------------------------- + dot product of 2 vectors +------------------------------------------------------------------------- */ + +ucl_inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) +{ + return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]; +} + +/* ---------------------------------------------------------------------- + cross product of 2 vectors +------------------------------------------------------------------------- */ + +ucl_inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) +{ + ans[0] = v1[1]*v2[2]-v1[2]*v2[1]; + ans[1] = v1[2]*v2[0]-v1[0]*v2[2]; + ans[2] = v1[0]*v2[1]-v1[1]*v2[0]; +} + +/* ---------------------------------------------------------------------- + determinant of a matrix +------------------------------------------------------------------------- */ + +ucl_inline numtyp gpu_det3(const numtyp m[9]) +{ + numtyp ans = 
m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - + m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; + return ans; +} + +/* ---------------------------------------------------------------------- + diagonal matrix times a full matrix +------------------------------------------------------------------------- */ + +ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], + numtyp ans[9]) +{ + ans[0] = shape.x*m[0]; + ans[1] = shape.x*m[1]; + ans[2] = shape.x*m[2]; + ans[3] = shape.y*m[3]; + ans[4] = shape.y*m[4]; + ans[5] = shape.y*m[5]; + ans[6] = shape.z*m[6]; + ans[7] = shape.z*m[7]; + ans[8] = shape.z*m[8]; +} + +/* ---------------------------------------------------------------------- + add two matrices +------------------------------------------------------------------------- */ + +ucl_inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9]) +{ + ans[0] = m[0]+m2[0]; + ans[1] = m[1]+m2[1]; + ans[2] = m[2]+m2[2]; + ans[3] = m[3]+m2[3]; + ans[4] = m[4]+m2[4]; + ans[5] = m[5]+m2[5]; + ans[6] = m[6]+m2[6]; + ans[7] = m[7]+m2[7]; + ans[8] = m[8]+m2[8]; +} + +/* ---------------------------------------------------------------------- + multiply the transpose of mat1 times mat2 +------------------------------------------------------------------------- */ + +ucl_inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9], + numtyp ans[9]) +{ + ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6]; + ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7]; + ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8]; + ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6]; + ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7]; + ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8]; + ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6]; + ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7]; + ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8]; +} + +/* ---------------------------------------------------------------------- + row vector times matrix +------------------------------------------------------------------------- */ + +ucl_inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans) +{ + ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6]; + ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7]; + ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2]; +} + +/* ---------------------------------------------------------------------- + solve Ax = b or M ans = v + use gaussian elimination & partial pivoting on matrix + error_flag set to 2 if bad matrix inversion attempted +------------------------------------------------------------------------- */ + +ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, + __global int *error_flag) +{ + // create augmented matrix for pivoting + + numtyp aug[12], t; + + aug[3] = v[0]; + aug[0] = m[0]; + aug[1] = m[1]; + aug[2] = m[2]; + aug[7] = v[1]; + aug[4] = m[3]; + aug[5] = m[4]; + aug[6] = m[5]; + aug[11] = v[2]; + aug[8] = m[6]; + aug[9] = m[7]; + aug[10] = m[8]; + + if (ucl_abs(aug[4]) > ucl_abs(aug[0])) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; + swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; + swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; + swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; + } + if (ucl_abs(aug[8]) > ucl_abs(aug[0])) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; + swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; + swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; + swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; + } + + if (aug[0] != (numtyp)0.0) { + if (0!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt; + swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt; + swapt=aug[2]; 
aug[2]=aug[2]; aug[2]=swapt; + swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt; + } + } else if (aug[4] != (numtyp)0.0) { + if (1!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; + swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; + swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; + swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; + } + } else if (aug[8] != (numtyp)0.0) { + if (2!=0) { + numtyp swapt; + swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; + swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; + swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; + swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; + } + } else + *error_flag=2; + + t = aug[4]/aug[0]; + aug[5]-=t*aug[1]; + aug[6]-=t*aug[2]; + aug[7]-=t*aug[3]; + t = aug[8]/aug[0]; + aug[9]-=t*aug[1]; + aug[10]-=t*aug[2]; + aug[11]-=t*aug[3]; + + if (ucl_abs(aug[9]) > ucl_abs(aug[5])) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; + swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; + swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; + swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; + } + + if (aug[5] != (numtyp)0.0) { + if (1!=1) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt; + swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt; + swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt; + swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt; + } + } else if (aug[9] != (numtyp)0.0) { + if (2!=1) { + numtyp swapt; + swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; + swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; + swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; + swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; + } + } + + t = aug[9]/aug[5]; + aug[10]-=t*aug[6]; + aug[11]-=t*aug[7]; + + if (aug[10] == (numtyp)0.0) + *error_flag=2; + + ans[2] = aug[11]/aug[10]; + t = (numtyp)0.0; + t += aug[6]*ans[2]; + ans[1] = (aug[7]-t) / aug[5]; + t = (numtyp)0.0; + t += aug[1]*ans[1]; + t += aug[2]*ans[2]; + ans[0] = (aug[3]-t) / aug[0]; +} + +/* ---------------------------------------------------------------------- + compute rotation matrix from quaternion conjugate + quat = [w i j k] +------------------------------------------------------------------------- */ + +ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, + numtyp mat[9]) +{ + numtyp4 q=qif[qi]; + + numtyp w2 = q.x*q.x; + numtyp i2 = q.y*q.y; + numtyp j2 = q.z*q.z; + numtyp k2 = q.w*q.w; + numtyp twoij = (numtyp)2.0*q.y*q.z; + numtyp twoik = (numtyp)2.0*q.y*q.w; + numtyp twojk = (numtyp)2.0*q.z*q.w; + numtyp twoiw = (numtyp)2.0*q.y*q.x; + numtyp twojw = (numtyp)2.0*q.z*q.x; + numtyp twokw = (numtyp)2.0*q.w*q.x; + + mat[0] = w2+i2-j2-k2; + mat[3] = twoij-twokw; + mat[6] = twojw+twoik; + + mat[1] = twoij+twokw; + mat[4] = w2-i2+j2-k2; + mat[7] = twojk-twoiw; + + mat[2] = twoik-twojw; + mat[5] = twojk+twoiw; + mat[8] = w2-i2-j2+k2; +} + +/* ---------------------------------------------------------------------- + transposed matrix times diagonal matrix +------------------------------------------------------------------------- */ + +ucl_inline void gpu_transpose_times_diag3(const numtyp m[9], + const numtyp4 d, numtyp ans[9]) +{ + ans[0] = m[0]*d.x; + ans[1] = m[3]*d.y; + ans[2] = m[6]*d.z; + ans[3] = m[1]*d.x; + ans[4] = m[4]*d.y; + ans[5] = m[7]*d.z; + ans[6] = m[2]*d.x; + ans[7] = m[5]*d.y; + ans[8] = m[8]*d.z; +} + +/* ---------------------------------------------------------------------- + multiply mat1 times mat2 +------------------------------------------------------------------------- */ + +ucl_inline void gpu_times3(const numtyp m[9], const numtyp m2[9], + numtyp ans[9]) +{ + ans[0] = m[0]*m2[0] + m[1]*m2[3] + 
m[2]*m2[6]; + ans[1] = m[0]*m2[1] + m[1]*m2[4] + m[2]*m2[7]; + ans[2] = m[0]*m2[2] + m[1]*m2[5] + m[2]*m2[8]; + ans[3] = m[3]*m2[0] + m[4]*m2[3] + m[5]*m2[6]; + ans[4] = m[3]*m2[1] + m[4]*m2[4] + m[5]*m2[7]; + ans[5] = m[3]*m2[2] + m[4]*m2[5] + m[5]*m2[8]; + ans[6] = m[6]*m2[0] + m[7]*m2[3] + m[8]*m2[6]; + ans[7] = m[6]*m2[1] + m[7]*m2[4] + m[8]*m2[7]; + ans[8] = m[6]*m2[2] + m[7]*m2[5] + m[8]*m2[8]; +} + +/* ---------------------------------------------------------------------- + Apply principal rotation generator about x to rotation matrix m +------------------------------------------------------------------------- */ + +ucl_inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9]) +{ + ans[0] = 0; + ans[1] = -m[2]; + ans[2] = m[1]; + ans[3] = 0; + ans[4] = -m[5]; + ans[5] = m[4]; + ans[6] = 0; + ans[7] = -m[8]; + ans[8] = m[7]; +} + +/* ---------------------------------------------------------------------- + Apply principal rotation generator about y to rotation matrix m +------------------------------------------------------------------------- */ + +ucl_inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9]) +{ + ans[0] = m[2]; + ans[1] = 0; + ans[2] = -m[0]; + ans[3] = m[5]; + ans[4] = 0; + ans[5] = -m[3]; + ans[6] = m[8]; + ans[7] = 0; + ans[8] = -m[6]; +} + +/* ---------------------------------------------------------------------- + Apply principal rotation generator about z to rotation matrix m +------------------------------------------------------------------------- */ + +ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9]) +{ + ans[0] = -m[1]; + ans[1] = m[0]; + ans[2] = 0; + ans[3] = -m[4]; + ans[4] = m[3]; + ans[5] = 0; + ans[6] = -m[7]; + ans[7] = m[6]; + ans[8] = 0; +} + +/* ---------------------------------------------------------------------- + matrix times vector +------------------------------------------------------------------------- */ + +ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3], + numtyp ans[3]) +{ + ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2]; + ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2]; + ans[2] = m[6]*v[0] + m[7]*v[1] + m[8]*v[2]; +} + +#endif diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu new file mode 100644 index 000000000..669973a7e --- /dev/null +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -0,0 +1,135 @@ +// ************************************************************************** +// ellipsoid_nbor.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for Ellipsoid neighbor routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +#endif + +// --------------------------------------------------------------------------- +// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access +// -- Only unpack neighbors matching the specified inclusive range of forms +// -- Only unpack neighbors within cutoff +// --------------------------------------------------------------------------- +__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, + const int ntypes, __global int *dev_nbor, + const int nbor_pitch, const int start, const int inum, + __global int *dev_ij, const int form_low, + const int form_high) { + + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X+start; + + if (ii<inum) { + __global int *nbor=dev_ij+ii; + int i=*nbor; + nbor+=nbor_pitch; + int numj=*nbor; + nbor+=nbor_pitch; + __global int *list_end=nbor+fast_mul(numj,nbor_pitch); + __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch; + + numtyp4 ix=x_[i]; + int iw=ix.w; + int itype=fast_mul(iw,ntypes); + int newj=0; + for ( ; nbor<list_end; nbor+=nbor_pitch) { + int j=*nbor; + j &= NEIGHMASK; + numtyp4 jx=x_[j]; + int jtype=jx.w; + int mtype=itype+jtype; + numtyp2 cf=cut_form[mtype]; + if (cf.y>=form_low && cf.y<=form_high) { + // Compute r12; + numtyp rsq=jx.x-ix.x; + rsq*=rsq; + numtyp t=jx.y-ix.y; + rsq+=t*t; + t=jx.z-ix.z; + rsq+=t*t; + + if (rsq<cf.x) { + *packed=j; + packed+=nbor_pitch; + newj++; + } + } + } + dev_nbor[ii+nbor_pitch]=newj; + } +} + +// --------------------------------------------------------------------------- +// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access +// -- Only unpack neighbors matching the specified inclusive range of forms +// -- Only unpack neighbors within cutoff +// -- Fast version of routine that uses shared memory for LJ constants +// --------------------------------------------------------------------------- +__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form, + __global int *dev_nbor, const int nbor_pitch, + const int start, const int inum, + __global int *dev_ij, const int form_low, + const int form_high) { + + int ii=THREAD_ID_X; + __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + cutsq[ii]=cut_form[ii].x; + form[ii]=cut_form[ii].y; + } + ii+=fast_mul((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start; + __syncthreads(); + + if (ii<inum) { + __global int *nbor=dev_ij+ii; + int i=*nbor; + nbor+=nbor_pitch; + int numj=*nbor; + nbor+=nbor_pitch; + __global int *list_end=nbor+fast_mul(numj,nbor_pitch); + __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch; + + numtyp4 ix=x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + int newj=0; + for ( ; nbor<list_end; nbor+=nbor_pitch) { + int j=*nbor; + j &= NEIGHMASK; + numtyp4 jx=x_[j]; + int jtype=jx.w; + int mtype=itype+jtype; + + if (form[mtype]>=form_low && form[mtype]<=form_high) { + // Compute r12; + numtyp rsq=jx.x-ix.x; + rsq*=rsq; + numtyp t=jx.y-ix.y; + 
rsq+=t*t; + t=jx.z-ix.z; + rsq+=t*t; + + if (rsq<cutsq[mtype]) { + *packed=j; + packed+=nbor_pitch; + newj++; + } + } + } + dev_nbor[ii+nbor_pitch]=newj; + } +} diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp new file mode 100644 index 000000000..4e5c87022 --- /dev/null +++ b/lib/gpu/lal_gayberne.cpp @@ -0,0 +1,309 @@ +/*************************************************************************** + gayberne.cpp + ------------------- + W. Michael Brown (ORNL) + + Host code for Gay-Berne potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "gayberne_cl.h" +#include "gayberne_lj_cl.h" +#else +#include "gayberne_ptx.h" +#include "gayberne_lj_ptx.h" +#endif + +#include "lal_gayberne.h" +#include <cassert> +using namespace LAMMPS_AL; + +#define GayBerneT GayBerne<numtyp, acctyp> +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +GayBerneT::GayBerne() : BaseEllipsoid<numtyp,acctyp>(), + _allocated(false) { +} + +template <class numtyp, class acctyp> +GayBerneT::~GayBerne() { + clear(); +} + +template <class numtyp, class acctyp> +int GayBerneT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom(max_nbors); +} + +template <class numtyp, class acctyp> +int GayBerneT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ntypes,h_form,gayberne,gayberne_lj); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + _shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) { + lj_types=max_shared_types; + _shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for copying type data + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, + host_sigma,host_epsilon); + + this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, + host_cutsq,h_form); + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq,h_form); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + dev_error.alloc(1,*(this->ucl_device)); + 
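+  // dev_error holds a single device-side flag: gpu_mldivide3() sets it to 2
+  // when it meets a singular matrix, and clear() copies it back to the host to
+  // report "BAD MATRIX INVERSION IN FORCE COMPUTATION".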
dev_error.zero(); + + // Allocate, cast and asynchronous memcpy of constant data + // Copy data for bonded interactions + gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY); + host_write[0]=static_cast<numtyp>(gamma); + host_write[1]=static_cast<numtyp>(upsilon); + host_write[2]=static_cast<numtyp>(mu); + host_write[3]=static_cast<numtyp>(host_special_lj[0]); + host_write[4]=static_cast<numtyp>(host_special_lj[1]); + host_write[5]=static_cast<numtyp>(host_special_lj[2]); + host_write[6]=static_cast<numtyp>(host_special_lj[3]); + ucl_copy(gamma_upsilon_mu,host_write,7,false); + + lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + UCL_H_Vec<double> d_view; + d_view.view(host_lshape,lshape.numel(),*(this->ucl_device)); + ucl_copy(lshape,d_view,false); + + // Copy shape, well, sigma, epsilon, and cutsq onto GPU + // - cast if necessary + shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<ntypes; i++) { + host_write[i*4]=host_shape[i][0]; + host_write[i*4+1]=host_shape[i][1]; + host_write[i*4+2]=host_shape[i][2]; + } + UCL_H_Vec<numtyp4> view4; + view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + ucl_copy(shape,view4,false); + + well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<ntypes; i++) { + host_write[i*4]=host_well[i][0]; + host_write[i*4+1]=host_well[i][1]; + host_write[i*4+2]=host_well[i][2]; + } + view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); + ucl_copy(well,view4,false); + + _allocated=true; + this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ + lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+ + lshape.row_bytes()+shape.row_bytes()+well.row_bytes(); + + return 0; +} + +template <class numtyp, class acctyp> +void GayBerneT::clear() { + if (!_allocated) + return; + + UCL_H_Vec<int> err_flag(1,*(this->ucl_device)); + ucl_copy(err_flag,dev_error,false); + if (err_flag[0] == 2) + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + err_flag.clear(); + + _allocated=false; + + dev_error.clear(); + lj1.clear(); + lj3.clear(); + sigma_epsilon.clear(); + this->cut_form.clear(); + + shape.clear(); + well.clear(); + lshape.clear(); + gamma_upsilon_mu.clear(); + + this->clear_base(); +} + +template <class numtyp, class acctyp> +double GayBerneT::host_memory_usage() const { + return this->host_memory_usage_base()+sizeof(GayBerneT)+ + 4*sizeof(numtyp); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void GayBerneT::loop(const bool _eflag, const bool _vflag) { + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=0, NGX; + int stride=this->nbor->nbor_pitch(); + int ainum=this->ans->inum(); + + if (this->_multiple_forms) { + this->time_nbor1.start(); + if (this->_last_ellipse>0) { + // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- + GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + + this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + 
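+      // Launch geometry for the ellipse-ellipse kernel: GX blocks of BX
+      // threads, with _threads_per_atom threads cooperating on one atom, so
+      // each block covers BX/_threads_per_atom atoms.  Illustrative numbers
+      // (not taken from the source): BX=128 and _threads_per_atom=4 give 32
+      // atoms per block, so _last_ellipse=10000 needs GX=ceil(10000/32)=313.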
this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), + &stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, + &this->_threads_per_atom); + this->time_ellipsoid.stop(); + + if (this->_last_ellipse==this->ans->inum()) { + this->time_nbor2.start(); + this->time_nbor2.stop(); + this->time_ellipsoid2.start(); + this->time_ellipsoid2.stop(); + this->time_lj.start(); + this->time_lj.stop(); + return; + } + + // ------------ SPHERE_ELLIPSE --------------- + + this->time_nbor2.start(); + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()- + this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor2.stop(); + + this->time_ellipsoid2.start(); + this->k_sphere_ellipsoid.set_size(GX,BX); + this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), + &this->well.begin(), &this->gamma_upsilon_mu.begin(), + &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), + &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + this->time_ellipsoid2.stop(); + } else { + this->ans->dev_ans.zero(); + this->ans->dev_engv.zero(); + this->time_nbor1.stop(); + this->time_ellipsoid.start(); + this->time_ellipsoid.stop(); + this->time_nbor2.start(); + this->time_nbor2.stop(); + this->time_ellipsoid2.start(); + this->time_ellipsoid2.stop(); + } + + // ------------ LJ --------------- + this->time_lj.start(); + if (this->_last_ellipse<this->ans->inum()) { + if (this->_shared_types) { + this->k_lj_fast.set_size(GX,BX); + this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride, + &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), + &eflag, &vflag, &this->_last_ellipse, &ainum, + &this->_threads_per_atom); + } else { + this->k_lj.set_size(GX,BX); + this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(), + &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + } + } + this->time_lj.stop(); + } else { + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); + this->time_nbor1.start(); + this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, 
&this->lshape.begin(), &this->nbor->dev_nbor.begin(), + &stride, &this->ans->dev_ans.begin(), &ainum, + &this->ans->dev_engv.begin(), &this->dev_error.begin(), + &eflag, &vflag, &ainum, &this->_threads_per_atom); + this->time_ellipsoid.stop(); + } +} + +template class GayBerne<PRECISION,ACC_PRECISION>; + diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu new file mode 100644 index 000000000..e2bfe4b1b --- /dev/null +++ b/lib/gpu/lal_gayberne.cu @@ -0,0 +1,356 @@ +// ************************************************************************** +// gayberne.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for Gay-Berne potential acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_ellipsoid_extra.h" +#endif + +ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, + numtyp ans[9]) +{ + numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- + m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- + m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; + den = ucl_recip(den); + + ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- + m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ + m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- + m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ + m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; + + ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ + (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- + (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- + m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ + m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; + + ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- + m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- + m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ + (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ + m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; + + ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ + m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ + m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- + m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- + m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; + + ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ + (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- + (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ + m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- + m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; + + ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- + m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ + (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ + m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- + (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; + + ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ + (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ + m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- + m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- + m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; + + ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- + (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ + (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- + m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ + m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; + + ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- + m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- + m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ + (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ + m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; +} + +__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *gum, __global numtyp2* sig_eps, + const int 
ntypes, __global numtyp *lshape, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *nbor_end; + int i, numj, n_stride; + nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix=x_[i]; + int itype=ix.w; + numtyp a1[9], b1[9], g1[9]; + numtyp4 ishape=shape[itype]; + { + numtyp t[9]; + gpu_quat_to_mat_trans(q,i,a1); + gpu_diag_times3(ishape,a1,t); + gpu_transpose_times3(a1,t,g1); + gpu_diag_times3(well[itype],a1,t); + gpu_transpose_times3(a1,t,b1); + } + + numtyp factor_lj; + for ( ; nbor<nbor_end; nbor+=n_stride) { + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp r12[3]; + r12[0] = jx.x-ix.x; + r12[1] = jx.y-ix.y; + r12[2] = jx.z-ix.z; + numtyp ir = gpu_dot3(r12,r12); + + ir = ucl_rsqrt(ir); + numtyp r = ucl_recip(ir); + + numtyp a2[9]; + gpu_quat_to_mat_trans(q,j,a2); + + numtyp u_r, dUr[3], tUr[3], eta, teta[3]; + { // Compute U_r, dUr, eta, and teta + // Compute g12 + numtyp g12[9]; + { + numtyp g2[9]; + { + gpu_diag_times3(shape[jtype],a2,g12); + gpu_transpose_times3(a2,g12,g2); + gpu_plus3(g1,g2,g12); + } + + { // Compute U_r and dUr + + // Compute kappa + numtyp kappa[3]; + gpu_mldivide3(g12,r12,kappa,err_flag); + + // -- replace r12 with r12 hat + r12[0]*=ir; + r12[1]*=ir; + r12[2]*=ir; + + // -- kappa is now / r + kappa[0]*=ir; + kappa[1]*=ir; + kappa[2]*=ir; + + // energy + + // compute u_r and dUr + numtyp uslj_rsq; + { + // Compute distance of closest approach + numtyp h12, sigma12; + sigma12 = gpu_dot3(r12,kappa); + sigma12 = ucl_rsqrt((numtyp)0.5*sigma12); + h12 = r-sigma12; + + // -- kappa is now ok + kappa[0]*=r; + kappa[1]*=r; + kappa[2]*=r; + + int mtype=fast_mul(ntypes,itype)+jtype; + numtyp sigma = sig_eps[mtype].x; + numtyp epsilon = sig_eps[mtype].y; + numtyp varrho = sigma/(h12+gum[0]*sigma); + numtyp varrho6 = varrho*varrho*varrho; + varrho6*=varrho6; + numtyp varrho12 = varrho6*varrho6; + u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); + + numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; + temp1 = temp1*(numtyp)24.0*epsilon; + uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; + numtyp temp2 = gpu_dot3(kappa,r12); + uslj_rsq = uslj_rsq*ir*ir; + + dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); + dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); + dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); + } + + // torque for particle 1 + { + numtyp tempv[3], tempv2[3]; + tempv[0] = -uslj_rsq*kappa[0]; + tempv[1] = -uslj_rsq*kappa[1]; + tempv[2] = -uslj_rsq*kappa[2]; + gpu_row_times3(kappa,g1,tempv2); + gpu_cross3(tempv,tempv2,tUr); + } + } + } + + // Compute eta + { + eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; + numtyp det_g12 = gpu_det3(g12); + eta = ucl_powr(eta/det_g12,gum[1]); + } + + // Compute teta + numtyp temp[9], tempv[3], tempv2[3]; + compute_eta_torque(g12,a1,ishape,temp); + numtyp temp1 = -eta*gum[1]; 
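+        // The three blocks below accumulate the eta contribution to the
+        // torque: for each row k of a1, teta += a1_k x (-eta*upsilon*temp_k),
+        // where temp comes from compute_eta_torque() above and gum[1] is
+        // upsilon (see the gamma_upsilon_mu layout on the host side).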
+ + tempv[0] = temp1*temp[0]; + tempv[1] = temp1*temp[1]; + tempv[2] = temp1*temp[2]; + gpu_cross3(a1,tempv,tempv2); + teta[0] = tempv2[0]; + teta[1] = tempv2[1]; + teta[2] = tempv2[2]; + + tempv[0] = temp1*temp[3]; + tempv[1] = temp1*temp[4]; + tempv[2] = temp1*temp[5]; + gpu_cross3(a1+3,tempv,tempv2); + teta[0] += tempv2[0]; + teta[1] += tempv2[1]; + teta[2] += tempv2[2]; + + tempv[0] = temp1*temp[6]; + tempv[1] = temp1*temp[7]; + tempv[2] = temp1*temp[8]; + gpu_cross3(a1+6,tempv,tempv2); + teta[0] += tempv2[0]; + teta[1] += tempv2[1]; + teta[2] += tempv2[2]; + } + + numtyp chi, dchi[3], tchi[3]; + { // Compute chi and dchi + + // Compute b12 + numtyp b2[9], b12[9]; + { + gpu_diag_times3(well[jtype],a2,b12); + gpu_transpose_times3(a2,b12,b2); + gpu_plus3(b1,b2,b12); + } + + // compute chi_12 + r12[0]*=r; + r12[1]*=r; + r12[2]*=r; + numtyp iota[3]; + gpu_mldivide3(b12,r12,iota,err_flag); + // -- iota is now iota/r + iota[0]*=ir; + iota[1]*=ir; + iota[2]*=ir; + r12[0]*=ir; + r12[1]*=ir; + r12[2]*=ir; + chi = gpu_dot3(r12,iota); + chi = ucl_powr(chi*(numtyp)2.0,gum[2]); + + // -- iota is now ok + iota[0]*=r; + iota[1]*=r; + iota[2]*=r; + + numtyp temp1 = gpu_dot3(iota,r12); + numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*ucl_powr(chi,(gum[2]-(numtyp)1.0)/ + gum[2]); + dchi[0] = temp2*(iota[0]-temp1*r12[0]); + dchi[1] = temp2*(iota[1]-temp1*r12[1]); + dchi[2] = temp2*(iota[2]-temp1*r12[2]); + + // compute t_chi + numtyp tempv[3]; + gpu_row_times3(iota,b1,tempv); + gpu_cross3(tempv,iota,tchi); + temp1 = (numtyp)-4.0*ir*ir; + tchi[0] *= temp1; + tchi[1] *= temp1; + tchi[2] *= temp1; + } + + numtyp temp2 = factor_lj*eta*chi; + if (eflag>0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h new file mode 100644 index 000000000..dacaf7428 --- /dev/null +++ b/lib/gpu/lal_gayberne.h @@ -0,0 +1,94 @@ +/*************************************************************************** + gayberne.h + ------------------- + W. 
Michael Brown (ORNL) + + Host code for Gay-Berne potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_GAYBERNE_H +#define LAL_GAYBERNE_H + +#include "lal_base_ellipsoid.h" +#include "mpi.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class GayBerne : public BaseEllipsoid<numtyp, acctyp> { + public: + GayBerne(); + ~GayBerne(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \return false if there is not sufficient memory or device init prob + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **host_shape, + double **host_well, double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Device Error Flag - Set if a bad matrix inversion occurs + UCL_D_Vec<int> dev_error; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon + UCL_D_Vec<numtyp2> sigma_epsilon; + // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... + UCL_D_Vec<numtyp> gamma_upsilon_mu; + + /// If atom type constants fit in shared memory, use fast kernels + bool _shared_types; + int _lj_types; + + // --------------------------- ATOM DATA -------------------------- + + /// Aspherical Const Data for Atoms + UCL_D_Vec<numtyp4> shape, well; + /// Aspherical Const Data for Atoms + UCL_D_Vec<numtyp> lshape; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp new file mode 100644 index 000000000..047250363 --- /dev/null +++ b/lib/gpu/lal_gayberne_ext.cpp @@ -0,0 +1,141 @@ +/*************************************************************************** + gayberne_ext.cpp + ------------------- + W. 
Michael Brown + + LAMMPS Wrappers for Gay-Berne Acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_gayberne.h" + +using namespace std; +using namespace LAMMPS_AL; + +static GayBerne<PRECISION,ACC_PRECISION> GBMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int gb_gpu_init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + GBMF.clear(); + gpu_mode=GBMF.device->gpu_mode(); + double gpu_split=GBMF.device->particle_split(); + int first_gpu=GBMF.device->first_device(); + int last_gpu=GBMF.device->last_device(); + int world_me=GBMF.device->world_me(); + int gpu_rank=GBMF.device->gpu_rank(); + int procs_per_gpu=GBMF.device->procs_per_gpu(); + + GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu); + + bool message=false; + if (GBMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, + screen); + + GBMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, + epsilon, host_lshape, form, host_lj1, host_lj2, + host_lj3, host_lj4, offset, special_lj, inum, nall, + max_nbors, maxspecial, cell_size, gpu_split, screen); + + GBMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + GBMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Clear memory on host and device +// --------------------------------------------------------------------------- +void gb_gpu_clear() { + GBMF.clear(); +} + + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **host_quat); 
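+// ---------------------------------------------------------------------------
+// Two compute entry points (behavior inferred from the argument lists, not
+// stated here): gb_gpu_compute_n is the path where neighboring is done on the
+// device -- it takes the subdomain bounds, tags and special-bond tables and
+// hands back the device-built ilist/jnum -- while gb_gpu_compute reuses a
+// neighbor list already built on the host (ilist/numj/firstneigh).
+// ---------------------------------------------------------------------------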
+ +int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double **host_quat) { + return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, + host_start, ilist, jnum, cpu_time, success, host_quat); +} + +int * gb_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat) { + return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist, + numj, firstneigh, eflag, vflag, eatom, vatom, host_start, + cpu_time, success, host_quat); +} + +// --------------------------------------------------------------------------- +// Return memory usage +// --------------------------------------------------------------------------- +double gb_gpu_bytes() { + return GBMF.host_memory_usage(); +} + diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu new file mode 100644 index 000000000..bf294e1bb --- /dev/null +++ b/lib/gpu/lal_gayberne_lj.cu @@ -0,0 +1,408 @@ +// ************************************************************************** +// gayberne_lj.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for Gay-Berne - Lennard-Jones potential acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_ellipsoid_extra.h" +#endif + +__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, + __global numtyp4* shape,__global numtyp4* well, + __global numtyp *gum, __global numtyp2* sig_eps, + const int ntypes, __global numtyp *lshape, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag,const int start, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + ii+=start; + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *nbor_end; + int i, numj, n_stride; + nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix=x_[i]; + int itype=ix.w; + + numtyp oner=shape[itype].x; + numtyp one_well=well[itype].x; + + numtyp factor_lj; + for ( ; nbor<nbor_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp r12[3]; + r12[0] = jx.x-ix.x; + r12[1] = jx.y-ix.y; + r12[2] = jx.z-ix.z; + numtyp ir = gpu_dot3(r12,r12); + + ir = ucl_rsqrt(ir); + numtyp r = ucl_recip(ir); + + numtyp r12hat[3]; + 
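+      // r12hat below is the unit separation vector; r12 itself is left
+      // unscaled in this kernel (it is only negated later when the virial
+      // terms are accumulated).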
r12hat[0]=r12[0]*ir; + r12hat[1]=r12[1]*ir; + r12hat[2]=r12[2]*ir; + + numtyp a2[9]; + gpu_quat_to_mat_trans(q,j,a2); + + numtyp u_r, dUr[3], eta; + { // Compute U_r, dUr, eta, and teta + // Compute g12 + numtyp g12[9]; + { + { + numtyp g2[9]; + gpu_diag_times3(shape[jtype],a2,g12); + gpu_transpose_times3(a2,g12,g2); + g12[0]=g2[0]+oner; + g12[4]=g2[4]+oner; + g12[8]=g2[8]+oner; + g12[1]=g2[1]; + g12[2]=g2[2]; + g12[3]=g2[3]; + g12[5]=g2[5]; + g12[6]=g2[6]; + g12[7]=g2[7]; + } + + { // Compute U_r and dUr + + // Compute kappa + numtyp kappa[3]; + gpu_mldivide3(g12,r12,kappa,err_flag); + + // -- kappa is now / r + kappa[0]*=ir; + kappa[1]*=ir; + kappa[2]*=ir; + + // energy + + // compute u_r and dUr + numtyp uslj_rsq; + { + // Compute distance of closest approach + numtyp h12, sigma12; + sigma12 = gpu_dot3(r12hat,kappa); + sigma12 = ucl_rsqrt((numtyp)0.5*sigma12); + h12 = r-sigma12; + + // -- kappa is now ok + kappa[0]*=r; + kappa[1]*=r; + kappa[2]*=r; + + int mtype=fast_mul(ntypes,itype)+jtype; + numtyp sigma = sig_eps[mtype].x; + numtyp epsilon = sig_eps[mtype].y; + numtyp varrho = sigma/(h12+gum[0]*sigma); + numtyp varrho6 = varrho*varrho*varrho; + varrho6*=varrho6; + numtyp varrho12 = varrho6*varrho6; + u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); + + numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; + temp1 = temp1*(numtyp)24.0*epsilon; + uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; + numtyp temp2 = gpu_dot3(kappa,r12hat); + uslj_rsq = uslj_rsq*ir*ir; + + dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); + dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); + dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); + } + } + } + + // Compute eta + { + eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; + numtyp det_g12 = gpu_det3(g12); + eta = ucl_powr(eta/det_g12,gum[1]); + } + } + + numtyp chi, dchi[3]; + { // Compute chi and dchi + + // Compute b12 + numtyp b12[9]; + { + numtyp b2[9]; + gpu_diag_times3(well[jtype],a2,b12); + gpu_transpose_times3(a2,b12,b2); + b12[0]=b2[0]+one_well; + b12[4]=b2[4]+one_well; + b12[8]=b2[8]+one_well; + b12[1]=b2[1]; + b12[2]=b2[2]; + b12[3]=b2[3]; + b12[5]=b2[5]; + b12[6]=b2[6]; + b12[7]=b2[7]; + } + + // compute chi_12 + numtyp iota[3]; + gpu_mldivide3(b12,r12,iota,err_flag); + // -- iota is now iota/r + iota[0]*=ir; + iota[1]*=ir; + iota[2]*=ir; + chi = gpu_dot3(r12hat,iota); + chi = ucl_powr(chi*(numtyp)2.0,gum[2]); + + // -- iota is now ok + iota[0]*=r; + iota[1]*=r; + iota[2]*=r; + + numtyp temp1 = gpu_dot3(iota,r12hat); + numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*ucl_powr(chi,(gum[2]-(numtyp)1.0)/ + gum[2]); + dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); + dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); + dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); + } + + numtyp temp2 = factor_lj*eta*chi; + if (eflag>0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-1; + r12[1]*=-1; + r12[2]*=-1; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + 
__global numtyp4* lj3, const int lj_types, + __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + ii+=start; + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=x_[i]; + int itype=ix.w; + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + int ii=itype*lj_types+jtype; + if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y); + force*=factor_lj; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); + energy+=factor_lj*(e-lj3[ii].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + ii+=start; + + __local numtyp sp_lj[4]; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (tid<4) + sp_lj[tid]=gum[tid+3]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + + f.x+=delx*force; + 
f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp new file mode 100644 index 000000000..a90e96f17 --- /dev/null +++ b/lib/gpu/lal_lj.cpp @@ -0,0 +1,154 @@ +/*************************************************************************** + lj.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_cl.h" +#else +#include "lj_ptx.h" +#endif + +#include "lal_lj.h" +#include <cassert> +using namespace LAMMPS_AL; +#define LJT LJ<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +LJT::LJ() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { +} + +template <class numtyp, class acctyp> +LJT::~LJ() { + clear(); +} + +template <class numtyp, class acctyp> +int LJT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int LJT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + UCL_H_Vec<double> dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void LJT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + 
this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double LJT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJ<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void LJT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJ<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu new file mode 100644 index 000000000..12e2a487e --- /dev/null +++ b/lib/gpu/lal_lj.cu @@ -0,0 +1,188 @@ +// ************************************************************************** +// lj.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for acceleration of the lj/cut pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int itype=ix.w; + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (r2inv<lj1[mtype].z) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + force*=factor_lj; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int 
*nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<lj1[mtype].z) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h new file mode 100644 index 000000000..9555a3c88 --- /dev/null +++ b/lib/gpu/lal_lj.h @@ -0,0 +1,79 @@ +/*************************************************************************** + lj.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_H +#define LAL_LJ_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJ : public BaseAtomic<numtyp, acctyp> { + public: + LJ(); + ~LJ(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in 
shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp new file mode 100644 index 000000000..6331574b8 --- /dev/null +++ b/lib/gpu/lal_lj96.cpp @@ -0,0 +1,154 @@ +/*************************************************************************** + lj96.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj96/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj96_cl.h" +#else +#include "lj96_ptx.h" +#endif + +#include "lal_lj96.h" +#include <cassert> +using namespace LAMMPS_AL; +#define LJ96T LJ96<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +LJ96T::LJ96() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { +} + +template <class numtyp, class acctyp> +LJ96T::~LJ96() { + clear(); +} + +template <class numtyp, class acctyp> +int LJ96T::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int LJ96T::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj96); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + UCL_H_Vec<double> dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void LJ96T::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double LJ96T::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJ96<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- 
+// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void LJ96T::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJ96<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu new file mode 100644 index 000000000..c5ea89a74 --- /dev/null +++ b/lib/gpu/lal_lj96.cu @@ -0,0 +1,190 @@ +// ************************************************************************** +// lj96.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for acceleration of the lj96/cut style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int itype=ix.w; + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; 
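+      // Cutoff test and force evaluation for the 9-6 form: lj1[mtype] packs
+      // {lj1, lj2, cutsq} (see lal_lj96.h), so the squared distance is first
+      // compared against lj1[mtype].z.  Note that r2inv holds r^2 until the
+      // test passes and is only then inverted in place; the force prefactor
+      // r2inv*r6inv*(lj1*r3inv - lj2) reuses r3inv = sqrt(r6inv), so no extra
+      // square root of the distance is required.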
+ numtyp r2inv = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (r2inv<lj1[mtype].z) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp r3inv = ucl_sqrt(r6inv); + numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y); + force*=factor_lj; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<lj1[mtype].z) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp r3inv = ucl_sqrt(r6inv); + numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y); + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h new file mode 100644 index 000000000..7d51e287d --- /dev/null +++ b/lib/gpu/lal_lj96.h @@ -0,0 +1,79 @@ +/*************************************************************************** + lj96.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj96/cut pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ96_H +#define LAL_LJ96_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJ96 : public BaseAtomic<numtyp, acctyp> { + public: + LJ96(); + ~LJ96(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp new file mode 100644 index 000000000..4fe188057 --- /dev/null +++ b/lib/gpu/lal_lj96_ext.cpp @@ -0,0 +1,120 @@ +/*************************************************************************** + lj96_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to lj96/cut acceleration routines. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_lj96.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJ96<PRECISION,ACC_PRECISION> LJ96MF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + LJ96MF.clear(); + gpu_mode=LJ96MF.device->gpu_mode(); + double gpu_split=LJ96MF.device->particle_split(); + int first_gpu=LJ96MF.device->first_device(); + int last_gpu=LJ96MF.device->last_device(); + int world_me=LJ96MF.device->world_me(); + int gpu_rank=LJ96MF.device->gpu_rank(); + int procs_per_gpu=LJ96MF.device->procs_per_gpu(); + + LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu); + + bool message=false; + if (LJ96MF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + LJ96MF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + + LJ96MF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJ96MF.estimate_gpu_overhead(); + return init_ok; +} + +void lj96_gpu_clear() { + LJ96MF.clear(); +} + +int** lj96_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void lj96_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + 
eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double lj96_gpu_bytes() { + return LJ96MF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp new file mode 100644 index 000000000..d5d67e8d3 --- /dev/null +++ b/lib/gpu/lal_lj_class2_long.cpp @@ -0,0 +1,168 @@ +/*************************************************************************** + lj_class2_long.cpp + ------------------- + W. Michael Brown + + Host code for COMPASS LJ long potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Mon May 16 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_class2_long_cl.h" +#else +#include "lj_class2_long_ptx.h" +#endif + +#include "lal_lj_class2_long.h" +#include <cassert> +using namespace LAMMPS_AL; + +#define LJClass2LongT LJClass2Long<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +LJClass2LongT::LJClass2Long() : BaseCharge<numtyp,acctyp>(), + _allocated(false) { +} + +template <class numtyp, class acctyp> +LJClass2LongT::~LJClass2Long() { + clear(); +} + +template <class numtyp, class acctyp> +int LJClass2LongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int LJClass2LongT::init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_class2_long); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void LJClass2LongT::clear() { + if (!_allocated) + 
return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double LJClass2LongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJClass2Long<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJClass2Long<PRECISION,ACC_PRECISION>; + diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu new file mode 100644 index 000000000..aabdbb9c2 --- /dev/null +++ b/lib/gpu/lal_lj_class2_long.cu @@ -0,0 +1,252 @@ +// ************************************************************************** +// lj_class2_long.cu +// ------------------- +// W. 
Michael Brown +// +// Device code for COMPASS LJ long acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : Mon May 16 2011 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +texture<float> q_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +ucl_inline float fetch_q(const int& i, const float *q) + { return tex1Dfetch(q_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int itype=ix.w; + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (rsq<lj1[mtype].z) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc; + + if (rsq < lj1[mtype].w) { + numtyp rinv=ucl_rsqrt(rsq); + r3inv=r2inv*rinv; + r6inv = r3inv*r3inv; + force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y); + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += 
delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<lj1[mtype].z) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc; + + if (rsq < lj1[mtype].w) { + numtyp rinv=ucl_rsqrt(rsq); + r3inv=r2inv*rinv; + r6inv = r3inv*r3inv; + force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y); + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_class2_long.h b/lib/gpu/lal_lj_class2_long.h new file mode 100644 index 000000000..9dd151f63 --- /dev/null +++ b/lib/gpu/lal_lj_class2_long.h @@ -0,0 +1,84 @@ +/*************************************************************************** + 
lj_class2_long.h + ------------------- + W. Michael Brown + + Host code for COMPASS LJ long potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Mon May 16 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LJ_CLASS2_LONG_H +#define LJ_CLASS2_LONG_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJClass2Long : public BaseCharge<numtyp, acctyp> { + public: + LJClass2Long(); + ~LJClass2Long(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif + diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp new file mode 100644 index 000000000..7e637d4c9 --- /dev/null +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + lj_class2_long_ext.cpp + ------------------- + W. 
Michael Brown + + LAMMPS Wrappers for COMMPASS LJ long Acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Mon May 16 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_lj_class2_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJClass2Long<PRECISION,ACC_PRECISION> C2CLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + C2CLMF.clear(); + gpu_mode=C2CLMF.device->gpu_mode(); + double gpu_split=C2CLMF.device->particle_split(); + int first_gpu=C2CLMF.device->first_device(); + int last_gpu=C2CLMF.device->last_device(); + int world_me=C2CLMF.device->world_me(); + int gpu_rank=C2CLMF.device->gpu_rank(); + int procs_per_gpu=C2CLMF.device->procs_per_gpu(); + + C2CLMF.device->init_message(screen,"lj/class2/coul/long",first_gpu,last_gpu); + + bool message=false; + if (C2CLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + + C2CLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + + C2CLMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + C2CLMF.estimate_gpu_overhead(); + return init_ok; +} + +void c2cl_gpu_clear() { + C2CLMF.clear(); +} + +int** c2cl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return C2CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + 
vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void c2cl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + C2CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double c2cl_gpu_bytes() { + return C2CLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp new file mode 100644 index 000000000..c649e89e1 --- /dev/null +++ b/lib/gpu/lal_lj_coul.cpp @@ -0,0 +1,169 @@ +/*************************************************************************** + lj_coul.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_coul_cl.h" +#else +#include "lj_coul_ptx.h" +#endif + +#include "lal_lj_coul.h" +#include <cassert> +using namespace LAMMPS_AL; +#define LJCoulT LJCoul<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +LJCoulT::LJCoul() : BaseCharge<numtyp,acctyp>(), + _allocated(false) { +} + +template <class numtyp, class acctyp> +LJCoulT::~LJCoul() { + clear(); +} + +template <class numtyp, class acctyp> +int LJCoulT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int LJCoulT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + 
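+  // Per type-pair constants are flattened into lj_types*lj_types entries and
+  // indexed in the kernels as mtype = itype*lj_types + jtype.  Here lj1 packs
+  // {lj1, lj2, cut_ljsq, cut_coulsq} and lj3 packs {lj3, lj4, offset} (see
+  // lal_lj_coul.h); the separate cutsq table allocated below drives the outer
+  // distance test, while lj1.z and lj1.w gate the LJ and Coulomb terms.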
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void LJCoulT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double LJCoulT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoul<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void LJCoulT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, + &this->atom->dev_q.begin(), &cutsq.begin(), + &_qqrd2e, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), + &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJCoul<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu new file mode 100644 index 000000000..221e5cdc8 --- /dev/null +++ b/lib/gpu/lal_lj_coul.cu @@ -0,0 +1,236 @@ +// ************************************************************************** +// lj_coul.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for acceleration of the lj/coul/cut pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +texture<float> q_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +ucl_inline float fetch_q(const int& i, const float *q) + { return tex1Dfetch(q_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int itype=ix.w; + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (rsq<cutsq[mtype]) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, r6inv; + + if (rsq < lj1[mtype].z) { + r6inv = r2inv*r2inv*r2inv; + force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + } else + force_lj = (numtyp)0.0; + + if (rsq < lj1[mtype].w) + forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul; + else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + e_coul += forcecoul; + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global 
acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp *_cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + cutsq[tid]=_cutsq[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<cutsq[mtype]) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, r6inv; + + if (rsq < lj1[mtype].z) { + r6inv = r2inv*r2inv*r2inv; + force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + } else + force_lj = (numtyp)0.0; + + if (rsq < lj1[mtype].w) + forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul; + else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + e_coul += forcecoul; + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_coul.h b/lib/gpu/lal_lj_coul.h new file mode 100644 index 000000000..abea5a2d5 --- /dev/null +++ b/lib/gpu/lal_lj_coul.h @@ -0,0 +1,85 @@ +/*************************************************************************** + lj_coul.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/cut pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_COUL_H +#define LAL_LJ_COUL_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJCoul : public BaseCharge<numtyp, acctyp> { + public: + LJCoul(); + ~LJCoul(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// cutsq + UCL_D_Vec<numtyp> cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp new file mode 100644 index 000000000..b0dec6f07 --- /dev/null +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -0,0 +1,128 @@ +/*************************************************************************** + lj_coul_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to lj/cut/coul acceleration routines. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_lj_coul.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoul<PRECISION,ACC_PRECISION> LJCMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + LJCMF.clear(); + gpu_mode=LJCMF.device->gpu_mode(); + double gpu_split=LJCMF.device->particle_split(); + int first_gpu=LJCMF.device->first_device(); + int last_gpu=LJCMF.device->last_device(); + int world_me=LJCMF.device->world_me(); + int gpu_rank=LJCMF.device->gpu_rank(); + int procs_per_gpu=LJCMF.device->procs_per_gpu(); + + LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); + + bool message=false; + if (LJCMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + LJCMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + LJCMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljc_gpu_clear() { + LJCMF.clear(); +} + +int** ljc_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljc_gpu_compute(const int ago, const int inum_full, const int nall, + double 
**host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double ljc_gpu_bytes() { + return LJCMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp new file mode 100644 index 000000000..bee116a8d --- /dev/null +++ b/lib/gpu/lal_lj_coul_long.cpp @@ -0,0 +1,167 @@ +/*************************************************************************** + lj_coul_long.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_coul_long_cl.h" +#else +#include "lj_coul_long_ptx.h" +#endif + +#include "lal_lj_coul_long.h" +#include <cassert> +using namespace LAMMPS_AL; +#define LJCoulLongT LJCoulLong<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +LJCoulLongT::LJCoulLong() : BaseCharge<numtyp,acctyp>(), + _allocated(false) { +} + +template <class numtyp, class acctyp> +LJCoulLongT::~LJCoulLong() { + clear(); +} + +template <class numtyp, class acctyp> +int LJCoulLongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int LJCoulLongT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul_long); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + 
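+    // slots 0-3 hold the special_lj factors; the next line fills slots 4-7
+    // with special_coul, matching the sp_lj[8] layout the kernels copy into
+    // local memory.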
host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void LJCoulLongT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double LJCoulLongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoulLong<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJCoulLong<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu new file mode 100644 index 000000000..686186a4e --- /dev/null +++ b/lib/gpu/lal_lj_coul_long.cu @@ -0,0 +1,248 @@ +// ************************************************************************** +// lj_coul_long.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for acceleration of the lj/cut/coul/long pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +texture<float> q_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +ucl_inline float fetch_q(const int& i, const float *q) + { return tex1Dfetch(q_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int itype=ix.w; + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (rsq<lj1[mtype].z) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc; + + if (rsq < lj1[mtype].w) { + r6inv = r2inv*r2inv*r2inv; + force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += 
dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_lj, factor_coul; + factor_lj = sp_lj[sbmask(j)]; + factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<lj1[mtype].z) { + numtyp r2inv=ucl_recip(rsq); + numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc; + + if (rsq < lj1[mtype].w) { + r6inv = r2inv*r2inv*r2inv; + force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < lj1[mtype].w) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h new file mode 100644 index 000000000..4085f8eae --- /dev/null +++ b/lib/gpu/lal_lj_coul_long.h @@ -0,0 +1,83 @@ +/*************************************************************************** + lj_coul_long.h + ------------------- + W. 
Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_COUL_LONG_H +#define LAL_LJ_COUL_LONG_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJCoulLong : public BaseCharge<numtyp, acctyp> { + public: + LJCoulLong(); + ~LJCoulLong(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp new file mode 100644 index 000000000..f0724a8a9 --- /dev/null +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + lj_coul_long_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to lj/cut/coul/long acceleration routines. 
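 As a reading aid for the kernels in lal_lj_coul_long.cu above: they evaluate
 the real-space (erfc-damped) Ewald Coulomb term with a polynomial fit for
 erfc().  A minimal host-side sketch of the same arithmetic follows; the
 constants EWALD_F, EWALD_P and A1..A5 are defined elsewhere in the library
 headers and are assumed here to be the usual Abramowitz & Stegun 7.1.26
 values used by the CPU pair style.

   #include <cmath>
   // Returns forcecoul (force*r, as in the kernel) and the Coulomb energy for
   // one pair; the kernel later multiplies (force_lj+forcecoul) by r2inv.
   inline void coul_long_ref(double rsq, double qi, double qj,
                             double qqrd2e, double g_ewald,
                             double factor_coul,          // 1 - special factor
                             double &forcecoul, double &ecoul) {
     const double EWALD_F = 1.12837917, EWALD_P = 0.3275911;
     const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741,
                  A4 = -1.453152027, A5 = 1.061405429;
     double r     = sqrt(rsq);
     double grij  = g_ewald*r;
     double expm2 = exp(-grij*grij);
     double t     = 1.0/(1.0 + EWALD_P*grij);
     double erfc  = t*(A1+t*(A2+t*(A3+t*(A4+t*A5))))*expm2;
     double prefactor = qqrd2e*qi*qj/r;
     forcecoul = prefactor*(erfc + EWALD_F*grij*expm2 - factor_coul);
     ecoul     = prefactor*(erfc - factor_coul);
   }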
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_lj_coul_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoulLong<PRECISION,ACC_PRECISION> LJCLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + LJCLMF.clear(); + gpu_mode=LJCLMF.device->gpu_mode(); + double gpu_split=LJCLMF.device->particle_split(); + int first_gpu=LJCLMF.device->first_device(); + int last_gpu=LJCLMF.device->last_device(); + int world_me=LJCLMF.device->world_me(); + int gpu_rank=LJCLMF.device->gpu_rank(); + int procs_per_gpu=LJCLMF.device->procs_per_gpu(); + + LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu); + + bool message=false; + if (LJCLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + + LJCLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + + LJCLMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCLMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljcl_gpu_clear() { + LJCLMF.clear(); +} + +int** ljcl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void 
ljcl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double ljcl_gpu_bytes() { + return LJCLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp new file mode 100644 index 000000000..ed1bd9f51 --- /dev/null +++ b/lib/gpu/lal_lj_expand.cpp @@ -0,0 +1,154 @@ +/*************************************************************************** + lj_expand.cpp + ------------------- + Inderaj Bains (NVIDIA) + + Class for acceleration of the lj/expand pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ibains@nvidia.com + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_expand_cl.h" +#else +#include "lj_expand_ptx.h" +#endif + +#include "lal_lj_expand.h" +#include <cassert> +using namespace LAMMPS_AL; +#define LJExpandT LJExpand<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +LJExpandT::LJExpand() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { +} + +template <class numtyp, class acctyp> +LJExpandT::~LJExpand() { + clear(); +} + +template <class numtyp, class acctyp> +int LJExpandT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int LJExpandT::init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, double **host_shift, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_expand); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_shift); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + UCL_H_Vec<double> dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + 
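  // Layout note on the tables built above: type_pack4() flattens each set of
  // ntypes x ntypes host matrices into a single lj_types x lj_types array of
  // vec4 entries (padded up to max_shared_types when the fast kernel is used),
  // so the device code can index a pair (itype,jtype) as
  //   int mtype = itype*lj_types + jtype;          // kernel_pair
  //   int mtype = itype*MAX_SHARED_TYPES + jtype;  // kernel_pair_fast
  // with lj1[mtype] = {lj1, lj2, cutsq, shift} and lj3[mtype] = {lj3, lj4,
  // offset, -} (see lal_lj_expand.h).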
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void LJExpandT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double LJExpandT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJExpand<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void LJExpandT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJExpand<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu new file mode 100644 index 000000000..c4d59ab18 --- /dev/null +++ b/lib/gpu/lal_lj_expand.cu @@ -0,0 +1,195 @@ +// ************************************************************************** +// lj_expand.cu +// ------------------- +// Inderaj Bains (NVIDIA) +// +// Device code for acceleration of the lj/expand pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ibains@nvidia.com +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; 
i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int itype=ix.w; + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (r2inv<lj1[mtype].z) { + numtyp r = ucl_sqrt(r2inv); + numtyp rshift = r - lj1[mtype].w; + numtyp rshiftsq = rshift*rshift; + r2inv = ucl_recip(rshiftsq); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + force*=factor_lj/rshift/r; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(numtyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<lj1[mtype].z) { + numtyp r = ucl_sqrt(r2inv); + numtyp rshift = r - lj1[mtype].w; + numtyp rshiftsq = rshift*rshift; + r2inv = ucl_recip(rshiftsq); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + force*=factor_lj/rshift/r; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + 
virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h new file mode 100644 index 000000000..fd59045bf --- /dev/null +++ b/lib/gpu/lal_lj_expand.h @@ -0,0 +1,79 @@ +/*************************************************************************** + lj_expand.h + ------------------- + Inderaj Bains (NVIDIA) + + Class for acceleration of the lj/expand pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ibains@nvidia.com + ***************************************************************************/ + +#ifndef LAL_LJ_EXPAND_H +#define LAL_LJ_EXPAND_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJExpand : public BaseAtomic<numtyp, acctyp> { + public: + LJExpand(); + ~LJExpand(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double **host_shift, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = shift + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp new file mode 100644 index 000000000..54bb3f62f --- /dev/null +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -0,0 +1,121 @@ +/*************************************************************************** + lj_expand_ext.cpp + ------------------- + Inderaj Bains (NVIDIA) + + Functions for LAMMPS access to lj/expand acceleration routines. 
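 The kernels in lal_lj_expand.cu above apply the standard 12-6 form to a
 shifted separation r-delta, where delta is the per-type-pair shift stored in
 lj1.w.  A minimal host-side sketch of the per-pair arithmetic, assuming the
 usual coefficient convention handed in by the CPU pair style
 (lj1=48*eps*sigma^12, lj2=24*eps*sigma^6, lj3=4*eps*sigma^12,
 lj4=4*eps*sigma^6):

   #include <cmath>
   inline void lj_expand_ref(double rsq, double delta, double lj1, double lj2,
                             double lj3, double lj4, double offset,
                             double factor_lj, double &fpair, double &evdwl) {
     double r      = sqrt(rsq);              // cutoff test uses the unshifted r
     double rshift = r - delta;              // shifted separation
     double r2inv  = 1.0/(rshift*rshift);
     double r6inv  = r2inv*r2inv*r2inv;
     double fexp   = r6inv*(lj1*r6inv - lj2);
     fpair = factor_lj*fexp/rshift/r;        // same force/r scale as the kernel
     evdwl = factor_lj*(r6inv*(lj3*r6inv - lj4) - offset);
   }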
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ibains@nvidia.com + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_lj_expand.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJExpand<PRECISION,ACC_PRECISION> LJEMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + LJEMF.clear(); + gpu_mode=LJEMF.device->gpu_mode(); + double gpu_split=LJEMF.device->particle_split(); + int first_gpu=LJEMF.device->first_device(); + int last_gpu=LJEMF.device->last_device(); + int world_me=LJEMF.device->world_me(); + int gpu_rank=LJEMF.device->gpu_rank(); + int procs_per_gpu=LJEMF.device->procs_per_gpu(); + + LJEMF.device->init_message(screen,"lj/expand",first_gpu,last_gpu); + + bool message=false; + if (LJEMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, shift, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + LJEMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, shift, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split,screen); + + LJEMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJEMF.estimate_gpu_overhead(); + return init_ok; +} + +void lje_gpu_clear() { + LJEMF.clear(); +} + +int** lje_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJEMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void lje_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + 
LJEMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double lje_gpu_bytes() { + return LJEMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp new file mode 100644 index 000000000..1dc47ccbb --- /dev/null +++ b/lib/gpu/lal_lj_ext.cpp @@ -0,0 +1,120 @@ +/*************************************************************************** + lj_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to lj/cut acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_lj.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJ<PRECISION,ACC_PRECISION> LJLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + LJLMF.clear(); + gpu_mode=LJLMF.device->gpu_mode(); + double gpu_split=LJLMF.device->particle_split(); + int first_gpu=LJLMF.device->first_device(); + int last_gpu=LJLMF.device->last_device(); + int world_me=LJLMF.device->world_me(); + int gpu_rank=LJLMF.device->gpu_rank(); + int procs_per_gpu=LJLMF.device->procs_per_gpu(); + + LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu); + + bool message=false; + if (LJLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + LJLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + + LJLMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJLMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljl_gpu_clear() { + LJLMF.clear(); +} + +int ** ljl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + 
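  // Two entry points are exposed per pair style: *_gpu_compute_n is used when
  // neighboring is done on the device -- the caller passes the subdomain
  // bounds, tags and special-bond info and receives ilist/jnum/firstneigh
  // pointers back -- while *_gpu_compute below reuses a neighbor list already
  // built on the host (ilist/numj/firstneigh passed in).  A hypothetical call
  // from a GPU pair style (names illustrative only):
  //   firstneigh = ljl_gpu_compute_n(neighbor_ago, inum, nall, x, type,
  //                                  sublo, subhi, tag, nspecial, special,
  //                                  eflag, vflag, eflag_atom, vflag_atom,
  //                                  host_start, &ilist, &numneigh,
  //                                  cpu_time, success);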
return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void ljl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double ljl_gpu_bytes() { + return LJLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp new file mode 100644 index 000000000..966651775 --- /dev/null +++ b/lib/gpu/lal_morse.cpp @@ -0,0 +1,154 @@ +/*************************************************************************** + morse.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the morse pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "morse_cl.h" +#else +#include "morse_ptx.h" +#endif + +#include "lal_morse.h" +#include <cassert> +using namespace LAMMPS_AL; +#define MorseT Morse<numtyp, acctyp> + +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +MorseT::Morse() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { +} + +template <class numtyp, class acctyp> +MorseT::~Morse() { + clear(); +} + +template <class numtyp, class acctyp> +int MorseT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template <class numtyp, class acctyp> +int MorseT::init(const int ntypes, + double **host_cutsq, double **host_morse1, + double **host_r0, double **host_alpha, + double **host_d0, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,morse); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (types<=max_shared_types && this->_block_size>=max_shared_types) { + types=max_shared_types; + shared_types=true; + } + _types=types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<types*types; i++) + host_write[i]=0.0; + + mor1.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,types,mor1,host_write,host_cutsq,host_morse1, + host_r0,host_alpha); + + mor2.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,types,mor2,host_write,host_d0,host_offset); + + UCL_H_Vec<double> dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + 
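  // For reference while reading lal_morse.cu: with the packing above,
  //   mor1[m] = {cutsq, morse1, r0, alpha}   and   mor2[m] = {d0, offset}
  // (see lal_morse.h), and the kernels evaluate, per pair within the cutoff,
  //   dexp = exp(-alpha*(r - r0))
  //   F/r  = morse1*(dexp*dexp - dexp)/r        // morse1 is expected to be
  //   E    = d0*(dexp*dexp - 2*dexp) - offset   // 2*d0*alpha (host-supplied)
  // both scaled by the special-bond factor factor_lj, mirroring the CPU
  // pair style.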
this->_max_bytes=mor1.row_bytes()+mor2.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void MorseT::clear() { + if (!_allocated) + return; + _allocated=false; + + mor1.clear(); + mor2.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double MorseT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Morse<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void MorseT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(), + &mor2.begin(), &sp_lj.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(), + &_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class Morse<PRECISION,ACC_PRECISION>; + diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu new file mode 100644 index 000000000..bd9ae49c8 --- /dev/null +++ b/lib/gpu/lal_morse.cu @@ -0,0 +1,191 @@ +// ************************************************************************** +// morse.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for acceleration of the morse pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +texture<float4> pos_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +#endif +#endif + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, + __global numtyp2* mor2, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int itype=ix.w; + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r = delx*delx+dely*dely+delz*delz; + + int mtype=itype*lj_types+jtype; + if (r<mor1[mtype].x) { + r=ucl_sqrt(r); + numtyp dexp=r-mor1[mtype].z; + dexp=ucl_exp(-mor1[mtype].w*dexp); + numtyp dm=dexp*dexp-dexp; + numtyp force = mor1[mtype].y*dm/r*factor_lj; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y; + energy+=e*factor_lj; + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in, + __global numtyp2* mor2_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + mor1[tid]=mor1_in[tid]; + if (eflag>0) + mor2[tid]=mor2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + 
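    // ii is the local atom assigned to this group of t_per_atom threads (see
    // atom_info above); nbor_info below hands each thread of the group a
    // strided view of that atom's neighbor list (offset, offset+n_stride, ...),
    // so the threads accumulate partial f/energy/virial sums that
    // store_answers() reduces at the end.  This matches the launch size
    // GX = ceil(inum/(BX/t_per_atom)) computed in MorseT::loop().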
__global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r = delx*delx+dely*dely+delz*delz; + + if (r<mor1[mtype].x) { + r=ucl_sqrt(r); + numtyp dexp=r-mor1[mtype].z; + dexp=ucl_exp(-mor1[mtype].w*dexp); + numtyp dm=dexp*dexp-dexp; + numtyp force = mor1[mtype].y*dm/r*factor_lj; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y; + energy+=e*factor_lj; + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h new file mode 100644 index 000000000..e64852f31 --- /dev/null +++ b/lib/gpu/lal_morse.h @@ -0,0 +1,79 @@ +/*************************************************************************** + morse.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the morse pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_MORSE_H +#define LAL_MORSE_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class Morse : public BaseAtomic<numtyp, acctyp> { + public: + Morse(); + ~Morse(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_morse1, double **host_r0, double **host_alpha, + double **host_d0, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// mor1.x = cutsq, mor1.y = morse1, mor1.z = r0, mor1.w = alpha + UCL_D_Vec<numtyp4> mor1; + /// mor2.x = d0, mor2.y = offset + UCL_D_Vec<numtyp2> mor2; + /// Special LJ values + 
UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp new file mode 100644 index 000000000..787f49b39 --- /dev/null +++ b/lib/gpu/lal_morse_ext.cpp @@ -0,0 +1,121 @@ +/*************************************************************************** + morse.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to morse acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_morse.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Morse<PRECISION,ACC_PRECISION> MORMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int mor_gpu_init(const int ntypes, double **cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + MORMF.clear(); + gpu_mode=MORMF.device->gpu_mode(); + double gpu_split=MORMF.device->particle_split(); + int first_gpu=MORMF.device->first_device(); + int last_gpu=MORMF.device->last_device(); + int world_me=MORMF.device->world_me(); + int gpu_rank=MORMF.device->gpu_rank(); + int procs_per_gpu=MORMF.device->procs_per_gpu(); + + MORMF.device->init_message(screen,"morse",first_gpu,last_gpu); + + bool message=false; + if (MORMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + MORMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + + MORMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + MORMF.estimate_gpu_overhead(); + return init_ok; +} + +void mor_gpu_clear() { + MORMF.clear(); +} + +int** mor_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + 
int **ilist, int **jnum, const double cpu_time, + bool &success) { + return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void mor_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + MORMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double mor_gpu_bytes() { + return MORMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp new file mode 100644 index 000000000..5f0b1b5c9 --- /dev/null +++ b/lib/gpu/lal_neighbor.cpp @@ -0,0 +1,502 @@ +/*************************************************************************** + neighbor.cpp + ------------------- + W. Michael Brown (ORNL) + Peng Wang (Nvidia) + + Class for handling neighbor lists + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov, penwang@nvidia.com + ***************************************************************************/ + +#include "lal_precision.h" +#include "lal_neighbor.h" +#include "lal_device.h" +#include "math.h" +using namespace LAMMPS_AL; + +int Neighbor::bytes_per_atom(const int max_nbors) const { + if (_gpu_nbor==1) + return (max_nbors+2)*sizeof(int); + else if (_gpu_nbor==2) + return (max_nbors+3)*sizeof(int); + else if (_use_packing) + return ((max_nbors+2)*2)*sizeof(int); + else + return (max_nbors+3)*sizeof(int); +} + +bool Neighbor::init(NeighborShared *shared, const int inum, + const int host_inum, const int max_nbors, + const int maxspecial, UCL_Device &devi, + const int gpu_nbor, const int gpu_host, + const bool pre_cut, const int block_cell_2d, + const int block_cell_id, const int block_nbor_build, + const int threads_per_atom, const bool time_device) { + clear(); + + _threads_per_atom=threads_per_atom; + _block_cell_2d=block_cell_2d; + _block_cell_id=block_cell_id; + _block_nbor_build=block_nbor_build; + _shared=shared; + dev=&devi; + _gpu_nbor=gpu_nbor; + _time_device=time_device; + if (gpu_host==0) + _gpu_host=false; + else if (gpu_host==1) + _gpu_host=true; + else + // Not yet implemented + assert(0==1); + + if (pre_cut || gpu_nbor==0) + _alloc_packed=true; + else + _alloc_packed=false; + + bool success=true; + + // Initialize timers for the selected GPU + _nbor_time_avail=false; + time_nbor.init(*dev); + time_kernel.init(*dev); + time_hybrid1.init(*dev); + time_hybrid2.init(*dev); + time_nbor.zero(); + time_kernel.zero(); + time_hybrid1.zero(); + time_hybrid2.zero(); + + _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10); + if (_max_atoms==0) + _max_atoms=1000; + + _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10); + _max_nbors=max_nbors; + + _maxspecial=maxspecial; + if (gpu_nbor==0) + _maxspecial=0; + + if (gpu_nbor==0) + success=success && (host_packed.alloc(2*IJ_SIZE,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + alloc(success); + if (!success) + return false; + + if (_use_packing==false) + _shared->compile_kernels(devi,gpu_nbor); + + return success; +} + +void Neighbor::alloc(bool &success) { + 
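  // Sizing notes (cf. bytes_per_atom above): when the list is built on the
  // device or packing is enabled, dev_nbor stores a short per-atom header
  // (the atom index and its neighbor count) followed by up to _max_nbors
  // neighbor indices; the host-list path keeps only a 3-int header per atom
  // in dev_nbor and packs the neighbor indices into dev_packed.  host_acc
  // holds two ints per atom (count and prefix offset) and is reused for the
  // copies in get_host() and build_nbor_list().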
dev_nbor.clear(); + host_acc.clear(); + int nt=_max_atoms+_max_host; + if (_use_packing==false || _gpu_nbor>0) + success=success && + (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS); + else + success=success && (dev_nbor.alloc(3*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (host_acc.alloc(nt*2,*dev, + UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + + _c_bytes=dev_nbor.row_bytes(); + if (_alloc_packed) { + dev_packed.clear(); + success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _c_bytes+=dev_packed.row_bytes(); + } + if (_max_host>0) { + host_nbor.clear(); + dev_host_nbor.clear(); + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); + + success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev, + UCL_RW_OPTIMIZED)==UCL_SUCCESS); + success=success && (dev_host_nbor.alloc(_max_nbors*_max_host, + *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (dev_host_numj.alloc(_max_host,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); + if (!success) + return; + for (int i=0; i<nt; i++) + host_ilist[i]=i; + success=success && (host_jlist.alloc(_max_host,*dev, + UCL_NOT_PINNED)==UCL_SUCCESS); + if (!success) + return; + int *ptr=host_nbor.begin(); + for (int i=0; i<_max_host; i++) { + host_jlist[i]=ptr; + ptr+=_max_nbors; + } + _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes(); + } + if (_maxspecial>0) { + dev_nspecial.clear(); + dev_special.clear(); + dev_special_t.clear(); + int at=_max_atoms+_max_host; + success=success && (dev_nspecial.alloc(3*at,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (dev_special.alloc(_maxspecial*at,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (dev_special_t.alloc(_maxspecial*at,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+ + dev_special_t.row_bytes(); + } + + _allocated=true; +} + +void Neighbor::clear() { + _gpu_bytes=0.0; + _cell_bytes=0.0; + _c_bytes=0.0; + _bin_time=0.0; + if (_ncells>0) { + _ncells=0; + dev_cell_counts.clear(); + if (_gpu_nbor==2) { + host_cell_counts.clear(); + delete [] cell_iter; + } + } + if (_allocated) { + _allocated=false; + _nbor_time_avail=false; + + host_packed.clear(); + host_acc.clear(); + dev_nbor.clear(); + dev_host_nbor.clear(); + dev_packed.clear(); + host_nbor.clear(); + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); + dev_nspecial.clear(); + dev_special.clear(); + dev_special_t.clear(); + + time_kernel.clear(); + time_nbor.clear(); + time_hybrid1.clear(); + time_hybrid2.clear(); + } +} + +double Neighbor::host_memory_usage() const { + if (_gpu_nbor>0) { + if (_gpu_host) + return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+ + host_jlist.row_bytes(); + else + return 0; + } else + return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+ + sizeof(Neighbor); +} + +void Neighbor::get_host(const int inum, int *ilist, int *numj, + int **firstneigh, const int block_size) { + _nbor_time_avail=true; + time_nbor.start(); + + UCL_H_Vec<int> ilist_view; + ilist_view.view(ilist,inum,*dev); + ucl_copy(dev_nbor,ilist_view,false); + + UCL_D_Vec<int> nbor_offset; + UCL_H_Vec<int> host_offset; + + int copy_count=0; + int ij_count=0; + int acc_count=0; + int dev_count=0; + int *h_ptr=host_packed.begin(); + _nbor_pitch=inum; + + for (int ii=0; ii<inum; ii++) { + int i=ilist[ii]; + int nj=numj[i]; + host_acc[ii]=nj; + 
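    // host_acc is split in two halves: [0..inum) holds each atom's neighbor
    // count and [inum..2*inum) the running prefix offset into the packed
    // list.  The neighbor indices themselves are staged through host_packed
    // in IJ_SIZE chunks below, double-buffered via (copy_count%2) so the next
    // chunk can be filled while the previous asynchronous copy completes.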
host_acc[ii+inum]=acc_count; + + acc_count+=nj; + + int *jlist=firstneigh[i]; + for (int jj=0; jj<nj; jj++) { + *h_ptr=jlist[jj]; + h_ptr++; + ij_count++; + + if (ij_count==IJ_SIZE) { + dev_nbor.sync(); + host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE); + nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE); + ucl_copy(nbor_offset,host_offset,true); + copy_count++; + ij_count=0; + dev_count+=IJ_SIZE; + h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2)); + } + } + } + if (ij_count!=0) { + dev_nbor.sync(); + host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count); + nbor_offset.view_offset(dev_count,dev_packed,ij_count); + ucl_copy(nbor_offset,host_offset,true); + } + UCL_D_Vec<int> acc_view; + acc_view.view_offset(inum,dev_nbor,inum*2); + ucl_copy(acc_view,host_acc,true); + time_nbor.stop(); + + if (_use_packing==false) { + time_kernel.start(); + int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/ + block_size)); + _shared->k_nbor.set_size(GX,block_size); + _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum, + &_threads_per_atom); + time_kernel.stop(); + } +} + +template <class numtyp, class acctyp> +void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, + const int nall, Atom<numtyp,acctyp> &atom, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, bool &success, + int &mn) { + _nbor_time_avail=true; + const int nt=inum+host_inum; + + // Calculate number of cells and allocate storage for binning as necessary + int ncellx, ncelly, ncellz, ncell_3d; + ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) + + 2.0*_cell_size)/_cell_size)); + ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) + + 2.0*_cell_size)/_cell_size)); + ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) + + 2.0*_cell_size)/_cell_size)); + ncell_3d = ncellx * ncelly * ncellz; + if (ncell_3d+1>_ncells) { + dev_cell_counts.clear(); + dev_cell_counts.alloc(ncell_3d+1,dev_nbor); + if (_gpu_nbor==2) { + if (_ncells>0) { + host_cell_counts.clear(); + delete [] cell_iter; + } + cell_iter = new int[ncell_3d+1]; + host_cell_counts.alloc(ncell_3d+1,dev_nbor); + } + _ncells=ncell_3d+1; + _cell_bytes=dev_cell_counts.row_bytes(); + } + + const numtyp cell_size_cast=static_cast<numtyp>(_cell_size); + + // If binning on CPU, do this now + if (_gpu_nbor==2) { + double stime = MPI_Wtime(); + int *cell_id=atom.host_cell_id.begin(); + int *particle_id=atom.host_particle_id.begin(); + + // Build cell list on CPU + host_cell_counts.zero(); + double m_cell_size=-_cell_size; + double dx=subhi[0]-sublo[0]+_cell_size; + double dy=subhi[1]-sublo[1]+_cell_size; + double dz=subhi[2]-sublo[2]+_cell_size; + + for (int i=0; i<nall; i++) { + double px, py, pz; + px=x[i][0]-sublo[0]; + py=x[i][1]-sublo[1]; + pz=x[i][2]-sublo[2]; + if (px<m_cell_size) px=m_cell_size; + if (py<m_cell_size) py=m_cell_size; + if (pz<m_cell_size) pz=m_cell_size; + if (px>dx) px=dx; + if (py>dy) py=dy; + if (pz>dz) pz=dz; + + int id=static_cast<int>(px/_cell_size + 1.0) + + static_cast<int>(py/_cell_size + 1.0) * ncellx + + static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly; + + cell_id[i]=id; + host_cell_counts[id+1]++; + } + cell_iter[0]=0; + for (int i=1; i<_ncells; i++) { + host_cell_counts[i]+=host_cell_counts[i-1]; + cell_iter[i]=host_cell_counts[i]; + } + time_hybrid1.start(); + ucl_copy(dev_cell_counts,host_cell_counts,true); + time_hybrid1.stop(); + for (int i=0; i<nall; i++) { + int celli=cell_id[i]; + int ploc=cell_iter[celli]; + 
cell_iter[celli]++; + particle_id[ploc]=i; + } + time_hybrid2.start(); + ucl_copy(atom.dev_particle_id,atom.host_particle_id,true); + time_hybrid2.stop(); + _bin_time+=MPI_Wtime()-stime; + } + + if (_maxspecial>0) { + time_nbor.start(); + UCL_H_Vec<int> view_nspecial, view_special, view_tag; + view_nspecial.view(nspecial[0],nt*3,*dev); + view_special.view(special[0],nt*_maxspecial,*dev); + view_tag.view(tag,nall,*dev); + ucl_copy(dev_nspecial,view_nspecial,nt*3,false); + ucl_copy(dev_special_t,view_special,nt*_maxspecial,false); + ucl_copy(atom.dev_tag,view_tag,nall,false); + time_nbor.stop(); + if (_time_device) + time_nbor.add_to_total(); + time_kernel.start(); + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; + const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x)); + const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y)); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), + &_maxspecial,&nt); + } else + time_kernel.start(); + + _nbor_pitch=inum; + _shared->neigh_tex.bind_float(atom.dev_x,4); + + // If binning on GPU, do this now + if (_gpu_nbor==1) { + const int neigh_block=_block_cell_id; + const int GX=(int)ceil((float)nall/neigh_block); + const numtyp sublo0=static_cast<numtyp>(sublo[0]); + const numtyp sublo1=static_cast<numtyp>(sublo[1]); + const numtyp sublo2=static_cast<numtyp>(sublo[2]); + const numtyp subhi0=static_cast<numtyp>(subhi[0]); + const numtyp subhi1=static_cast<numtyp>(subhi[1]); + const numtyp subhi2=static_cast<numtyp>(subhi[2]); + _shared->k_cell_id.set_size(GX,neigh_block); + _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), + &atom.dev_particle_id.begin(), + &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, + &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); + + atom.sort_neighbor(nall); + + /* calculate cell count */ + _shared->k_cell_counts.set_size(GX,neigh_block); + _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), + &dev_cell_counts.begin(), &nall, &ncell_3d); + } + + /* build the neighbor list */ + const int cell_block=_block_nbor_build; + _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); + _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), + &dev_cell_counts.begin(), &dev_nbor.begin(), + &dev_host_nbor.begin(), &dev_host_numj.begin(), + &_max_nbors,&cell_size_cast, + &ncellx, &ncelly, &ncellz, &inum, &nt, &nall, + &_threads_per_atom); + + /* Get the maximum number of nbors and realloc if necessary */ + UCL_D_Vec<int> numj; + numj.view_offset(inum,dev_nbor,inum); + ucl_copy(host_acc,numj,inum,false); + if (nt>inum) { + UCL_H_Vec<int> host_offset; + host_offset.view_offset(inum,host_acc,nt-inum); + ucl_copy(host_offset,dev_host_numj,nt-inum,false); + } + mn=host_acc[0]; + for (int i=1; i<nt; i++) + mn=std::max(mn,host_acc[i]); + + if (mn>_max_nbors) { + mn=static_cast<int>(static_cast<double>(mn)*1.10); + dev_nbor.clear(); + success=success && + (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS); + _gpu_bytes=dev_nbor.row_bytes(); + if (_max_host>0) { + host_nbor.clear(); + dev_host_nbor.clear(); + success=success && (host_nbor.alloc(mn*_max_host,dev_nbor, + UCL_RW_OPTIMIZED)==UCL_SUCCESS); + success=success && (dev_host_nbor.alloc(mn*_max_host, + dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS); + int *ptr=host_nbor.begin(); + for (int i=0; i<_max_host; i++) { + host_jlist[i]=ptr; + ptr+=mn; + } + _gpu_bytes+=dev_host_nbor.row_bytes(); + } + if (_alloc_packed) { + 
dev_packed.clear(); + success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_packed.row_bytes(); + } + if (!success) + return; + _max_nbors=mn; + time_kernel.stop(); + if (_time_device) + time_kernel.add_to_total(); + build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, + special, success, mn); + return; + } + + if (_maxspecial>0) { + const int GX2=static_cast<int>(ceil(static_cast<double> + (nt*_threads_per_atom)/cell_block)); + _shared->k_special.set_size(GX2,cell_block); + _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), + &dev_host_numj.begin(), &atom.dev_tag.begin(), + &dev_nspecial.begin(), &dev_special.begin(), + &inum, &nt, &_max_nbors, &_threads_per_atom); + } + time_kernel.stop(); + + time_nbor.start(); + if (_gpu_host) + ucl_copy(host_nbor,dev_host_nbor,false); + time_nbor.stop(); +} + +template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION> + (double **x, const int inum, const int host_inum, const int nall, + Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi, + int *, int **, int **, bool &success, int &mn); + diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h new file mode 100644 index 000000000..6ecbe97aa --- /dev/null +++ b/lib/gpu/lal_neighbor.h @@ -0,0 +1,235 @@ +/*************************************************************************** + neighbor.h + ------------------- + W. Michael Brown (ORNL) + Peng Wang (Nvidia) + + Class for handling neighbor lists + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov, penwang@nvidia.com + ***************************************************************************/ + +#ifndef LAL_NEIGHBOR_H +#define LAL_NEIGHBOR_H + +#include "lal_atom.h" +#include "lal_neighbor_shared.h" + +#define IJ_SIZE 131072 + +#ifdef USE_OPENCL + +#include "geryon/ocl_timer.h" +#include "geryon/ocl_mat.h" +using namespace ucl_opencl; + +#else + +#include "geryon/nvd_timer.h" +#include "geryon/nvd_mat.h" +using namespace ucl_cudadr; + +#endif + +namespace LAMMPS_AL { + +class Neighbor { + public: + Neighbor() : _allocated(false), _use_packing(false), _ncells(0) {} + ~Neighbor() { clear(); } + + /// Determine whether neighbor unpacking should be used + /** If false, twice as much memory is reserved to allow unpacking neighbors by + * atom for coalesced access. 
**/ + void packing(const bool use_packing) { _use_packing=use_packing; } + + /// Clear any old data and setup for new LAMMPS run + /** \param inum Initial number of particles whose neighbors stored on device + * \param host_inum Initial number of particles whose nbors copied to host + * \param max_nbors Initial number of rows in the neighbor matrix + * \param gpu_nbor 0 if neighboring will be performed on host + * gpu_nbor 1 if neighboring will be performed on device + * gpu_nbor 2 if binning on host and neighboring on device + * \param gpu_host 0 if host will not perform force calculations, + * 1 if gpu_nbor is true, and host needs a half nbor list, + * 2 if gpu_nbor is true, and host needs a full nbor list + * \param pre_cut True if cutoff test will be performed in separate kernel + * than the force kernel + * \param threads_per_atom Number of threads used per atom for force + * calculation **/ + bool init(NeighborShared *shared, const int inum, const int host_inum, + const int max_nbors, const int maxspecial, UCL_Device &dev, + const int gpu_nbor, const int gpu_host, const bool pre_cut, + const int block_cell_2d, const int block_cell_id, + const int block_nbor_build, const int threads_per_atom, + const bool time_device); + + /// Set the size of the cutoff+skin + inline void cell_size(const double size) { _cell_size=size; } + + /// Get the size of the cutoff+skin + inline double cell_size() const { return _cell_size; } + + /// Check if there is enough memory for neighbor data and realloc if not + /** \param inum Number of particles whose nbors will be stored on device + * \param max_nbor Current max number of neighbors for a particle + * \param success False if insufficient memory **/ + inline void resize(const int inum, const int max_nbor, bool &success) { + if (inum>_max_atoms || max_nbor>_max_nbors) { + _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10); + if (max_nbor>_max_nbors) + _max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10); + alloc(success); + } + } + + /// Check if there is enough memory for neighbor data and realloc if not + /** \param inum Number of particles whose nbors will be stored on device + * \param host_inum Number of particles whose nbors will be copied to host + * \param max_nbor Current max number of neighbors for a particle + * \param success False if insufficient memory **/ + inline void resize(const int inum, const int host_inum, const int max_nbor, + bool &success) { + if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) { + _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10); + _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10); + if (max_nbor>_max_nbors) + _max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10); + alloc(success); + } + } + + inline void acc_timers() { + if (_nbor_time_avail) { + time_nbor.add_to_total(); + time_kernel.add_to_total(); + if (_gpu_nbor==2) { + time_hybrid1.add_to_total(); + time_hybrid2.add_to_total(); + } + _nbor_time_avail=false; + } + } + + /// Free all memory on host and device + void clear(); + + /// Bytes per atom used on device + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by class + double host_memory_usage() const; + + /// Returns the type of neighboring: + /** - 0 if neighboring will be performed on host + * - 1 if neighboring will be performed on device + * - 2 if binning on host and neighboring on device **/ + inline int gpu_nbor() const { return _gpu_nbor; } + + /// Make a copy of unpacked nbor lists in the packed 
storage area (for gb) + inline void copy_unpacked(const int inum, const int maxj) + { ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); } + + /// Copy neighbor list from host (first time or from a rebuild) + void get_host(const int inum, int *ilist, int *numj, + int **firstneigh, const int block_size); + + /// Return the stride in elements for each nbor row + inline int nbor_pitch() const { return _nbor_pitch; } + + /// Return the maximum number of atoms that can currently be stored + inline int max_atoms() const { return _max_atoms; } + + /// Return the maximum number of nbors for a particle based on current alloc + inline int max_nbors() const { return _max_nbors; } + + /// Return the time spent binning on the CPU for hybrid neighbor builds + inline double bin_time() const { return _bin_time; } + + /// Loop through neighbor count array and return maximum nbors for a particle + inline int max_nbor_loop(const int inum, int *numj, int *ilist) const { + int mn=0; + for (int i=0; i<inum; i++) + mn=std::max(mn,numj[ilist[i]]); + return mn; + } + + /// Build nbor list on the device + template <class numtyp, class acctyp> + void build_nbor_list(double **x, const int inum, const int host_inum, + const int nall, Atom<numtyp,acctyp> &atom, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, + bool &success, int &max_nbors); + + /// Return the number of bytes used on device + inline double gpu_bytes() { + double res = _gpu_bytes + _c_bytes + _cell_bytes; + if (_gpu_nbor==0) + res += 2*IJ_SIZE*sizeof(int); + + return res; + } + + // ------------------------------- Data ------------------------------- + + /// Device neighbor matrix + /** - 1st row is i (index into atom data) + * - 2nd row is numj (number of neighbors) + * - 3rd row is starting location in packed nbors + * - Remaining rows are the neighbors arranged for coalesced access **/ + UCL_D_Vec<int> dev_nbor; + /// Packed storage for neighbor lists copied from host + UCL_D_Vec<int> dev_packed; + /// Host buffer for copying neighbor lists + UCL_H_Vec<int> host_packed; + /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2) + UCL_H_Vec<int> host_acc; + + // ----------------- Data for GPU Neighbor Calculation --------------- + + /// Host storage for device calculated neighbor lists + /** Same storage format as device matrix **/ + UCL_H_Vec<int> host_nbor; + /// Device storage for neighbor list matrix that will be copied to host + /** - 1st row is numj + * - Remaining rows are by atom, columns are nbors **/ + UCL_D_Vec<int> dev_host_nbor; + UCL_D_Vec<int> dev_host_numj; + UCL_H_Vec<int> host_ilist; + UCL_H_Vec<int*> host_jlist; + /// Device storage for special neighbor counts + UCL_D_Vec<int> dev_nspecial; + /// Device storage for special neighbors + UCL_D_Vec<int> dev_special, dev_special_t; + /// Host storage for number of particles per cell + UCL_H_Vec<int> host_cell_counts; + int *cell_iter; + /// Device storage for number of particles per cell + UCL_D_Vec<int> dev_cell_counts; + + /// Device timers + UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2; + + private: + NeighborShared *_shared; + UCL_Device *dev; + bool _allocated, _use_packing, _nbor_time_avail, _time_device; + int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial; + bool _gpu_host, _alloc_packed; + double _cell_size, _bin_time; + + double _gpu_bytes, _c_bytes, _cell_bytes; + void alloc(bool &success); + + int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells; + int _threads_per_atom; +}; + +} + 
+#endif diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu new file mode 100644 index 000000000..4fbf29f10 --- /dev/null +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -0,0 +1,42 @@ +// ************************************************************************** +// atom.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for handling CPU generated neighbor lists +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +#endif + +__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij, + const int inum, const int t_per_atom) { + int tid=THREAD_ID_X; + int offset=tid & (t_per_atom-1); + int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; + + if (ii<inum) { + __global int *nbor=dev_nbor+ii+inum; + int numj=*nbor; + nbor+=inum; + __global int *list=dev_ij+*nbor; + __global int *list_end=list+numj; + list+=offset; + nbor+=fast_mul(ii,t_per_atom-1)+offset; + int stride=fast_mul(t_per_atom,inum); + + for ( ; list<list_end; list++) { + *nbor=*list; + nbor+=stride; + } + } // if ii +} + diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu new file mode 100644 index 000000000..29007abe8 --- /dev/null +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -0,0 +1,277 @@ +// ************************************************************************** +// neighbor_gpu.cu +// ------------------- +// Peng Wang (Nvidia) +// W. Michael Brown (ORNL) +// +// Device code for handling GPU generated neighbor lists +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : penwang@nvidia.com, brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +texture<float4> neigh_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(neigh_tex, i); } +#endif + +__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id, + numtyp boxlo0, + numtyp boxlo1, numtyp boxlo2, numtyp boxhi0, + numtyp boxhi1, numtyp boxhi2, numtyp cell_size, + int ncellx, int ncelly, int nall) { + int i = threadIdx.x + blockIdx.x*blockDim.x; + + if (i < nall) { + numtyp4 p = fetch_pos(i,pos); //pos[i]; + + p.x -= boxlo0; + p.y -= boxlo1; + p.z -= boxlo2; + + p.x = fmaxf(p.x, -cell_size); + p.x = fminf(p.x, boxhi0-boxlo0+cell_size); + p.y = fmaxf(p.y, -cell_size); + p.y = fminf(p.y, boxhi1-boxlo1+cell_size); + p.z = fmaxf(p.z, -cell_size); + p.z = fminf(p.z, boxhi2-boxlo2+cell_size); + + unsigned int id = (unsigned int)(p.x/cell_size + 1.0) + + (unsigned int)(p.y/cell_size + 1.0) * ncellx + + (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly; + + cell_id[i] = id; + particle_id[i] = i; + } +} + +__kernel void kernel_calc_cell_counts(unsigned *cell_id, + int *cell_counts, int nall, int ncell) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < nall) { + int id = cell_id[idx]; + + // handle boundary cases + if (idx == 0) { + for (int i = 0; i < id + 1; i++) + 
cell_counts[i] = 0; + } + if (idx == nall - 1) { + for (int i = id+1; i <= ncell; i++) + cell_counts[i] = nall; + } + + if (idx > 0 && idx < nall) { + int id_l = cell_id[idx-1]; + if (id != id_l) { + for (int i = id_l+1; i <= id; i++) + cell_counts[i] = idx; + } + } + } +} + +#endif + + + +__kernel void transpose(__global int *out, __global int *in, int columns_in, + int rows_in) +{ + __local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; + + unsigned ti=THREAD_ID_X; + unsigned tj=THREAD_ID_Y; + unsigned bi=BLOCK_ID_X; + unsigned bj=BLOCK_ID_Y; + + unsigned i=bi*BLOCK_CELL_2D+ti; + unsigned j=bj*BLOCK_CELL_2D+tj; + if ((i<columns_in) && (j<rows_in)) + block[tj][ti]=in[j*columns_in+i]; + + __syncthreads(); + + i=bj*BLOCK_CELL_2D+ti; + j=bi*BLOCK_CELL_2D+tj; + if ((i<rows_in) && (j<columns_in)) + out[j*rows_in+i] = block[ti][tj]; +} + +__kernel void calc_neigh_list_cell(__global numtyp4 *x_, + __global int *cell_particle_id, + __global int *cell_counts, + __global int *nbor_list, + __global int *host_nbor_list, + __global int *host_numj, + int neigh_bin_size, numtyp cell_size, + int ncellx, int ncelly, int ncellz, + int inum, int nt, int nall, int t_per_atom) +{ + int tid = THREAD_ID_X; + int ix = BLOCK_ID_X; + int iy = BLOCK_ID_Y % ncelly; + int iz = BLOCK_ID_Y / ncelly; + + int icell = ix + iy*ncellx + iz*ncellx*ncelly; + + __local int cell_list_sh[BLOCK_NBOR_BUILD]; + __local numtyp4 pos_sh[BLOCK_NBOR_BUILD]; + + int icell_begin = cell_counts[icell]; + int icell_end = cell_counts[icell+1]; + + int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1), + nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1), + nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1); + + numtyp4 diff; + numtyp r2; + int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/BLOCK_SIZE_X); + for (int ii = 0; ii < cap; ii++) { + int i = icell_begin + tid + ii*BLOCK_SIZE_X; + int pid_i = nall, pid_j, stride; + numtyp4 atom_i, atom_j; + int cnt = 0; + __global int *neigh_counts, *neigh_list; + + if (i < icell_end) + pid_i = cell_particle_id[i]; + + if (pid_i < nt) { + atom_i = fetch_pos(pid_i,x_); //pos[pid_i]; + } + if (pid_i < inum) { + stride=inum; + neigh_counts=nbor_list+stride+pid_i; + neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1); + stride=stride*t_per_atom-t_per_atom; + nbor_list[pid_i]=pid_i; + } else { + stride=0; + neigh_counts=host_numj+pid_i-inum; + neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size; + } + + // loop through neighbors + + for (int nborz = nborz0; nborz <= nborz1; nborz++) { + for (int nbory = nbory0; nbory <= nbory1; nbory++) { + for (int nborx = nborx0; nborx <= nborx1; nborx++) { + + int jcell = nborx + nbory*ncellx + nborz*ncellx*ncelly; + + int jcell_begin = cell_counts[jcell]; + int jcell_end = cell_counts[jcell+1]; + int num_atom_cell = jcell_end - jcell_begin; + + // load jcell to shared memory + int num_iter = ucl_ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD); + + for (int k = 0; k < num_iter; k++) { + int end_idx = min(BLOCK_NBOR_BUILD, + num_atom_cell-k*BLOCK_NBOR_BUILD); + + if (tid < end_idx) { + pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin]; + cell_list_sh[tid] = pid_j; + atom_j = fetch_pos(pid_j,x_); //[pid_j]; + pos_sh[tid].x = atom_j.x; + pos_sh[tid].y = atom_j.y; + pos_sh[tid].z = atom_j.z; + } + __syncthreads(); + + if (pid_i < nt) { + + for (int j = 0; j < end_idx; j++) { + int pid_j = cell_list_sh[j]; // gather from shared memory + diff.x = atom_i.x - pos_sh[j].x; + diff.y = atom_i.y - pos_sh[j].y; + diff.z = atom_i.z - pos_sh[j].z; + + r2 = diff.x*diff.x + 
diff.y*diff.y + diff.z*diff.z; + if (r2 < cell_size*cell_size && r2 > 1e-5) { + cnt++; + if (cnt < neigh_bin_size) { + *neigh_list = pid_j; + neigh_list++; + if ((cnt & (t_per_atom-1))==0) + neigh_list=neigh_list+stride; + } + } + } + } + __syncthreads(); + } // for (k) + } + } + } + if (pid_i < nt) + *neigh_counts = cnt; + } // for (i) +} + +__kernel void kernel_special(__global int *dev_nbor, + __global int *host_nbor_list, + __global int *host_numj, __global int *tag, + __global int *nspecial, __global int *special, + int inum, int nt, int max_nbors, int t_per_atom) { + int tid=THREAD_ID_X; + int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid & (t_per_atom-1); + + if (ii<nt) { + int stride; + __global int *list, *list_end; + + int n1=nspecial[ii*3]; + int n2=nspecial[ii*3+1]; + int n3=nspecial[ii*3+2]; + + int numj; + if (ii < inum) { + stride=inum; + list=dev_nbor+stride+ii; + numj=*list; + list+=stride+fast_mul(ii,t_per_atom-1); + stride=fast_mul(inum,t_per_atom); + int njt=numj/t_per_atom; + list_end=list+fast_mul(njt,stride)+(numj & (t_per_atom-1)); + list+=offset; + } else { + stride=1; + list=host_nbor_list+(ii-inum)*max_nbors; + numj=host_numj[ii-inum]; + list_end=list+fast_mul(numj,stride); + } + + for ( ; list<list_end; list+=stride) { + int nbor=*list; + int jtag=tag[nbor]; + + int offset=ii; + for (int i=0; i<n3; i++) { + if (special[offset]==jtag) { + int which = 1; + if (i>=n1) + which++; + if (i>=n2) + which++; + nbor=nbor ^ (which << SBBITS); + *list=nbor; + } + offset+=nt; + } + } + } // if ii +} + diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp new file mode 100644 index 000000000..1547eac4f --- /dev/null +++ b/lib/gpu/lal_neighbor_shared.cpp @@ -0,0 +1,74 @@ +/*************************************************************************** + neighbor_shared.cpp + ------------------- + W. 
Michael Brown (ORNL) + + Class for management of data shared by all neighbor lists + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include "lal_precision.h" +#include "lal_neighbor_shared.h" + +#ifdef USE_OPENCL +#include "neighbor_cpu_cl.h" +#include "neighbor_gpu_cl.h" +#else +#include "neighbor_cpu_ptx.h" +#include "neighbor_gpu_ptx.h" +#endif + +using namespace LAMMPS_AL; + +void NeighborShared::clear() { + if (_compiled) { + if (_gpu_nbor>0) { + if (_gpu_nbor==1) { + k_cell_id.clear(); + k_cell_counts.clear(); + } + k_build_nbor.clear(); + k_transpose.clear(); + k_special.clear(); + delete build_program; + } else { + k_nbor.clear(); + delete nbor_program; + } + _compiled=false; + } +} + +void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) { + if (_compiled) + return; + + _gpu_nbor=gpu_nbor; + std::string flags="-cl-fast-relaxed-math -cl-mad-enable -D"+ + std::string(OCL_VENDOR); + + if (_gpu_nbor==0) { + nbor_program=new UCL_Program(dev); + nbor_program->load_string(neighbor_cpu,flags.c_str()); + k_nbor.set_function(*nbor_program,"kernel_unpack"); + } else { + build_program=new UCL_Program(dev); + build_program->load_string(neighbor_gpu,flags.c_str()); + + if (_gpu_nbor==1) { + k_cell_id.set_function(*build_program,"calc_cell_id"); + k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts"); + } + k_build_nbor.set_function(*build_program,"calc_neigh_list_cell"); + k_transpose.set_function(*build_program,"transpose"); + k_special.set_function(*build_program,"kernel_special"); + neigh_tex.get_texture(*build_program,"neigh_tex"); + } + _compiled=true; +} diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h new file mode 100644 index 000000000..b579e0d60 --- /dev/null +++ b/lib/gpu/lal_neighbor_shared.h @@ -0,0 +1,61 @@ +/*************************************************************************** + neighbor_shared.h + ------------------- + W. 
Michael Brown (ORNL) + + Class for management of data shared by all neighbor lists + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_NEIGHBOR_SHARED_H +#define LAL_NEIGHBOR_SHARED_H + +#ifdef USE_OPENCL + +#include "geryon/ocl_kernel.h" +#include "geryon/ocl_texture.h" +using namespace ucl_opencl; + +#else + +#include "geryon/nvd_kernel.h" +#include "geryon/nvd_texture.h" +using namespace ucl_cudadr; + +#endif + +namespace LAMMPS_AL { + +class NeighborShared { + public: + NeighborShared() : _compiled(false) {} + ~NeighborShared() { clear(); } + + /// Free all memory on host and device + void clear(); + + /// Texture for cached position/type access with CUDA + UCL_Texture neigh_tex; + + /// Compile kernels for neighbor lists + void compile_kernels(UCL_Device &dev, const int gpu_nbor); + + // ----------------------------- Kernels + UCL_Program *nbor_program, *build_program; + UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor; + UCL_Kernel k_transpose, k_special; + + private: + bool _compiled; + int _gpu_nbor; +}; + +} + +#endif diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp new file mode 100644 index 000000000..870a22cd5 --- /dev/null +++ b/lib/gpu/lal_pppm.cpp @@ -0,0 +1,410 @@ +/*************************************************************************** + pppm.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for PPPM acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "pppm_cl.h" +#else +#include "pppm_f_ptx.h" +#include "pppm_d_ptx.h" +#endif +#include "lal_pppm.h" +#include <cassert> + +using namespace LAMMPS_AL; +#define PPPMT PPPM<numtyp, acctyp, grdtyp, grdtyp4> + +extern Device<PRECISION,ACC_PRECISION> global_device; + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +PPPMT::PPPM() : _allocated(false), _compiled(false), + _max_bytes(0) { + device=&global_device; + ans=new Answer<numtyp,acctyp>(); +} + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +PPPMT::~PPPM() { + clear(0.0); + delete ans; +} + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +int PPPMT::bytes_per_atom() const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1; +} + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, grdtyp **rho_coeff, + grdtyp **vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &flag) { + _max_bytes=10; + screen=_screen; + bool success=true; + + flag=device->init(*ans,nlocal,nall); + if (flag!=0) + return 0; + if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) { + flag=-5; + return 0; + } + if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) { + flag=-4; + return 0; 
+ } + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pppm_block(); + _pencil_size=device->num_mem_threads(); + _block_pencils=_block_size/_pencil_size; + + compile_kernels(*ucl_device); + + // Initialize timers for the selected GPU + time_in.init(*ucl_device); + time_in.zero(); + time_out.init(*ucl_device); + time_out.zero(); + time_map.init(*ucl_device); + time_map.zero(); + time_rho.init(*ucl_device); + time_rho.zero(); + time_interp.init(*ucl_device); + time_interp.zero(); + + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + + _allocated=true; + _max_bytes=0; + _max_an_bytes=ans->gpu_bytes(); + + _order=order; + _order_m_1=order-1; + _order2=_order_m_1*_order; + _nlower=-(_order-1)/2; + _nupper=order/2; + _nxlo_out=nxlo_out; + _nylo_out=nylo_out; + _nzlo_out=nzlo_out; + _nxhi_out=nxhi_out; + _nyhi_out=nyhi_out; + _nzhi_out=nzhi_out; + + _slab_volfactor=slab_volfactor; + _nx_pppm=nx_pppm; + _ny_pppm=ny_pppm; + _nz_pppm=nz_pppm; + + _max_brick_atoms=10; + + // Get rho_coeff on device + int n2lo=(1-order)/2; + int numel=order*( order/2 - n2lo + 1 ); + success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)== + UCL_SUCCESS); + UCL_H_Vec<grdtyp> view; + view.view(rho_coeff[0]+n2lo,numel,*ucl_device); + ucl_copy(d_rho_coeff,view,true); + _max_bytes+=d_rho_coeff.row_bytes(); + + // Allocate storage for grid + _npts_x=nxhi_out-nxlo_out+1; + _npts_y=nyhi_out-nylo_out+1; + _npts_z=nzhi_out-nzlo_out+1; + _npts_yx=_npts_x*_npts_y; + success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== + UCL_SUCCESS); + success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)== + UCL_SUCCESS); + success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== + UCL_SUCCESS); + *vd_brick=h_vd_brick.begin(); + _max_bytes+=d_brick.row_bytes(); + + // Allocate vector with count of atoms assigned to each grid point + _nlocal_x=_npts_x+_nlower-_nupper; + _nlocal_y=_npts_y+_nlower-_nupper; + _nlocal_z=_npts_z+_nlower-_nupper; + _nlocal_yx=_nlocal_x*_nlocal_y; + _atom_stride=_nlocal_x*_nlocal_y*_nlocal_z; + success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)== + UCL_SUCCESS); + _max_bytes+=d_brick_counts.row_bytes(); + + // Allocate storage for atoms assigned to each grid point + success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms, + *ucl_device)==UCL_SUCCESS); + _max_bytes+=d_brick_atoms.row_bytes(); + + // Allocate error flags for checking out of bounds atoms + success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS); + success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)== + UCL_SUCCESS); + if (!success) { + flag=-3; + return 0; + } + + d_error_flag.zero(); + _max_bytes+=1; + + _cpu_idle_time=0.0; + + return h_brick.begin(); +} + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +void PPPMT::clear(const double cpu_time) { + if (!_allocated) + return; + _allocated=false; + _precompute_done=false; + + d_brick.clear(); + h_brick.clear(); + h_vd_brick.clear(); + d_brick_counts.clear(); + h_error_flag.clear(); + d_error_flag.clear(); + d_brick_atoms.clear(); + + acc_timers(); + device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, + *ans,_max_bytes+_max_an_bytes,cpu_time, + _cpu_idle_time,screen); + + if (_compiled) { + k_particle_map.clear(); + k_make_rho.clear(); + k_interp.clear(); + delete pppm_program; + _compiled=false; + } + + time_in.clear(); + time_out.clear(); + time_map.clear(); + time_rho.clear(); + 
time_interp.clear(); + + ans->clear(); + device->clear(); +} + +// --------------------------------------------------------------------------- +// Charge assignment that can be performed asynchronously +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +void PPPMT::_precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, + const double delxinv, const double delyinv, + const double delzinv) { + acc_timers(); + if (nlocal==0) { + zero_timers(); + return; + } + + ans->inum(nlocal); + + if (ago==0) { + resize_atom(nlocal,nall,success); + resize_local(nlocal,success); + if (!success) + return; + + double bytes=ans->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + atom->add_x_data(host_x,host_type); + atom->add_q_data(); + + time_map.start(); + + // Compute the block size and grid size to keep all cores busy + int BX=this->block_size(); + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); + + int ainum=this->ans->inum(); + + // Boxlo adjusted to be upper left brick and shift for even spline order + double shift=0.0; + if (_order % 2) + shift=0.5; + _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; + _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; + _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; + + _delxinv=delxinv; + _delyinv=delyinv; + _delzinv=delzinv; + double delvolinv = delxinv*delyinv*delzinv; + grdtyp f_delvolinv = delvolinv; + + device->zero(d_brick_counts,d_brick_counts.numel()); + k_particle_map.set_size(GX,BX); + k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv, + &ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(), + &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, + &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, + &_atom_stride, &_max_brick_atoms, &d_error_flag.begin()); + time_map.stop(); + + time_rho.start(); + BX=block_size(); + + GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/ + _block_pencils)); + k_make_rho.set_size(GX,BX); + k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(), + &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, + &_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y, + &_nlocal_z, &_order_m_1, &_order, &_order2); + time_rho.stop(); + + time_out.start(); + ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true); + ucl_copy(h_error_flag,d_error_flag,true); + time_out.stop(); + + _precompute_done=true; +} + +// --------------------------------------------------------------------------- +// Charge spreading stuff +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +int PPPMT::spread(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, + const double delxinv, const double delyinv, + const double delzinv) { + if (_precompute_done==false) { + atom->acc_timers(); + _precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv, + delyinv,delzinv); + } + + device->stop_host_timer(); + + if (!success || nlocal==0) + return 0; + + double t=MPI_Wtime(); + time_out.sync_stop(); + _cpu_idle_time+=MPI_Wtime()-t; + + _precompute_done=false; + + if (h_error_flag[0]==2) { + // Not enough storage for atoms on the brick + 
_max_brick_atoms*=2; + d_error_flag.zero(); + d_brick_atoms.clear(); + d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device); + _max_bytes+=d_brick_atoms.row_bytes(); + return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + delxinv,delyinv,delzinv); + } + + return h_error_flag[0]; +} + +// --------------------------------------------------------------------------- +// Charge spreading stuff +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +void PPPMT::interp(const grdtyp qqrd2e_scale) { + time_in.start(); + ucl_copy(d_brick,h_vd_brick,true); + time_in.stop(); + + time_interp.start(); + // Compute the block size and grid size to keep all cores busy + int BX=this->block_size(); + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); + + int ainum=this->ans->inum(); + + k_interp.set_size(GX,BX); + k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, + &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx, + &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, + &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin()); + time_interp.stop(); + + ans->copy_answers(false,false,false,false); + device->add_ans_object(ans); +} + + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +double PPPMT::host_memory_usage() const { + return device->atom.host_memory_usage()+ + sizeof(PPPM<numtyp,acctyp,grdtyp,grdtyp4>); +} + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +void PPPMT::compile_kernels(UCL_Device &dev) { + if (_compiled) + return; + + if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" -D"+ + std::string(OCL_VENDOR); + #ifdef USE_OPENCL + flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+ + ucl_template_name<grdtyp>()+"4"; + #endif + + pppm_program=new UCL_Program(dev); + + #ifdef USE_OPENCL + pppm_program->load_string(pppm,flags.c_str()); + #else + if (sizeof(grdtyp)==sizeof(float)) + pppm_program->load_string(pppm_f,flags.c_str()); + else + pppm_program->load_string(pppm_d,flags.c_str()); + #endif + + k_particle_map.set_function(*pppm_program,"particle_map"); + k_make_rho.set_function(*pppm_program,"make_rho"); + k_interp.set_function(*pppm_program,"interp"); + pos_tex.get_texture(*pppm_program,"pos_tex"); + q_tex.get_texture(*pppm_program,"q_tex"); + + _compiled=true; +} + +template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>; +template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>; + diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu new file mode 100644 index 000000000..5204180e8 --- /dev/null +++ b/lib/gpu/lal_pppm.cu @@ -0,0 +1,267 @@ +// ************************************************************************** +// pppm.cu +// ------------------- +// W. 
Michael Brown (ORNL) +// +// Device code for PPPM acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_preprocessor.h" +texture<float4> pos_tex; +texture<float> q_tex; +#ifndef _DOUBLE_DOUBLE +ucl_inline float4 fetch_pos(const int& i, const float4 *pos) + { return tex1Dfetch(pos_tex, i); } +ucl_inline float fetch_q(const int& i, const float *q) + { return tex1Dfetch(q_tex, i); } +#endif + +// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error +// generated at runtime with use of pppm/gpu +#if (__CUDA_ARCH__ < 110) +#define atomicAdd(x,y) *(x)+=0 +#endif + +#else +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable +#endif + +// Number of threads per pencil for charge spread +#define PENCIL_SIZE MEM_THREADS +// Number of pencils per block for charge spread +#define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE) + +__kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_, + const grdtyp delvolinv, const int nlocal, + __global int *counts, __global grdtyp4 *ans, + const grdtyp b_lo_x, const grdtyp b_lo_y, + const grdtyp b_lo_z, const grdtyp delxinv, + const grdtyp delyinv, const grdtyp delzinv, + const int nlocal_x, const int nlocal_y, + const int nlocal_z, const int atom_stride, + const int max_atoms, __global int *error) { + // ii indexes the two interacting particles in gi + int ii=GLOBAL_ID_X; + + // Resequence the atom indices to avoid collisions during atomic ops + int nthreads=GLOBAL_SIZE_X; + ii=fast_mul(ii,PPPM_BLOCK_1D); + ii-=(ii/nthreads)*(nthreads-1); + + int nx,ny,nz; + + if (ii<nlocal) { + numtyp4 p=fetch_pos(ii,x_); + grdtyp4 delta; + delta.w=delvolinv*fetch_q(ii,q_); + + if (delta.w!=(grdtyp)0.0) { + delta.x=(p.x-b_lo_x)*delxinv; + nx=delta.x; + delta.y=(p.y-b_lo_y)*delyinv; + ny=delta.y; + delta.z=(p.z-b_lo_z)*delzinv; + nz=delta.z; + + if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || + nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) + *error=1; + else { + delta.x=nx+(grdtyp)0.5-delta.x; + delta.y=ny+(grdtyp)0.5-delta.y; + delta.z=nz+(grdtyp)0.5-delta.z; + + int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; + int old=atom_add(counts+i, 1); + if (old>=max_atoms) { + *error=2; + atom_add(counts+i, -1); + } else + ans[atom_stride*old+i]=delta; + } + } + } +} + +/* --------------------------- */ + +__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms, + __global grdtyp *brick, __global grdtyp *_rho_coeff, + const int atom_stride, const int npts_x, + const int npts_y, const int npts_z, const int nlocal_x, + const int nlocal_y, const int nlocal_z, + const int order_m_1, const int order, const int order2) { + __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; + __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE]; + __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; + + int tid=THREAD_ID_X; + if (tid<order2+order) + rho_coeff[tid]=_rho_coeff[tid]; + + int pid=tid/PENCIL_SIZE; + int fid=tid%PENCIL_SIZE; + int fid_halo=PENCIL_SIZE+fid; + if (fid<order) + front[pid][fid_halo]=(grdtyp)0.0; + + __syncthreads(); + + int bt=BLOCK_ID_X*BLOCK_PENCILS+pid; + int ny=bt%npts_y; + int nz=bt/npts_y; + int y_start=0; + int z_start=0; + int y_stop=order; + int z_stop=order; 
+ if (ny<order_m_1) + y_start=order_m_1-ny; + if (nz<order_m_1) + z_start=order_m_1-nz; + if (ny>=nlocal_y) + y_stop-=ny-nlocal_y+1; + if (nz>=nlocal_z) + z_stop-=nz-nlocal_z+1; + int z_stride=fast_mul(nlocal_x,nlocal_y); + + int loop_count=npts_x/PENCIL_SIZE+1; + int nx=fid; + int pt=fast_mul(nz,fast_mul(npts_y,npts_x))+fast_mul(ny,npts_x)+nx; + for (int i=0 ; i<loop_count; i++) { + for (int n=0; n<order; n++) + ans[n][tid]=(grdtyp)0.0; + if (nx<nlocal_x && nz<npts_z) { + int z_pos=fast_mul(nz+z_start-order_m_1,z_stride); + for (int m=z_start; m<z_stop; m++) { + int y_pos=fast_mul(ny+y_start-order_m_1,nlocal_x); + for (int l=y_start; l<y_stop; l++) { + int pos=z_pos+y_pos+nx; + int natoms=fast_mul(counts[pos],atom_stride); + for (int row=pos; row<natoms; row+=atom_stride) { + grdtyp4 delta=atoms[row]; + + grdtyp rho1d_1=(grdtyp)0.0; + grdtyp rho1d_2=(grdtyp)0.0; + for (int k=order2+order-1; k > -1; k-=order) { + rho1d_1=rho_coeff[k-l]+rho1d_1*delta.y; + rho1d_2=rho_coeff[k-m]+rho1d_2*delta.z; + } + delta.w*=rho1d_1*rho1d_2; + + for (int n=0; n<order; n++) { + grdtyp rho1d_0=(grdtyp)0.0; + for (int k=order2+n; k>=n; k-=order) + rho1d_0=rho_coeff[k]+rho1d_0*delta.x; + ans[n][tid]+=delta.w*rho1d_0; + } + } + y_pos+=nlocal_x; + } + z_pos+=z_stride; + } + } + + __syncthreads(); + if (fid<order) { + front[pid][fid]=front[pid][fid_halo]; + front[pid][fid_halo]=(grdtyp)0.0; + } else + front[pid][fid]=(grdtyp)0.0; + + for (int n=0; n<order; n++) { + front[pid][fid+n]+=ans[n][tid]; + __syncthreads(); + } + + if (nx<npts_x && nz<npts_z) + brick[pt]=front[pid][fid]; + pt+=PENCIL_SIZE; + nx+=PENCIL_SIZE; + } +} + +__kernel void interp(__global numtyp4 *x_, __global numtyp *q_, + const int nlocal, __global grdtyp4 *brick, + __global grdtyp *_rho_coeff, const int npts_x, + const int npts_yx, const grdtyp b_lo_x, + const grdtyp b_lo_y, const grdtyp b_lo_z, + const grdtyp delxinv, const grdtyp delyinv, + const grdtyp delzinv, const int order, + const int order2, const grdtyp qqrd2e_scale, + __global acctyp4 *ans) { + __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; + __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; + __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; + + int tid=THREAD_ID_X; + if (tid<order2+order) + rho_coeff[tid]=_rho_coeff[tid]; + __syncthreads(); + + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + int nx,ny,nz; + grdtyp tx,ty,tz; + + if (ii<nlocal) { + numtyp4 p=fetch_pos(ii,x_); + grdtyp qs=qqrd2e_scale*fetch_q(ii,q_); + + acctyp4 ek; + ek.x=(acctyp)0.0; + ek.y=(acctyp)0.0; + ek.z=(acctyp)0.0; + if (qs!=(grdtyp)0.0) { + tx=(p.x-b_lo_x)*delxinv; + nx=tx; + ty=(p.y-b_lo_y)*delyinv; + ny=ty; + tz=(p.z-b_lo_z)*delzinv; + nz=tz; + + grdtyp dx=nx+(grdtyp)0.5-tx; + grdtyp dy=ny+(grdtyp)0.5-ty; + grdtyp dz=nz+(grdtyp)0.5-tz; + + for (int k=0; k<order; k++) { + rho1d_0[k][tid]=(grdtyp)0.0; + rho1d_1[k][tid]=(grdtyp)0.0; + for (int l=order2+k; l>=k; l-=order) { + rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx; + rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy; + } + } + + int mz=fast_mul(nz,npts_yx)+nx; + for (int n=0; n<order; n++) { + grdtyp rho1d_2=(grdtyp)0.0; + for (int k=order2+n; k>=n; k-=order) + rho1d_2=rho_coeff[k]+rho1d_2*dz; + grdtyp z0=qs*rho1d_2; + int my=mz+fast_mul(ny,npts_x); + for (int m=0; m<order; m++) { + grdtyp y0=z0*rho1d_1[m][tid]; + for (int l=0; l<order; l++) { + grdtyp x0=y0*rho1d_0[l][tid]; + grdtyp4 el=brick[my+l]; + ek.x-=x0*el.x; + ek.y-=x0*el.y; + ek.z-=x0*el.z; + } + my+=npts_x; + } + mz+=npts_yx; + } + } + ans[ii]=ek; + } +} + diff --git 
a/lib/gpu/lal_pppm.h b/lib/gpu/lal_pppm.h new file mode 100644 index 000000000..e7740d14d --- /dev/null +++ b/lib/gpu/lal_pppm.h @@ -0,0 +1,196 @@ +/*************************************************************************** + pppm.h + ------------------- + W. Michael Brown (ORNL) + + Class for PPPM acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_PPPM_H +#define LAL_PPPM_H + +#include "mpi.h" +#include "lal_device.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> class Device; + +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +class PPPM { + public: + PPPM(); + virtual ~PPPM(); + + /// Clear any previous data and set up for a new LAMMPS run + /** Success will be: + * - 0 if successfull + * - -1 if fix gpu not found + * - -2 if GPU could not be found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order, + const int nxlo_out, const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, const int nzhi_out, + grdtyp **rho_coeff, grdtyp **vd_brick, + const double slab_volfactor, const int nx_pppm, + const int ny_pppm, const int nz_pppm, int &success); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for local atoms and realloc if not + inline void resize_local(const int inum, bool &success) { + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(const double cpu_time); + + /// Returns memory usage on device per atom + int bytes_per_atom() const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + ans->acc_timers(); + time_in.add_to_total(); + time_out.add_to_total(); + time_map.add_to_total(); + time_rho.add_to_total(); + time_interp.add_to_total(); + } + } + + /// Zero timers + inline void zero_timers() { + atom->zero_timers(); + ans->zero_timers(); + time_in.zero(); + time_out.zero(); + time_map.zero(); + time_rho.zero(); + time_interp.zero(); + } + + /// Precomputations for charge assignment that can be done asynchronously + inline void precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *charge, double *boxlo, double *prd) { + double delxinv=_nx_pppm/prd[0]; + double delyinv=_ny_pppm/prd[1]; + double delzinv=_nz_pppm/(prd[2]*_slab_volfactor); + _precompute(ago,nlocal,nall,host_x,host_type,success,charge,boxlo,delxinv, + delyinv,delzinv); + } + + /// Returns non-zero if out of bounds atoms + int spread(const int ago, const int nlocal, 
const int nall, double **host_x, + int *host_type, bool &success, double *charge, double *boxlo, + const double delxinv, const double delyinv, const double delzinv); + + void interp(const grdtyp qqrd2e_scale); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device<numtyp,acctyp> *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_in, time_out, time_map, time_rho, time_interp; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom<numtyp,acctyp> *atom; + + + // --------------------------- GRID DATA -------------------------- + + UCL_H_Vec<grdtyp> h_brick, h_vd_brick; + UCL_D_Vec<grdtyp> d_brick; + + // Count of number of atoms assigned to each grid point + UCL_D_Vec<int> d_brick_counts; + // Atoms assigned to each grid point + UCL_D_Vec<grdtyp4> d_brick_atoms; + + // Error checking for out of bounds atoms + UCL_D_Vec<int> d_error_flag; + UCL_H_Vec<int> h_error_flag; + + // Number of grid points in brick (including ghost) + int _npts_x, _npts_y, _npts_z, _npts_yx; + + // Number of local grid points in brick + int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride; + + // -------------------------- SPLINE DATA ------------------------- + UCL_D_Vec<grdtyp> d_rho_coeff; + int _order, _nlower, _nupper, _order_m_1, _order2; + int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer<numtyp,acctyp> *ans; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pppm_program; + UCL_Kernel k_particle_map, k_make_rho, k_interp; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _allocated, _compiled, _precompute_done; + int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms; + double _max_bytes, _max_an_bytes; + double _cpu_idle_time; + + grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; + + double _slab_volfactor; + int _nx_pppm, _ny_pppm, _nz_pppm; + + void compile_kernels(UCL_Device &dev); + void _precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *charge, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); +}; + +} + +#endif diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp new file mode 100644 index 000000000..e090f0034 --- /dev/null +++ b/lib/gpu/lal_pppm_ext.cpp @@ -0,0 +1,163 @@ +/*************************************************************************** + pppm_ext.cpp + ------------------- + W. 
Michael Brown (ORNL) + + Functions for LAMMPS access to PPPM acceleration routines + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_pppm.h" + +using namespace std; +using namespace LAMMPS_AL; + +static PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF; +static PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +template <class grdtyp, class memtyp> +grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, + FILE *screen, const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, grdtyp **rho_coeff, + grdtyp **vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, const int nz_pppm, + int &success) { + pppm.clear(0.0); + int first_gpu=pppm.device->first_device(); + int last_gpu=pppm.device->last_device(); + int world_me=pppm.device->world_me(); + int gpu_rank=pppm.device->gpu_rank(); + int procs_per_gpu=pppm.device->procs_per_gpu(); + + pppm.device->init_message(screen,"pppm",first_gpu,last_gpu); + + bool message=false; + if (pppm.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + success=0; + grdtyp * host_brick=NULL; + if (world_me==0) + host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out, + nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, + slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success); + + pppm.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out, + nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, + vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, + success); + + pppm.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + return host_brick; +} + +float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, float **rho_coeff, + float **vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, const int nz_pppm, + int &success) { + float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, + nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, + slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success); + PPPMF.device->set_single_precompute(&PPPMF); + return b; +} + +void pppm_gpu_clear_f(const double cpu_time) { + PPPMF.clear(cpu_time); +} + +int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall, + double 
**host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv) { + return PPPMF.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + delxinv,delyinv,delzinv); +} + +void pppm_gpu_interp_f(const float qqrd2e_scale) { + return PPPMF.interp(qqrd2e_scale); +} + +double pppm_gpu_bytes_f() { + return PPPMF.host_memory_usage(); +} + +double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff, + double **vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success) { + double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, + nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, + vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, + success); + PPPMF.device->set_double_precompute(&PPPMD); + return b; +} + +void pppm_gpu_clear_d(const double cpu_time) { + PPPMD.clear(cpu_time); +} + +int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv) { + return PPPMD.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + delxinv,delyinv,delzinv); +} + +void pppm_gpu_interp_d(const double qqrd2e_scale) { + return PPPMD.interp(qqrd2e_scale); +} + +double pppm_gpu_bytes_d() { + return PPPMD.host_memory_usage(); +} + diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h new file mode 100644 index 000000000..2ba544542 --- /dev/null +++ b/lib/gpu/lal_precision.h @@ -0,0 +1,95 @@ +/*************************************************************************** + precision.h + ------------------- + W. 
Michael Brown (ORNL) + + Data and preprocessor definitions for different precision modes + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_PRECISION_H +#define LAL_PRECISION_H + +struct _lgpu_float2 { + float x; float y; +}; + +struct _lgpu_float4 { + float x; float y; float z; float w; +}; + +struct _lgpu_double2 { + double x; double y; +}; + +struct _lgpu_double4 { + double x; double y; double z; double w; +}; + +#include <iostream> +inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) { + out << v.x << " " << v.y; + return out; +} + +inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) { + out << v.x << " " << v.y << " " << v.z; + return out; +} + +inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) { + out << v.x << " " << v.y; + return out; +} + +inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { + out << v.x << " " << v.y << " " << v.z; + return out; +} + +// PRECISION - Precision for rsq, energy, force, and torque calculation +// ACC_PRECISION - Precision for accumulation of energies, forces, and torques +#ifdef _SINGLE_DOUBLE +#define OCL_PRECISION_COMPILE "-D_SINGLE_DOUBLE" +#define PRECISION float +#define ACC_PRECISION double +#define numtyp2 _lgpu_float2 +#define numtyp4 _lgpu_float4 +#define acctyp4 _lgpu_double4 +#endif + +#ifdef _DOUBLE_DOUBLE +#define OCL_PRECISION_COMPILE "-D_DOUBLE_DOUBLE" +#define PRECISION double +#define ACC_PRECISION double +#define numtyp2 _lgpu_double2 +#define numtyp4 _lgpu_double4 +#define acctyp4 _lgpu_double4 +#endif + +#ifndef PRECISION +#define OCL_PRECISION_COMPILE "-D_SINGLE_SINGLE" +#define PRECISION float +#define ACC_PRECISION float +#define numtyp2 _lgpu_float2 +#define numtyp4 _lgpu_float4 +#define acctyp4 _lgpu_float4 +#endif + +enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; + +#ifdef FERMI_OCL +#define OCL_VENDOR "FERMI_OCL" +#endif + +#ifndef OCL_VENDOR +#define OCL_VENDOR "GENERIC_OCL" +#endif + +#endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h new file mode 100644 index 000000000..722860f51 --- /dev/null +++ b/lib/gpu/lal_preprocessor.h @@ -0,0 +1,319 @@ +// ************************************************************************** +// preprocessor.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for CUDA-specific preprocessor definitions +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +//************************************************************************* +// Preprocessor Definitions +// +// Note: It is assumed that constants with the same names are defined with +// the same values in all files. 
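//
// Illustrative note (the numbers are example values only, taken from the
// sm_20 defaults further down): for a pair-style launch, each block of
// BLOCK_PAIR threads services BLOCK_PAIR/THREADS_PER_ATOM atoms, so a
// host-side grid size is obtained roughly as
//   int atoms_per_block = BLOCK_PAIR / THREADS_PER_ATOM;        // 128/4 = 32
//   int grid_x = (inum + atoms_per_block - 1) / atoms_per_block;
// and the restriction MAX_SHARED_TYPES*MAX_SHARED_TYPES <= BLOCK_PAIR
// ensures one thread per type pair is available to stage the coefficient
// tables into shared memory (see the *_fast kernels).
//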
+// +// ARCH +// Definition: Architecture number for accelerator +// MEM_THREADS +// Definition: Number of threads with sequential ids accessing memory +// simultaneously on multiprocessor +// WARP_SIZE: +// Definition: Number of threads guaranteed to be on the same instruction +// THREADS_PER_ATOM +// Definition: Default number of threads assigned per atom for pair styles +// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE +// THREADS_PER_CHARGE +// Definition: Default number of threads assigned per atom for pair styles +// with charge +// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE +// PPPM_MAX_SPLINE +// Definition: Maximum order for splines in PPPM +// PPPM_BLOCK_1D +// Definition: Thread block size for PPPM kernels +// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE +// PPPM_BLOCK_1D%32==0 +// BLOCK_PAIR +// Definition: Default thread block size for pair styles +// Restrictions: +// MAX_SHARED_TYPES 8 +// Definition: Max # of atom type params can be stored in shared memory +// Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR +// BLOCK_CELL_2D +// Definition: Default block size in each dimension for cell list builds +// and matrix transpose +// BLOCK_CELL_ID +// Definition: Default block size for binning atoms in cell list builds +// BLOCK_NBOR_BUILD +// Definition: Default block size for neighbor list builds +// BLOCK_BIO_PAIR +// Definition: Default thread block size for "bio" pair styles +// MAX_BIO_SHARED_TYPES +// Definition: Max # of atom type params can be stored in shared memory +// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 && +// MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR +// +//*************************************************************************/ + +// ------------------------------------------------------------------------- +// CUDA DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef NV_KERNEL + +#ifdef __CUDA_ARCH__ +#define ARCH __CUDA_ARCH__ +#else +#define ARCH 100 +#endif + +#if (ARCH < 200) + +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 16 +#define BLOCK_NBOR_BUILD 64 +#define BLOCK_PAIR 64 +#define BLOCK_BIO_PAIR 64 +#define MAX_SHARED_TYPES 8 + +#else + +#define THREADS_PER_ATOM 4 +#define THREADS_PER_CHARGE 8 +#define BLOCK_NBOR_BUILD 128 +#define BLOCK_PAIR 128 +#define BLOCK_BIO_PAIR 128 +#define MAX_SHARED_TYPES 11 + +#endif + +#define WARP_SIZE 32 +#define PPPM_BLOCK_1D 64 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 +#define MAX_BIO_SHARED_TYPES 128 + +#ifdef _DOUBLE_DOUBLE +ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; } +ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; } +#endif + +#if (__CUDA_ARCH__ < 200) +#define fast_mul __mul24 +#define MEM_THREADS 16 +#else +#define fast_mul(X,Y) (X)*(Y) +#define MEM_THREADS 32 +#endif + +#ifdef CUDA_PRE_THREE +struct __builtin_align__(16) _double4 +{ + double x, y, z, w; +}; +typedef struct _double4 double4; +#endif + +#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) +#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) +#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); +#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); +#define THREAD_ID_X threadIdx.x +#define THREAD_ID_Y threadIdx.y +#define BLOCK_ID_X blockIdx.x +#define BLOCK_ID_Y blockIdx.y +#define BLOCK_SIZE_X blockDim.x +#define BLOCK_SIZE_Y blockDim.y +#define __kernel extern "C" __global__ +#define __local __shared__ +#define __global +#define atom_add 
atomicAdd +#define ucl_inline static __inline__ __device__ + + +#ifndef _DOUBLE_DOUBLE + +#define ucl_exp exp +#define ucl_powr pow +#define ucl_atan atan +#define ucl_cbrt cbrt +#define ucl_ceil ceil +#define ucl_abs fabs +#define ucl_rsqrt rsqrt +#define ucl_sqrt sqrt +#define ucl_recip(x) ((numtyp)1.0/(x)) + +#else + +#define ucl_atan atanf +#define ucl_cbrt cbrtf +#define ucl_ceil ceilf +#define ucl_abs fabsf +#define ucl_recip(x) ((numtyp)1.0/(x)) + +#ifdef NO_HARDWARE_TRANSCENDENTALS + +#define ucl_exp expf +#define ucl_powr powf +#define ucl_rsqrt rsqrtf +#define ucl_sqrt sqrtf + +#else + +#define ucl_exp __expf +#define ucl_powr __powf +#define ucl_rsqrt __rsqrtf +#define ucl_sqrt __sqrtf + +#endif + +#endif + +#endif + +// ------------------------------------------------------------------------- +// FERMI OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef FERMI_OCL + +#define USE_OPENCL +#define fast_mul(X,Y) (X)*(Y) +#define ARCH 0 +#define DRIVER 0 +#define MEM_THREADS 32 +#define THREADS_PER_ATOM 4 +#define THREADS_PER_CHARGE 8 +#define BLOCK_PAIR 128 +#define MAX_SHARED_TYPES 11 +#define BLOCK_NBOR_BUILD 128 +#define BLOCK_BIO_PAIR 128 + +#define WARP_SIZE 32 +#define PPPM_BLOCK_1D 64 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 +#define MAX_BIO_SHARED_TYPES 128 + +#pragma OPENCL EXTENSION cl_khr_fp64: enable + +#endif + +// ------------------------------------------------------------------------- +// GENERIC OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef GENERIC_OCL + +#define USE_OPENCL +#define fast_mul mul24 +#define ARCH 0 +#define DRIVER 0 +#define MEM_THREADS 16 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 +#define BLOCK_NBOR_BUILD 64 +#define BLOCK_BIO_PAIR 64 + +#define WARP_SIZE 1 +#define PPPM_BLOCK_1D 64 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 +#define MAX_BIO_SHARED_TYPES 128 + +#pragma OPENCL EXTENSION cl_khr_fp64: enable + +#endif + +// ------------------------------------------------------------------------- +// OPENCL Stuff for All Hardware +// ------------------------------------------------------------------------- +#ifdef USE_OPENCL + +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define GLOBAL_SIZE_X get_global_size(0) +#define THREAD_ID_Y get_local_id(1) +#define BLOCK_ID_Y get_group_id(1) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define ucl_inline inline +#define fetch_pos(i,y) x_[i] +#define fetch_q(i,y) q_[i] + +#define ucl_atan atan +#define ucl_cbrt cbrt +#define ucl_ceil ceil +#define ucl_abs fabs + +#ifdef NO_HARDWARE_TRANSCENDENTALS + +#define ucl_exp exp +#define ucl_powr powr +#define ucl_rsqrt rsqrt +#define ucl_sqrt sqrt +#define ucl_recip(x) ((numtyp)1.0/(x)) + +#else + +#define ucl_exp native_exp +#define ucl_powr native_powr +#define ucl_rsqrt native_rsqrt +#define ucl_sqrt native_sqrt +#define ucl_recip native_recip + +#endif + +#endif + +// ------------------------------------------------------------------------- +// ARCHITECTURE INDEPENDENT DEFINITIONS +// ------------------------------------------------------------------------- + +#define PPPM_MAX_SPLINE 8 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef 
_SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +ucl_inline int sbmask(int j) { return j >> SBBITS & 3; } + diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp new file mode 100644 index 000000000..50f0503b3 --- /dev/null +++ b/lib/gpu/lal_re_squared.cpp @@ -0,0 +1,310 @@ +/*************************************************************************** + re_squared.cpp + ------------------- + W. Michael Brown + + Host code for RE-Squared potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Fri May 06 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "re_squared_cl.h" +#include "re_squared_lj_cl.h" +#else +#include "re_squared_ptx.h" +#include "re_squared_lj_ptx.h" +#endif + +#include "lal_re_squared.h" +#include <cassert> +using namespace LAMMPS_AL; + +#define RESquaredT RESquared<numtyp, acctyp> +extern Device<PRECISION,ACC_PRECISION> device; + +template <class numtyp, class acctyp> +RESquaredT::RESquared() : BaseEllipsoid<numtyp,acctyp>(), + _allocated(false) { +} + +template <class numtyp, class acctyp> +RESquaredT::~RESquared() { + clear(); +} + +template <class numtyp, class acctyp> +int RESquaredT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom(max_nbors); +} + +template <class numtyp, class acctyp> +int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, int **h_form, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ntypes,h_form,re_squared,re_squared_lj,true); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + _shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) { + lj_types=max_shared_types; + _shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for copying type data + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, + host_sigma,host_epsilon); + + this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + 
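  // Layout note (descriptive only): the per-pair coefficient tables allocated
  // in this routine (sigma_epsilon, cut_form, lj1, lj3) are flat
  // lj_types*lj_types vectors, so the device kernels recover the entry for an
  // (itype,jtype) pair as
  //   int mtype = itype*lj_types + jtype;  // fast_mul(ntypes,itype)+jtype on the GPU
  // e.g. sig_eps[mtype].x holds sigma and sig_eps[mtype].y holds epsilon in
  // kernel_ellipsoid below.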
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, + host_cutsq,h_form); + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq,h_form); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + dev_error.alloc(1,*(this->ucl_device)); + dev_error.zero(); + + // Allocate, cast and asynchronous memcpy of constant data + // Copy data for bonded interactions + special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + host_write[0]=static_cast<numtyp>(host_special_lj[0]); + host_write[1]=static_cast<numtyp>(host_special_lj[1]); + host_write[2]=static_cast<numtyp>(host_special_lj[2]); + host_write[3]=static_cast<numtyp>(host_special_lj[3]); + ucl_copy(special_lj,host_write,4,false); + + // Copy shape, well, sigma, epsilon, and cutsq onto GPU + // - cast if necessary + shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<ntypes; i++) { + host_write[i*4]=host_shape[i][0]; + host_write[i*4+1]=host_shape[i][1]; + host_write[i*4+2]=host_shape[i][2]; + } + UCL_H_Vec<numtyp4> view4; + view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + ucl_copy(shape,view4,false); + + well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<ntypes; i++) { + host_write[i*4]=host_well[i][0]; + host_write[i*4+1]=host_well[i][1]; + host_write[i*4+2]=host_well[i][2]; + } + view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); + ucl_copy(well,view4,false); + + _allocated=true; + this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ + lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+ + shape.row_bytes()+well.row_bytes(); + + return 0; +} + +template <class numtyp, class acctyp> +void RESquaredT::clear() { + if (!_allocated) + return; + + UCL_H_Vec<int> err_flag(1,*(this->ucl_device)); + ucl_copy(err_flag,dev_error,false); + if (err_flag[0] == 2) + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + err_flag.clear(); + + _allocated=false; + + dev_error.clear(); + lj1.clear(); + lj3.clear(); + sigma_epsilon.clear(); + this->cut_form.clear(); + + shape.clear(); + well.clear(); + special_lj.clear(); + + this->clear_base(); +} + +template <class numtyp, class acctyp> +double RESquaredT::host_memory_usage() const { + return this->host_memory_usage_base()+sizeof(RESquaredT)+ + 4*sizeof(numtyp); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void RESquaredT::loop(const bool _eflag, const bool _vflag) { + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=0, NGX; + int stride=this->nbor->nbor_pitch(); + int ainum=this->ans->inum(); + + if (this->_multiple_forms) { + if (this->_last_ellipse>0) { + // ------------ ELLIPSE_ELLIPSE --------------- + this->time_nbor1.start(); + GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + + 
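      // Sizing note (worked example with illustrative numbers; BX and
      // _threads_per_atom come from the device configuration): with BX=128,
      // _threads_per_atom=4 and _last_ellipse=10000, the block counts
      // computed above evaluate to
      //   GX  = ceil(10000 / (128/4)) = 313   // force kernel, 4 threads cooperate per atom
      //   NGX = ceil(10000 / 128)     = 79    // pack_nbors, one thread per atom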
this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, + &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, + &this->_threads_per_atom); + this->time_ellipsoid.stop(); + + // ------------ ELLIPSE_SPHERE --------------- + this->time_nbor2.start(); + this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, + ELLIPSE_SPHERE,_shared_types,_lj_types); + this->time_nbor2.stop(); + + this->time_ellipsoid2.start(); + this->k_ellipsoid_sphere.set_size(GX,BX); + this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, + &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, + &this->_threads_per_atom); + this->time_ellipsoid2.stop(); + + if (this->_last_ellipse==this->ans->inum()) { + this->time_nbor3.zero(); + this->time_ellipsoid3.zero(); + this->time_lj.zero(); + return; + } + + // ------------ SPHERE_ELLIPSE --------------- + + this->time_nbor3.start(); + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); + NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()- + this->_last_ellipse)/BX)); + this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor3.stop(); + + this->time_ellipsoid3.start(); + this->k_sphere_ellipsoid.set_size(GX,BX); + this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), + &this->well.begin(), &this->special_lj.begin(), + &this->sigma_epsilon.begin(), &this->_lj_types, + &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + this->time_ellipsoid3.stop(); + } else { + this->ans->dev_ans.zero(); + this->ans->dev_engv.zero(); + this->time_nbor1.zero(); + this->time_ellipsoid.zero(); + this->time_nbor2.zero(); + this->time_ellipsoid2.zero(); + this->time_nbor3.zero(); + this->time_ellipsoid3.zero(); + } + + // ------------ LJ --------------- + this->time_lj.start(); + if (this->_last_ellipse<this->ans->inum()) { + if (this->_shared_types) { + this->k_lj_fast.set_size(GX,BX); + this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->special_lj.begin(), &stride, + &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), + &eflag, &vflag, &this->_last_ellipse, &ainum, + &this->_threads_per_atom); + } else { + this->k_lj.set_size(GX,BX); + this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(), + &this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(), + &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, + &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + } + } + this->time_lj.stop(); + } else { + 
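    // Single-form path: no per-form neighbor splitting is needed here, so one
    // pack_nbors pass and one kernel_ellipsoid launch below cover all
    // ans->inum() atoms, sized with the same ceil(n/(BX/_threads_per_atom))
    // pattern used in the branch above.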
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); + this->time_nbor1.start(); + this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, + ELLIPSE_ELLIPSE,_shared_types,_lj_types); + this->time_nbor1.stop(); + this->time_ellipsoid.start(); + this->k_ellipsoid.set_size(GX,BX); + this->k_ellipsoid.run(&this->atom->dev_x.begin(), + &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), + &this->special_lj.begin(), &this->sigma_epsilon.begin(), + &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, + &this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(), + &this->dev_error.begin(), &eflag, &vflag, &ainum, + &this->_threads_per_atom); + this->time_ellipsoid.stop(); + } +} + +template class RESquared<PRECISION,ACC_PRECISION>; + diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu new file mode 100644 index 000000000..63057a30d --- /dev/null +++ b/lib/gpu/lal_re_squared.cu @@ -0,0 +1,452 @@ +// ************************************************************************** +// re_squared.cu +// ------------------- +// W. Michael Brown +// +// Device code for RE-Squared potential acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : Fri May 06 2011 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_ellipsoid_extra.h" +#endif + +ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) +{ + numtyp ans; + ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] - + m[3]*m2[1]*m[8] + m[3]*m2[2]*m[7] + + m[6]*m2[1]*m[5] - m[6]*m2[2]*m[4] + + m[0]*m2[4]*m[8] - m[0]*m2[5]*m[7] - + m2[3]*m[1]*m[8] + m2[3]*m[2]*m[7] + + m[6]*m[1]*m2[5] - m[6]*m[2]*m2[4] + + m[0]*m[4]*m2[8] - m[0]*m[5]*m2[7] - + m[3]*m[1]*m2[8] + m[3]*m[2]*m2[7] + + m2[6]*m[1]*m[5] - m2[6]*m[2]*m[4]; + return ans; +} + +__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global int *dev_nbor, + const int stride, __global acctyp4 *ans, + const int astride, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=ucl_cbrt((numtyp)60.0); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *nbor_end; + int i, numj, n_stride; + nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix=x_[i]; + int itype=ix.w; + + numtyp a1[9]; // Rotation matrix (lab->body) + numtyp aTe1[9]; // A'*E + numtyp gamma1[9]; // A'*S^2*A + numtyp sa1[9]; // S^2*A; + numtyp lA1_0[9], lA1_1[9], lA1_2[9]; // -A*rotation generator (x,y, or z) + numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA + 
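    // For reference, gamma1 = A1'*S1^2*A1 enters the anisotropic contact term
    // evaluated in the neighbor loop below as
    //   s = (gamma1+gamma2) \ rhat;  sigma12 = rsqrt((numtyp)0.5*dot(s,rhat));
    // i.e. sigma12 = [ 0.5*rhat'*(G1+G2)^-1*rhat ]^(-1/2), while the
    // rotation-generator workspaces declared here (lA1_*, lAtwo1_*, lAsa1_*)
    // feed the torque accumulation at the end of the loop.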
numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa + numtyp4 ishape; + + ishape=shape[itype]; + numtyp4 ishape2; + ishape2.x=ishape.x*ishape.x; + ishape2.y=ishape.y*ishape.y; + ishape2.z=ishape.z*ishape.z; + numtyp ilshape = ishape.x*ishape.y*ishape.z; + + { + numtyp aTs[9]; // A1'*S1^2 + gpu_quat_to_mat_trans(q,i,a1); + gpu_transpose_times_diag3(a1,well[itype],aTe1); + gpu_transpose_times_diag3(a1,ishape2,aTs); + gpu_diag_times3(ishape2,a1,sa1); + gpu_times3(aTs,a1,gamma1); + gpu_rotation_generator_x(a1,lA1_0); + gpu_rotation_generator_y(a1,lA1_1); + gpu_rotation_generator_z(a1,lA1_2); + gpu_times3(aTs,lA1_0,lAtwo1_0); + gpu_transpose_times3(lA1_0,sa1,lAsa1_0); + gpu_plus3(lAsa1_0,lAtwo1_0,lAsa1_0); + gpu_times3(aTs,lA1_1,lAtwo1_1); + gpu_transpose_times3(lA1_1,sa1,lAsa1_1); + gpu_plus3(lAsa1_1,lAtwo1_1,lAsa1_1); + gpu_times3(aTs,lA1_2,lAtwo1_2); + gpu_transpose_times3(lA1_2,sa1,lAsa1_2); + gpu_plus3(lAsa1_2,lAtwo1_2,lAsa1_2); + } + ishape2.x=ucl_recip(ishape2.x); + ishape2.y=ucl_recip(ishape2.y); + ishape2.z=ucl_recip(ishape2.z); + + numtyp factor_lj; + for ( ; nbor<nbor_end; nbor+=n_stride) { + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp r[3], rhat[3]; + numtyp rnorm; + r[0] = jx.x-ix.x; + r[1] = jx.y-ix.y; + r[2] = jx.z-ix.z; + rnorm = gpu_dot3(r,r); + rnorm = ucl_rsqrt(rnorm); + rhat[0] = r[0]*rnorm; + rhat[1] = r[1]*rnorm; + rhat[2] = r[2]*rnorm; + + + numtyp a2[9]; // Rotation matrix (lab->body) + numtyp gamma2[9]; // A'*S^2*A + numtyp4 jshape; + + jshape=shape[jtype]; + numtyp4 jshape2; + jshape2.x=jshape.x*jshape.x; + jshape2.y=jshape.y*jshape.y; + jshape2.z=jshape.z*jshape.z; + { + numtyp aTs[9]; // A1'*S1^2 + gpu_quat_to_mat_trans(q,j,a2); + gpu_transpose_times_diag3(a2,jshape2,aTs); + gpu_times3(aTs,a2,gamma2); + } + + numtyp temp[9], s[3], z1[3], z2[3], v1[3], v2[3]; + numtyp sigma12, sigma1, sigma2; + gpu_plus3(gamma1,gamma2,temp); + gpu_mldivide3(temp,rhat,s,err_flag); + sigma12 = ucl_rsqrt((numtyp)0.5*gpu_dot3(s,rhat)); + gpu_times_column3(a1,rhat,z1); + gpu_times_column3(a2,rhat,z2); + v1[0] = z1[0]*ishape2.x; + v1[1] = z1[1]*ishape2.y; + v1[2] = z1[2]*ishape2.z; + v2[0] = z2[0]/jshape2.x; + v2[1] = z2[1]/jshape2.y; + v2[2] = z2[2]/jshape2.z; + sigma1 = ucl_sqrt(gpu_dot3(z1,v1)); + sigma2 = ucl_sqrt(gpu_dot3(z2,v2)); + + numtyp H12[9]; + numtyp dH; + H12[0] = gamma1[0]*sigma1+gamma2[0]*sigma2; + H12[1] = gamma1[1]*sigma1+gamma2[1]*sigma2; + H12[2] = gamma1[2]*sigma1+gamma2[2]*sigma2; + H12[3] = gamma1[3]*sigma1+gamma2[3]*sigma2; + H12[4] = gamma1[4]*sigma1+gamma2[4]*sigma2; + H12[5] = gamma1[5]*sigma1+gamma2[5]*sigma2; + H12[6] = gamma1[6]*sigma1+gamma2[6]*sigma2; + H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2; + H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2; + dH=gpu_det3(H12); + + numtyp sigma1p2, sigma2p2, lambda, nu; + sigma1p2 = sigma1*sigma1; + sigma2p2 = sigma2*sigma2; + numtyp jlshape = jshape.x*jshape.y*jshape.z; + lambda = ilshape*sigma1p2 + jlshape*sigma2p2; + + + sigma1=ucl_recip(sigma1); + sigma2=ucl_recip(sigma2); + + nu = ucl_rsqrt((sigma1+sigma2)/dH); + gpu_times3(aTe1,a1,temp); + + numtyp sigma, epsilon; + int mtype=fast_mul(ntypes,itype)+jtype; + sigma = sig_eps[mtype].x; + epsilon = sig_eps[mtype].y*factor_lj; + + numtyp w[3], temp2[9]; + numtyp h12,eta,chi,sprod,sigh,tprod; + numtyp aTe2[9]; // A'*E + gpu_transpose_times_diag3(a2,well[jtype],aTe2); + gpu_times3(aTe2,a2,temp2); + gpu_plus3(temp,temp2,temp); + gpu_mldivide3(temp,rhat,w,err_flag); + h12 = 
ucl_recip(rnorm)-sigma12; + eta = lambda/nu; + chi = (numtyp)2.0*gpu_dot3(rhat,w); + sprod = ilshape * jlshape; + sigh = sigma/h12; + tprod = eta*chi*sigh; + + numtyp stemp, Ua; + stemp = h12*(numtyp)0.5; + Ua = (ishape.x+stemp)*(ishape.y+stemp)* + (ishape.z+stemp)*(jshape.x+stemp)* + (jshape.y+stemp)*(jshape.z+stemp); + Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*sprod/Ua; + Ua = epsilon*Ua/(numtyp)-36.0; + + numtyp Ur; + stemp = h12/cr60; + Ur = (ishape.x+stemp)*(ishape.y+stemp)* + (ishape.z+stemp)*(jshape.x+stemp)* + (jshape.y+stemp)*(jshape.z+stemp); + Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur; + numtyp sigh6=sigh*sigh*sigh; + sigh6*=sigh6; + Ur = epsilon*Ur*sigh6/(numtyp)2025.0; + + energy+=Ua+Ur; + + // force + + numtyp vsigma1[3], vsigma2[3], gsigma1[9], gsigma2[9]; + numtyp sec, sigma12p3, sigma1p3, sigma2p3; + sec = sigma*eta*chi; + sigma12p3 = sigma12*sigma12*sigma12; + sigma1p3 = sigma1/sigma1p2; + sigma2p3 = sigma2/sigma2p2; + vsigma1[0] = -sigma1p3*v1[0]; + vsigma1[1] = -sigma1p3*v1[1]; + vsigma1[2] = -sigma1p3*v1[2]; + vsigma2[0] = -sigma2p3*v2[0]; + vsigma2[1] = -sigma2p3*v2[1]; + vsigma2[2] = -sigma2p3*v2[2]; + gsigma1[0] = -gamma1[0]*sigma1p2; + gsigma1[1] = -gamma1[1]*sigma1p2; + gsigma1[2] = -gamma1[2]*sigma1p2; + gsigma1[3] = -gamma1[3]*sigma1p2; + gsigma1[4] = -gamma1[4]*sigma1p2; + gsigma1[5] = -gamma1[5]*sigma1p2; + gsigma1[6] = -gamma1[6]*sigma1p2; + gsigma1[7] = -gamma1[7]*sigma1p2; + gsigma1[8] = -gamma1[8]*sigma1p2; + gsigma2[0] = -gamma2[0]*sigma2p2; + gsigma2[1] = -gamma2[1]*sigma2p2; + gsigma2[2] = -gamma2[2]*sigma2p2; + gsigma2[3] = -gamma2[3]*sigma2p2; + gsigma2[4] = -gamma2[4]*sigma2p2; + gsigma2[5] = -gamma2[5]*sigma2p2; + gsigma2[6] = -gamma2[6]*sigma2p2; + gsigma2[7] = -gamma2[7]*sigma2p2; + gsigma2[8] = -gamma2[8]*sigma2p2; + + numtyp tsig1sig2, tdH, teta1, teta2; + numtyp fourw[3], spr[3]; + tsig1sig2 = eta/((numtyp)2.0*(sigma1+sigma2)); + tdH = eta/((numtyp)2.0*dH); + teta1 = (numtyp)2.0*eta/lambda; + teta2 = teta1*jlshape/sigma2p3; + teta1 = teta1*ilshape/sigma1p3; + fourw[0] = (numtyp)4.0*w[0]; + fourw[1] = (numtyp)4.0*w[1]; + fourw[2] = (numtyp)4.0*w[2]; + spr[0] = (numtyp)0.5*sigma12p3*s[0]; + spr[1] = (numtyp)0.5*sigma12p3*s[1]; + spr[2] = (numtyp)0.5*sigma12p3*s[2]; + + numtyp hsec, dspu, pbsu; + stemp = ucl_recip(ishape.x*(numtyp)2.0+h12)+ + ucl_recip(ishape.y*(numtyp)2.0+h12)+ + ucl_recip(ishape.z*(numtyp)2.0+h12)+ + ucl_recip(jshape.x*(numtyp)2.0+h12)+ + ucl_recip(jshape.y*(numtyp)2.0+h12)+ + ucl_recip(jshape.z*(numtyp)2.0+h12); + hsec = ucl_recip(h12+(numtyp)3.0*sec); + dspu = ucl_recip(h12)-hsec+stemp; + pbsu = (numtyp)3.0*sigma*hsec; + + numtyp dspr, pbsr; + stemp = ucl_recip(ishape.x*cr60+h12)+ + ucl_recip(ishape.y*cr60+h12)+ + ucl_recip(ishape.z*cr60+h12)+ + ucl_recip(jshape.x*cr60+h12)+ + ucl_recip(jshape.y*cr60+h12)+ + ucl_recip(jshape.z*cr60+h12); + hsec = ucl_recip(h12+b_alpha*sec); + dspr = (numtyp)7.0/h12-hsec+stemp; + pbsr = b_alpha*sigma*hsec; + + numtyp dH12[9]; + numtyp dUa, dUr, deta, dchi, ddH, dh12; + numtyp dsigma1, dsigma2; + + #pragma unroll + for (int i=0; i<3; i++) { + numtyp u[3], u1[3], u2[3]; + u[0] = -rhat[i]*rhat[0]; + u[1] = -rhat[i]*rhat[1]; + u[2] = -rhat[i]*rhat[2]; + u[i] += (numtyp)1.0; + u[0] *= rnorm; + u[1] *= rnorm; + u[2] *= rnorm; + gpu_times_column3(a1,u,u1); + gpu_times_column3(a2,u,u2); + dsigma1=gpu_dot3(u1,vsigma1); + dsigma2=gpu_dot3(u2,vsigma2); + dH12[0] = dsigma1*gsigma1[0]+dsigma2*gsigma2[0]; + dH12[1] = dsigma1*gsigma1[1]+dsigma2*gsigma2[1]; + dH12[2] = dsigma1*gsigma1[2]+dsigma2*gsigma2[2]; + 
dH12[3] = dsigma1*gsigma1[3]+dsigma2*gsigma2[3]; + dH12[4] = dsigma1*gsigma1[4]+dsigma2*gsigma2[4]; + dH12[5] = dsigma1*gsigma1[5]+dsigma2*gsigma2[5]; + dH12[6] = dsigma1*gsigma1[6]+dsigma2*gsigma2[6]; + dH12[7] = dsigma1*gsigma1[7]+dsigma2*gsigma2[7]; + dH12[8] = dsigma1*gsigma1[8]+dsigma2*gsigma2[8]; + ddH = det_prime(H12,dH12); + deta = (dsigma1+dsigma2)*tsig1sig2; + deta -= ddH*tdH; + deta -= dsigma1*teta1+dsigma2*teta2; + dchi = gpu_dot3(u,fourw); + dh12 = rhat[i]+gpu_dot3(u,spr); + dUa = pbsu*(eta*dchi+deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi+deta*chi)-dh12*dspr; + numtyp force=dUr*Ur+dUa*Ua; + if (i==0) { + f.x+=force; + if (vflag>0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; + } + } + } + + // torque on i + sigma1=ucl_recip(sigma1); + + numtyp fwae[3], p[3]; + gpu_row_times3(fourw,aTe1,fwae); + + { + gpu_times_column3(lA1_0,rhat,p); + dsigma1 = gpu_dot3(p,vsigma1); + dH12[0] = lAsa1_0[0]*sigma1+dsigma1*gsigma1[0]; + dH12[1] = lAsa1_0[1]*sigma1+dsigma1*gsigma1[1]; + dH12[2] = lAsa1_0[2]*sigma1+dsigma1*gsigma1[2]; + dH12[3] = lAsa1_0[3]*sigma1+dsigma1*gsigma1[3]; + dH12[4] = lAsa1_0[4]*sigma1+dsigma1*gsigma1[4]; + dH12[5] = lAsa1_0[5]*sigma1+dsigma1*gsigma1[5]; + dH12[6] = lAsa1_0[6]*sigma1+dsigma1*gsigma1[6]; + dH12[7] = lAsa1_0[7]*sigma1+dsigma1*gsigma1[7]; + dH12[8] = lAsa1_0[8]*sigma1+dsigma1*gsigma1[8]; + ddH = det_prime(H12,dH12); + deta = tsig1sig2*dsigma1-tdH*ddH; + deta -= teta1*dsigma1; + numtyp tempv[3]; + gpu_times_column3(lA1_0,w,tempv); + dchi = -gpu_dot3(fwae,tempv); + gpu_times_column3(lAtwo1_0,spr,tempv); + dh12 = -gpu_dot3(s,tempv); + + dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; + tor.x -= (dUa*Ua+dUr*Ur); + } + + { + gpu_times_column3(lA1_1,rhat,p); + dsigma1 = gpu_dot3(p,vsigma1); + dH12[0] = lAsa1_1[0]*sigma1+dsigma1*gsigma1[0]; + dH12[1] = lAsa1_1[1]*sigma1+dsigma1*gsigma1[1]; + dH12[2] = lAsa1_1[2]*sigma1+dsigma1*gsigma1[2]; + dH12[3] = lAsa1_1[3]*sigma1+dsigma1*gsigma1[3]; + dH12[4] = lAsa1_1[4]*sigma1+dsigma1*gsigma1[4]; + dH12[5] = lAsa1_1[5]*sigma1+dsigma1*gsigma1[5]; + dH12[6] = lAsa1_1[6]*sigma1+dsigma1*gsigma1[6]; + dH12[7] = lAsa1_1[7]*sigma1+dsigma1*gsigma1[7]; + dH12[8] = lAsa1_1[8]*sigma1+dsigma1*gsigma1[8]; + ddH = det_prime(H12,dH12); + deta = tsig1sig2*dsigma1-tdH*ddH; + deta -= teta1*dsigma1; + numtyp tempv[3]; + gpu_times_column3(lA1_1,w,tempv); + dchi = -gpu_dot3(fwae,tempv); + gpu_times_column3(lAtwo1_1,spr,tempv); + dh12 = -gpu_dot3(s,tempv); + + dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; + tor.y -= (dUa*Ua+dUr*Ur); + } + + { + gpu_times_column3(lA1_2,rhat,p); + dsigma1 = gpu_dot3(p,vsigma1); + dH12[0] = lAsa1_2[0]*sigma1+dsigma1*gsigma1[0]; + dH12[1] = lAsa1_2[1]*sigma1+dsigma1*gsigma1[1]; + dH12[2] = lAsa1_2[2]*sigma1+dsigma1*gsigma1[2]; + dH12[3] = lAsa1_2[3]*sigma1+dsigma1*gsigma1[3]; + dH12[4] = lAsa1_2[4]*sigma1+dsigma1*gsigma1[4]; + dH12[5] = lAsa1_2[5]*sigma1+dsigma1*gsigma1[5]; + dH12[6] = lAsa1_2[6]*sigma1+dsigma1*gsigma1[6]; + dH12[7] = lAsa1_2[7]*sigma1+dsigma1*gsigma1[7]; + dH12[8] = lAsa1_2[8]*sigma1+dsigma1*gsigma1[8]; + ddH = det_prime(H12,dH12); + deta = tsig1sig2*dsigma1-tdH*ddH; + deta -= teta1*dsigma1; + numtyp tempv[3]; + gpu_times_column3(lA1_2,w,tempv); + dchi = -gpu_dot3(fwae,tempv); + 
gpu_times_column3(lAtwo1_2,spr,tempv); + dh12 = -gpu_dot3(s,tempv); + + dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; + dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; + tor.z -= (dUa*Ua+dUr*Ur); + } + + } // for nbor + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h new file mode 100644 index 000000000..c7441ed83 --- /dev/null +++ b/lib/gpu/lal_re_squared.h @@ -0,0 +1,90 @@ +/*************************************************************************** + re_squared.h + ------------------- + W. Michael Brown (ORNL) + + Host code for RE-Squared potential acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : Fri May 06 2011 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef RE_SQUARED_H +#define RE_SQUARED_H + +#include "lal_base_ellipsoid.h" +#include "mpi.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class RESquared : public BaseEllipsoid<numtyp, acctyp> { + public: + RESquared(); + ~RESquared(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \return false if there is not sufficient memory or device init prob + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, double **host_epsilon, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Device Error Flag - Set if a bad matrix inversion occurs + UCL_D_Vec<int> dev_error; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon + UCL_D_Vec<numtyp2> sigma_epsilon; + /// special lj 0-4 + UCL_D_Vec<numtyp> special_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool _shared_types; + int _lj_types; + + // --------------------------- ATOM DATA -------------------------- + + /// Aspherical Const Data for Atoms + UCL_D_Vec<numtyp4> shape, well; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp new file mode 100644 index 000000000..09e4c15c4 --- /dev/null +++ 
b/lib/gpu/lal_re_squared_ext.cpp @@ -0,0 +1,138 @@ +/*************************************************************************** + re_squared_ext.cpp + ------------------- + W. Michael Brown + + LAMMPS Wrappers for RE-Squared Acceleration + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "lal_re_squared.h" + +using namespace std; +using namespace LAMMPS_AL; + +static RESquared<PRECISION,ACC_PRECISION> REMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + REMF.clear(); + gpu_mode=REMF.device->gpu_mode(); + double gpu_split=REMF.device->particle_split(); + int first_gpu=REMF.device->first_device(); + int last_gpu=REMF.device->last_device(); + int world_me=REMF.device->world_me(); + int gpu_rank=REMF.device->gpu_rank(); + int procs_per_gpu=REMF.device->procs_per_gpu(); + + REMF.device->init_message(screen,"resquared",first_gpu,last_gpu); + + bool message=false; + if (REMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, + form, host_lj1, host_lj2, host_lj3, host_lj4, offset, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, + gpu_split, screen); + + REMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, + form, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, + max_nbors, maxspecial, cell_size, gpu_split, screen); + + REMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + REMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Clear memory on host and device +// --------------------------------------------------------------------------- +void re_gpu_clear() { + REMF.clear(); +} + + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool 
&success, + double **host_quat); + +int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double **host_quat) { + return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, + host_start, ilist, jnum, cpu_time, success, host_quat); +} + +int * re_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat) { + return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist, + numj, firstneigh, eflag, vflag, eatom, vatom, host_start, + cpu_time, success, host_quat); +} + +// --------------------------------------------------------------------------- +// Return memory usage +// --------------------------------------------------------------------------- +double re_gpu_bytes() { + return REMF.host_memory_usage(); +} + diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu new file mode 100644 index 000000000..a0c82ea29 --- /dev/null +++ b/lib/gpu/lal_re_squared_lj.cu @@ -0,0 +1,696 @@ +// ************************************************************************** +// re_squared_lj.cu +// ------------------- +// W. Michael Brown +// +// Device code for RE-Squared - Lennard-Jones potential acceleration +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : Fri May 06 2011 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_ellipsoid_extra.h" +#endif + +__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=ucl_cbrt((numtyp)60.0); + solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *nbor_end; + int i, numj, n_stride; + nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix=x_[i]; + int itype=ix.w; + + numtyp a[9]; // Rotation matrix (lab->body) + numtyp aTe[9]; // A'*E + numtyp lA_0[9], 
lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z) + + numtyp4 ishape; + ishape=shape[itype]; + numtyp ilshape=ishape.x*ishape.y*ishape.z; + + { + gpu_quat_to_mat_trans(q,i,a); + gpu_transpose_times_diag3(a,well[itype],aTe); + gpu_rotation_generator_x(a,lA_0); + gpu_rotation_generator_y(a,lA_1); + gpu_rotation_generator_z(a,lA_2); + } + + numtyp factor_lj; + for ( ; nbor<nbor_end; nbor+=n_stride) { + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp r[3], rhat[3]; + numtyp rnorm; + r[0] = jx.x-ix.x; + r[1] = jx.y-ix.y; + r[2] = jx.z-ix.z; + rnorm = gpu_dot3(r,r); + rnorm = ucl_rsqrt(rnorm); + rhat[0] = r[0]*rnorm; + rhat[1] = r[1]*rnorm; + rhat[2] = r[2]*rnorm; + + numtyp sigma, epsilon; + int mtype=fast_mul(ntypes,itype)+jtype; + sigma = sig_eps[mtype].x; + epsilon = sig_eps[mtype].y*factor_lj; + + numtyp aTs[9]; + numtyp4 scorrect; + numtyp half_sigma=sigma*(numtyp)0.5; + scorrect.x = ishape.x+half_sigma; + scorrect.y = ishape.y+half_sigma; + scorrect.z = ishape.z+half_sigma; + scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5; + scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5; + scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5; + gpu_transpose_times_diag3(a,scorrect,aTs); + + // energy + + numtyp gamma[9], s[3]; + gpu_times3(aTs,a,gamma); + gpu_mldivide3(gamma,rhat,s,err_flag); + + numtyp sigma12 = ucl_rsqrt((numtyp)0.5*gpu_dot3(s,rhat)); + numtyp temp[9], w[3]; + gpu_times3(aTe,a,temp); + temp[0] += (numtyp)1.0; + temp[4] += (numtyp)1.0; + temp[8] += (numtyp)1.0; + gpu_mldivide3(temp,rhat,w,err_flag); + + numtyp h12 = ucl_recip(rnorm)-sigma12; + numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w); + numtyp sigh = sigma/h12; + numtyp tprod = chi*sigh; + + numtyp Ua, Ur; + numtyp h12p3 = h12*h12*h12; + numtyp sigmap3 = sigma*sigma*sigma; + numtyp stemp = h12*(numtyp)0.5; + Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; + Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; + Ua = epsilon*Ua*sigmap3*solv_f_a; + + stemp = h12/cr60; + Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ + (numtyp)60.0; + Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur; + numtyp sigh6=sigh*sigh*sigh; + sigh6*=sigh6; + Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r; + + energy+=Ua+Ur; + + // force + + numtyp fourw[3], spr[3]; + numtyp sec = sigma*chi; + numtyp sigma12p3 = sigma12*sigma12*sigma12; + fourw[0] = (numtyp)4.0*w[0]; + fourw[1] = (numtyp)4.0*w[1]; + fourw[2] = (numtyp)4.0*w[2]; + spr[0] = (numtyp)0.5*sigma12p3*s[0]; + spr[1] = (numtyp)0.5*sigma12p3*s[1]; + spr[2] = (numtyp)0.5*sigma12p3*s[2]; + + stemp = ucl_recip(ishape.x*(numtyp)2.0+h12)+ + ucl_recip(ishape.y*(numtyp)2.0+h12)+ + ucl_recip(ishape.z*(numtyp)2.0+h12)+ + (numtyp)3.0/h12; + numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); + numtyp dspu = ucl_recip(h12)-hsec+stemp; + numtyp pbsu = (numtyp)3.0*sigma*hsec; + + stemp = ucl_recip(ishape.x*cr60+h12)+ + ucl_recip(ishape.y*cr60+h12)+ + ucl_recip(ishape.z*cr60+h12)+ + (numtyp)3.0/h12; + hsec = ucl_recip(h12+b_alpha*sec); + numtyp dspr = (numtyp)7.0/h12-hsec+stemp; + numtyp pbsr = b_alpha*sigma*hsec; + + #pragma unroll + for (int i=0; i<3; i++) { + numtyp u[3]; + u[0] = -rhat[i]*rhat[0]; + u[1] = -rhat[i]*rhat[1]; + u[2] = -rhat[i]*rhat[2]; + u[i] += (numtyp)1.0; + u[0] *= rnorm; + u[1] *= rnorm; + u[2] *= rnorm; + numtyp dchi = gpu_dot3(u,fourw); + numtyp dh12 = rhat[i]+gpu_dot3(u,spr); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + numtyp force=dUr*Ur+dUa*Ua; + if 
(i==0) { + f.x+=force; + if (vflag>0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; + } + } + + } + + // torque on i + numtyp fwae[3]; + gpu_row_times3(fourw,aTe,fwae); + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_0,rhat,p); + gpu_times_column3(lA_0,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_0,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.x -= (dUa*Ua+dUr*Ur); + } + + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_1,rhat,p); + gpu_times_column3(lA_1,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_1,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.y -= (dUa*Ua+dUr*Ur); + } + + { + numtyp tempv[3], p[3], lAtwo[9]; + gpu_times_column3(lA_2,rhat,p); + gpu_times_column3(lA_2,w,tempv); + numtyp dchi = -gpu_dot3(fwae,tempv); + gpu_times3(aTs,lA_2,lAtwo); + gpu_times_column3(lAtwo,spr,tempv); + numtyp dh12 = -gpu_dot3(s,tempv); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + tor.z -= (dUa*Ua+dUr*Ur); + } + + } // for nbor + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; + + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; + } + } + + // Store answers + if (offset==0) { + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1+=energy; + ap1+=astride; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1+=virial[i]; + ap1+=astride; + } + } + acctyp4 old=ans[ii]; + old.x+=f.x; + old.y+=f.y; + old.z+=f.z; + ans[ii]=old; + + old=ans[ii+astride]; + old.x+=tor.x; + old.y+=tor.y; + old.z+=tor.z; + ans[ii+astride]=old; + } + } // if ii +} + +__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, + __global numtyp4* shape,__global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global int *dev_nbor, + const int stride, __global acctyp4 *ans, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag,const int start, + const int inum, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + ii+=start; + + __local numtyp sp_lj[4]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; + sp_lj[3]=splj[3]; + + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; + b_alpha=(numtyp)45.0/(numtyp)56.0; + cr60=ucl_cbrt((numtyp)60.0); + solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + solv_f_r = 
(numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *nbor_end; + int j, numj, n_stride; + nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj, + n_stride,nbor_end,nbor); + + numtyp4 jx=x_[j]; + int jtype=jx.w; + + numtyp factor_lj; + for ( ; nbor<nbor_end; nbor+=n_stride) { + int i=*nbor; + factor_lj = sp_lj[sbmask(i)]; + i &= NEIGHMASK; + + numtyp4 ix=x_[i]; + int itype=ix.w; + + numtyp a[9]; // Rotation matrix (lab->body) + numtyp aTe[9]; // A'*E + numtyp4 ishape; + + ishape=shape[itype]; + gpu_quat_to_mat_trans(q,i,a); + gpu_transpose_times_diag3(a,well[itype],aTe); + + // Compute r12 + numtyp r[3], rhat[3]; + numtyp rnorm; + r[0] = ix.x-jx.x; + r[1] = ix.y-jx.y; + r[2] = ix.z-jx.z; + rnorm = gpu_dot3(r,r); + rnorm = ucl_rsqrt(rnorm); + rhat[0] = r[0]*rnorm; + rhat[1] = r[1]*rnorm; + rhat[2] = r[2]*rnorm; + + numtyp sigma, epsilon; + int mtype=fast_mul(ntypes,itype)+jtype; + sigma = sig_eps[mtype].x; + epsilon = sig_eps[mtype].y*factor_lj; + + numtyp aTs[9]; + numtyp4 scorrect; + numtyp half_sigma=sigma * (numtyp)0.5; + scorrect.x = ishape.x+half_sigma; + scorrect.y = ishape.y+half_sigma; + scorrect.z = ishape.z+half_sigma; + scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5; + scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5; + scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5; + gpu_transpose_times_diag3(a,scorrect,aTs); + + // energy + + numtyp gamma[9], s[3]; + gpu_times3(aTs,a,gamma); + gpu_mldivide3(gamma,rhat,s,err_flag); + + numtyp sigma12 = ucl_rsqrt((numtyp)0.5*gpu_dot3(s,rhat)); + numtyp temp[9], w[3]; + gpu_times3(aTe,a,temp); + temp[0] += (numtyp)1.0; + temp[4] += (numtyp)1.0; + temp[8] += (numtyp)1.0; + gpu_mldivide3(temp,rhat,w,err_flag); + + numtyp h12 = ucl_recip(rnorm)-sigma12; + numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w); + numtyp sigh = sigma/h12; + numtyp tprod = chi*sigh; + + numtyp Ua, Ur; + numtyp h12p3 = h12*h12*h12; + numtyp sigmap3 = sigma*sigma*sigma; + numtyp stemp = h12/(numtyp)2.0; + Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; + numtyp ilshape=ishape.x*ishape.y*ishape.z; + Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; + Ua = epsilon*Ua*sigmap3*solv_f_a; + + stemp = h12/cr60; + Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ + (numtyp)60.0; + Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur; + numtyp sigh6=sigh*sigh*sigh; + sigh6*=sigh6; + Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r; + + energy+=Ua+Ur; + + // force + + numtyp fourw[3], spr[3]; + numtyp sec = sigma*chi; + numtyp sigma12p3 = sigma12*sigma12*sigma12; + fourw[0] = (numtyp)4.0*w[0]; + fourw[1] = (numtyp)4.0*w[1]; + fourw[2] = (numtyp)4.0*w[2]; + spr[0] = (numtyp)0.5*sigma12p3*s[0]; + spr[1] = (numtyp)0.5*sigma12p3*s[1]; + spr[2] = (numtyp)0.5*sigma12p3*s[2]; + + stemp = ucl_recip(ishape.x*(numtyp)2.0+h12)+ + ucl_recip(ishape.y*(numtyp)2.0+h12)+ + ucl_recip(ishape.z*(numtyp)2.0+h12)+ + (numtyp)3.0/h12; + numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); + numtyp dspu = ucl_recip(h12)-hsec+stemp; + numtyp pbsu = (numtyp)3.0*sigma*hsec; + + stemp = ucl_recip(ishape.x*cr60+h12)+ + ucl_recip(ishape.y*cr60+h12)+ + ucl_recip(ishape.z*cr60+h12)+ + (numtyp)3.0/h12; + hsec = ucl_recip(h12+b_alpha*sec); + numtyp dspr = (numtyp)7.0/h12-hsec+stemp; + numtyp pbsr = b_alpha*sigma*hsec; + + #pragma unroll + for (int i=0; i<3; i++) { + numtyp u[3]; + u[0] = 
-rhat[i]*rhat[0]; + u[1] = -rhat[i]*rhat[1]; + u[2] = -rhat[i]*rhat[2]; + u[i] += (numtyp)1.0; + u[0] *= rnorm; + u[1] *= rnorm; + u[2] *= rnorm; + numtyp dchi = gpu_dot3(u,fourw); + numtyp dh12 = rhat[i]+gpu_dot3(u,spr); + numtyp dUa = pbsu*dchi-dh12*dspu; + numtyp dUr = pbsr*dchi-dh12*dspr; + numtyp force=dUr*Ur+dUa*Ua; + if (i==0) { + f.x+=force; + if (vflag>0) + virial[0]+=-r[0]*force; + } else if (i==1) { + f.y+=force; + if (vflag>0) { + virial[1]+=-r[1]*force; + virial[3]+=-r[0]*force; + } + } else { + f.z+=force; + if (vflag>0) { + virial[2]+=-r[2]*force; + virial[4]+=-r[0]*force; + virial[5]+=-r[1]*force; + } + } + } + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + ii+=start; + + __local numtyp sp_lj[4]; + sp_lj[0]=gum[0]; + sp_lj[1]=gum[1]; + sp_lj[2]=gum[2]; + sp_lj[3]=gum[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=x_[i]; + int itype=ix.w; + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + int ii=itype*lj_types+jtype; + if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y); + force*=factor_lj; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); + energy+=factor_lj*(e-lj3[ii].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + } // for nbor + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + ii+=start; + + __local numtyp sp_lj[4]; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + if (tid<4) + sp_lj[tid]=gum[tid]; + if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { + lj1[tid]=lj1_in[tid]; + if (eflag>0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + 
virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=x_[i]; + int iw=ix.w; + int itype=fast_mul((int)MAX_SHARED_TYPES,iw); + + numtyp factor_lj; + for ( ; nbor<list_end; nbor+=n_stride) { + + int j=*nbor; + factor_lj = sp_lj[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=x_[j]; + int mtype=itype+jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp r2inv = delx*delx+dely*dely+delz*delz; + + if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) { + r2inv=ucl_recip(r2inv); + numtyp r6inv = r2inv*r2inv*r2inv; + numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lj96_cut_gpu_kernel.ptx b/lib/gpu/lj96_cut_gpu_kernel.ptx new file mode 100644 index 000000000..5bf19f6f0 --- /dev/null +++ b/lib/gpu/lj96_cut_gpu_kernel.ptx @@ -0,0 +1,979 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bddd_00000000-9_lj96_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.4Q2aYE) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bddd_00000000-8_lj96_cut_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "lj96_cut_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 
"/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<72>; + .reg .u64 %rd<62>; + .reg .f32 %f<103>; + .reg .pred %p<19>; + .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16]; + .shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072]; + // __cuda_local_var_32504_10_non_const_f = 48 + // __cuda_local_var_32508_9_non_const_virial = 16 + .loc 16 88 0 +$LDWbegin_kernel_pair: + .loc 16 95 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 96 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 97 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 98 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; + .loc 16 107 0 + mov.f32 %f5, 0f00000000; // 0 + mov.f32 %f6, %f5; + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, %f7; + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_19202; + .loc 16 113 0 + ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd2, %r11; + mul.wide.s32 %rd3, %r11, 4; + cvt.s64.s32 %rd4, %r9; + mul.wide.s32 %rd5, %r9, 4; + ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd7, %rd5, %rd6; + add.u64 %rd8, %rd3, %rd7; + ld.global.s32 %r12, [%rd8+0]; + add.u64 %rd9, %rd3, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd6; + @%p2 bra $Lt_0_19714; + .loc 16 119 0 + cvt.s32.s64 %r13, %rd2; + mul.lo.s32 %r14, %r13, %r12; + cvt.s64.s32 %rd11, %r14; + mul.wide.s32 %rd12, %r14, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 120 0 + mul.lo.s32 %r15, %r6, %r13; + cvt.s64.s32 %rd14, %r15; + mul.wide.s32 %rd15, %r15, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 121 0 + mul.lo.s32 %r16, %r13, %r1; + bra.uni $Lt_0_19458; +$Lt_0_19714: + .loc 16 123 0 + ld.global.s32 %r17, [%rd9+0]; + cvt.s64.s32 %rd17, %r17; + mul.wide.s32 %rd18, %r17, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 124 0 + cvt.s64.s32 %rd20, %r12; + mul.wide.s32 %rd21, %r12, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 125 0 + mov.s32 %r16, %r1; + .loc 16 126 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + 
add.u64 %rd16, %rd19, %rd23; +$Lt_0_19458: + .loc 16 129 0 + ld.global.s32 %r18, [%rd7+0]; + mov.u32 %r19, %r18; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f21, %f17; + mov.f32 %f22, %f18; + mov.f32 %f23, %f19; + mov.f32 %f24, %f20; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_28162; + cvt.rzi.ftz.s32.f32 %r26, %f24; + cvt.s64.s32 %rd24, %r16; + ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r28, %r27, %r26; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1]; + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92; +$Lt_0_20482: + //<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown + .loc 16 135 0 + ld.global.s32 %r29, [%rd16+0]; + .loc 16 136 0 + shr.s32 %r30, %r29, 30; + and.b32 %r31, %r30, 3; + cvt.s64.s32 %rd27, %r31; + mul.wide.s32 %rd28, %r31, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f29, [%rd29+0]; + .loc 16 139 0 + and.b32 %r32, %r29, 1073741823; + mov.u32 %r33, %r32; + mov.s32 %r34, 0; + mov.u32 %r35, %r34; + mov.s32 %r36, 0; + mov.u32 %r37, %r36; + mov.s32 %r38, 0; + mov.u32 %r39, %r38; + tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}]; + mov.f32 %f34, %f30; + mov.f32 %f35, %f31; + mov.f32 %f36, %f32; + mov.f32 %f37, %f33; + cvt.rzi.ftz.s32.f32 %r40, %f37; + sub.ftz.f32 %f38, %f22, %f35; + sub.ftz.f32 %f39, %f21, %f34; + sub.ftz.f32 %f40, %f23, %f36; + mul.ftz.f32 %f41, %f38, %f38; + fma.rn.ftz.f32 %f42, %f39, %f39, %f41; + fma.rn.ftz.f32 %f43, %f40, %f40, %f42; + add.s32 %r41, %r40, %r28; + cvt.s64.s32 %rd30, %r41; + mul.wide.s32 %rd31, %r41, 16; + add.u64 %rd32, %rd31, %rd25; + ld.global.f32 %f44, [%rd32+8]; + setp.gt.ftz.f32 %p4, %f44, %f43; + @!%p4 bra $Lt_0_21762; + .loc 16 154 0 + rcp.approx.ftz.f32 %f45, %f43; + mul.ftz.f32 %f46, %f45, %f45; + mul.ftz.f32 %f47, %f45, %f46; + sqrt.approx.ftz.f32 %f48, %f47; + mul.ftz.f32 %f49, %f45, %f47; + ld.global.v2.f32 {%f50,%f51}, [%rd32+0]; + mul.ftz.f32 %f52, %f50, %f48; + sub.ftz.f32 %f53, %f52, %f51; + mul.ftz.f32 %f54, %f49, %f53; + mul.ftz.f32 %f55, %f29, %f54; + .loc 16 156 0 + fma.rn.ftz.f32 %f27, %f39, %f55, %f27; + .loc 16 157 0 + fma.rn.ftz.f32 %f26, %f38, %f55, %f26; + .loc 16 158 0 + fma.rn.ftz.f32 %f25, %f40, %f55, %f25; + ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r43, 0; + setp.le.s32 %p5, %r42, %r43; + @%p5 bra $Lt_0_21250; + .loc 16 162 0 + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd34, %rd33, %rd31; + ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd34+0]; + mul.ftz.f32 %f59, %f56, %f48; + sub.ftz.f32 %f60, %f59, %f57; + mul.ftz.f32 %f61, %f47, %f60; + sub.ftz.f32 %f62, %f61, %f58; + fma.rn.ftz.f32 %f28, %f29, %f62, %f28; +$Lt_0_21250: + ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r45, 0; + setp.le.s32 %p6, %r44, %r45; + @%p6 bra $Lt_0_21762; + .loc 16 165 0 + mov.f32 %f63, %f6; + mul.ftz.f32 %f64, %f39, %f39; + fma.rn.ftz.f32 %f65, %f55, %f64, %f63; + mov.f32 %f6, %f65; + .loc 16 166 0 + mov.f32 %f66, %f8; + fma.rn.ftz.f32 %f67, %f55, %f41, %f66; + mov.f32 %f8, %f67; + .loc 16 167 0 + mov.f32 %f68, %f10; + mul.ftz.f32 %f69, %f40, %f40; + fma.rn.ftz.f32 %f70, %f55, %f69, %f68; + mov.f32 %f10, %f70; + .loc 16 168 0 + mov.f32 %f71, %f12; + mul.ftz.f32 %f72, %f38, %f39; + 
fma.rn.ftz.f32 %f73, %f55, %f72, %f71; + mov.f32 %f12, %f73; + .loc 16 169 0 + mov.f32 %f74, %f14; + mul.ftz.f32 %f75, %f39, %f40; + fma.rn.ftz.f32 %f76, %f55, %f75, %f74; + mov.f32 %f14, %f76; + .loc 16 170 0 + mul.ftz.f32 %f77, %f38, %f40; + fma.rn.ftz.f32 %f15, %f55, %f77, %f15; + mov.f32 %f16, %f15; +$Lt_0_21762: +$Lt_0_20738: + .loc 16 133 0 + mul.lo.u64 %rd35, %rd24, 4; + add.u64 %rd16, %rd16, %rd35; + setp.lt.u64 %p7, %rd16, %rd13; + @%p7 bra $Lt_0_20482; + bra.uni $Lt_0_18946; +$Lt_0_28162: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + bra.uni $Lt_0_18946; +$Lt_0_19202: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 +$Lt_0_18946: + mov.u32 %r46, 1; + setp.le.s32 %p8, %r1, %r46; + @%p8 bra $Lt_0_24578; + .loc 16 181 0 + mov.u64 %rd36, __cuda___cuda_local_var_32582_35_non_const_red_acc108; + cvt.s64.s32 %rd37, %r2; + mul.wide.s32 %rd38, %r2, 4; + add.u64 %rd39, %rd36, %rd38; + mov.f32 %f78, %f27; + st.shared.f32 [%rd39+0], %f78; + .loc 16 182 0 + mov.f32 %f79, %f26; + st.shared.f32 [%rd39+512], %f79; + .loc 16 183 0 + mov.f32 %f80, %f25; + st.shared.f32 [%rd39+1024], %f80; + .loc 16 184 0 + mov.f32 %f81, %f28; + st.shared.f32 [%rd39+1536], %f81; + .loc 16 186 0 + shr.s32 %r47, %r1, 31; + mov.s32 %r48, 1; + and.b32 %r49, %r47, %r48; + add.s32 %r50, %r49, %r1; + shr.s32 %r51, %r50, 1; + mov.s32 %r52, %r51; + mov.u32 %r53, 0; + setp.ne.u32 %p9, %r51, %r53; + @!%p9 bra $Lt_0_23042; +$Lt_0_23554: + setp.ge.u32 %p10, %r6, %r52; + @%p10 bra $Lt_0_23810; + .loc 16 189 0 + add.u32 %r54, %r2, %r52; + cvt.u64.u32 %rd40, %r54; + mul.wide.u32 %rd41, %r54, 4; + add.u64 %rd42, %rd36, %rd41; + ld.shared.f32 %f82, [%rd42+0]; + add.ftz.f32 %f78, %f82, %f78; + st.shared.f32 [%rd39+0], %f78; + ld.shared.f32 %f83, [%rd42+512]; + add.ftz.f32 %f79, %f83, %f79; + st.shared.f32 [%rd39+512], %f79; + ld.shared.f32 %f84, [%rd42+1024]; + add.ftz.f32 %f80, %f84, %f80; + st.shared.f32 [%rd39+1024], %f80; + ld.shared.f32 %f85, [%rd42+1536]; + add.ftz.f32 %f81, %f85, %f81; + st.shared.f32 [%rd39+1536], %f81; +$Lt_0_23810: + .loc 16 186 0 + shr.u32 %r52, %r52, 1; + mov.u32 %r55, 0; + setp.ne.u32 %p11, %r52, %r55; + @%p11 bra $Lt_0_23554; +$Lt_0_23042: + .loc 16 193 0 + mov.f32 %f27, %f78; + .loc 16 194 0 + mov.f32 %f26, %f79; + .loc 16 195 0 + mov.f32 %f25, %f80; + .loc 16 196 0 + mov.f32 %f28, %f81; + ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r57, 0; + setp.le.s32 %p12, %r56, %r57; + @%p12 bra $Lt_0_24578; + .loc 16 200 0 + mov.f32 %f78, %f6; + st.shared.f32 [%rd39+0], %f78; + mov.f32 %f79, %f8; + st.shared.f32 [%rd39+512], %f79; + mov.f32 %f80, %f10; + st.shared.f32 [%rd39+1024], %f80; + mov.f32 %f81, %f12; + st.shared.f32 [%rd39+1536], %f81; + mov.f32 %f86, %f14; + st.shared.f32 [%rd39+2048], %f86; + mov.f32 %f87, %f16; + st.shared.f32 [%rd39+2560], %f87; + .loc 16 202 0 + mov.s32 %r58, %r51; + @!%p9 bra $Lt_0_25090; +$Lt_0_25602: + setp.ge.u32 %p13, %r6, %r58; + @%p13 bra $Lt_0_25858; + .loc 16 205 0 + add.u32 %r59, %r2, %r58; + cvt.u64.u32 %rd43, %r59; + mul.wide.u32 %rd44, %r59, 4; + add.u64 %rd45, %rd36, %rd44; + ld.shared.f32 %f88, [%rd45+0]; + add.ftz.f32 %f78, %f88, %f78; + st.shared.f32 [%rd39+0], %f78; + ld.shared.f32 %f89, [%rd45+512]; + add.ftz.f32 %f79, %f89, %f79; + st.shared.f32 [%rd39+512], %f79; + ld.shared.f32 %f90, [%rd45+1024]; + add.ftz.f32 %f80, %f90, %f80; + st.shared.f32 [%rd39+1024], %f80; + 
ld.shared.f32 %f91, [%rd45+1536]; + add.ftz.f32 %f81, %f91, %f81; + st.shared.f32 [%rd39+1536], %f81; + ld.shared.f32 %f92, [%rd45+2048]; + add.ftz.f32 %f86, %f92, %f86; + st.shared.f32 [%rd39+2048], %f86; + ld.shared.f32 %f93, [%rd45+2560]; + add.ftz.f32 %f87, %f93, %f87; + st.shared.f32 [%rd39+2560], %f87; +$Lt_0_25858: + .loc 16 202 0 + shr.u32 %r58, %r58, 1; + mov.u32 %r60, 0; + setp.ne.u32 %p14, %r58, %r60; + @%p14 bra $Lt_0_25602; +$Lt_0_25090: + .loc 16 210 0 + mov.f32 %f6, %f78; + mov.f32 %f8, %f79; + mov.f32 %f10, %f80; + mov.f32 %f12, %f81; + mov.f32 %f14, %f86; + mov.f32 %f16, %f87; +$Lt_0_24578: +$Lt_0_22530: + selp.s32 %r61, 1, 0, %p1; + mov.s32 %r62, 0; + set.eq.u32.s32 %r63, %r6, %r62; + neg.s32 %r64, %r63; + and.b32 %r65, %r61, %r64; + mov.u32 %r66, 0; + setp.eq.s32 %p15, %r65, %r66; + @%p15 bra $Lt_0_26626; + .loc 16 216 0 + cvt.s64.s32 %rd46, %r9; + ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd48, %r9, 4; + add.u64 %rd49, %rd47, %rd48; + ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r68, 0; + setp.le.s32 %p16, %r67, %r68; + @%p16 bra $Lt_0_27138; + .loc 16 218 0 + st.global.f32 [%rd49+0], %f28; + .loc 16 219 0 + cvt.s64.s32 %rd50, %r10; + mul.wide.s32 %rd51, %r10, 4; + add.u64 %rd49, %rd49, %rd51; +$Lt_0_27138: + ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r70, 0; + setp.le.s32 %p17, %r69, %r70; + @%p17 bra $Lt_0_27650; + .loc 16 223 0 + mov.f32 %f94, %f6; + st.global.f32 [%rd49+0], %f94; + .loc 16 224 0 + cvt.s64.s32 %rd52, %r10; + mul.wide.s32 %rd53, %r10, 4; + add.u64 %rd54, %rd53, %rd49; + .loc 16 223 0 + mov.f32 %f95, %f8; + st.global.f32 [%rd54+0], %f95; + .loc 16 224 0 + add.u64 %rd55, %rd53, %rd54; + .loc 16 223 0 + mov.f32 %f96, %f10; + st.global.f32 [%rd55+0], %f96; + .loc 16 224 0 + add.u64 %rd56, %rd53, %rd55; + .loc 16 223 0 + mov.f32 %f97, %f12; + st.global.f32 [%rd56+0], %f97; + .loc 16 224 0 + add.u64 %rd49, %rd53, %rd56; + .loc 16 223 0 + mov.f32 %f98, %f14; + st.global.f32 [%rd49+0], %f98; + mov.f32 %f99, %f16; + add.u64 %rd57, %rd53, %rd49; + st.global.f32 [%rd57+0], %f99; +$Lt_0_27650: + .loc 16 227 0 + ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd59, %rd46, 16; + add.u64 %rd60, %rd58, %rd59; + mov.f32 %f100, %f101; + st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f100}; +$Lt_0_26626: + .loc 16 229 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<74>; + .reg .u64 %rd<74>; + .reg .f32 %f<109>; + .reg .pred %p<22>; + .shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16]; + .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj13296[1936]; + .shared .align 16 .b8 __cuda___cuda_local_var_32647_34_non_const_lj35232[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32737_35_non_const_red_acc7168[3072]; + // __cuda_local_var_32658_10_non_const_f = 48 + 
// __cuda_local_var_32662_9_non_const_virial = 16 + .loc 16 237 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 3; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_21250; + .loc 16 247 0 + mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_21250: + mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_21762; + .loc 16 249 0 + mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296; + cvt.s64.s32 %rd8, %r1; + mul.wide.s32 %rd9, %r1, 16; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd11, %rd10, %rd9; + add.u64 %rd12, %rd9, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; + st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r5, 0; + setp.le.s32 %p3, %r4, %r5; + @%p3 bra $Lt_1_22274; + .loc 16 251 0 + mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232; + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd15, %rd14, %rd9; + add.u64 %rd16, %rd9, %rd13; + ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; + st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; +$Lt_1_22274: + mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232; +$Lt_1_21762: + mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296; + mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232; + .loc 16 261 0 + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + mov.f32 %f20, 0f00000000; // 0 + mov.f32 %f21, %f20; + .loc 16 263 0 + bar.sync 0; + ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r7, %r1, %r6; + cvt.s32.u32 %r8, %ntid.x; + div.s32 %r9, %r8, %r6; + rem.s32 %r10, %r1, %r6; + cvt.s32.u32 %r11, %ctaid.x; + mul.lo.s32 %r12, %r11, %r9; + add.s32 %r13, %r7, %r12; + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p4, %r13, %r14; + @!%p4 bra $Lt_1_23042; + .loc 16 269 0 + ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd17, %r15; + mul.wide.s32 %rd18, %r15, 4; + cvt.s64.s32 %rd19, %r13; + mul.wide.s32 %rd20, %r13, 4; + ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd22, %rd20, %rd21; + add.u64 %rd23, %rd18, %rd22; + ld.global.s32 %r16, [%rd23+0]; + add.u64 %rd24, %rd18, %rd23; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p5, %rd25, %rd21; + @%p5 bra $Lt_1_23554; + .loc 16 275 0 + cvt.s32.s64 %r17, %rd17; + mul.lo.s32 %r18, %r17, %r16; + cvt.s64.s32 %rd26, %r18; + mul.wide.s32 %rd27, %r18, 4; + add.u64 %rd28, %rd24, %rd27; + .loc 16 276 0 + mul.lo.s32 %r19, %r10, %r17; + cvt.s64.s32 %rd29, %r19; + mul.wide.s32 %rd30, %r19, 4; + add.u64 %rd31, %rd24, %rd30; + .loc 16 277 0 + mul.lo.s32 %r20, %r17, %r6; + bra.uni $Lt_1_23298; +$Lt_1_23554: + .loc 16 279 0 + ld.global.s32 %r21, [%rd24+0]; + cvt.s64.s32 %rd32, %r21; + mul.wide.s32 %rd33, %r21, 4; + add.u64 %rd34, %rd25, %rd33; + .loc 16 280 0 + cvt.s64.s32 %rd35, %r16; + mul.wide.s32 %rd36, %r16, 4; + add.u64 %rd28, %rd34, 
%rd36; + .loc 16 281 0 + mov.s32 %r20, %r6; + .loc 16 282 0 + cvt.s64.s32 %rd37, %r10; + mul.wide.s32 %rd38, %r10, 4; + add.u64 %rd31, %rd34, %rd38; +$Lt_1_23298: + .loc 16 285 0 + ld.global.s32 %r22, [%rd22+0]; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + mov.s32 %r28, 0; + mov.u32 %r29, %r28; + tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}]; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + mov.f32 %f29, %f25; + setp.ge.u64 %p6, %rd31, %rd28; + @%p6 bra $Lt_1_32002; + cvt.rzi.ftz.s32.f32 %r30, %f29; + cvt.s64.s32 %rd39, %r20; + mul.lo.s32 %r31, %r30, 11; + cvt.rn.f32.s32 %f30, %r31; + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 +$Lt_1_24322: + //<loop> Loop body line 285, nesting depth: 1, estimated iterations: unknown + .loc 16 292 0 + ld.global.s32 %r32, [%rd31+0]; + .loc 16 296 0 + and.b32 %r33, %r32, 1073741823; + mov.u32 %r34, %r33; + mov.s32 %r35, 0; + mov.u32 %r36, %r35; + mov.s32 %r37, 0; + mov.u32 %r38, %r37; + mov.s32 %r39, 0; + mov.u32 %r40, %r39; + tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r34,%r36,%r38,%r40}]; + mov.f32 %f39, %f35; + mov.f32 %f40, %f36; + mov.f32 %f41, %f37; + mov.f32 %f42, %f38; + sub.ftz.f32 %f43, %f27, %f40; + sub.ftz.f32 %f44, %f26, %f39; + sub.ftz.f32 %f45, %f28, %f41; + mul.ftz.f32 %f46, %f43, %f43; + fma.rn.ftz.f32 %f47, %f44, %f44, %f46; + fma.rn.ftz.f32 %f48, %f45, %f45, %f47; + add.ftz.f32 %f49, %f30, %f42; + cvt.rzi.ftz.s32.f32 %r41, %f49; + cvt.s64.s32 %rd40, %r41; + mul.wide.s32 %rd41, %r41, 16; + add.u64 %rd42, %rd41, %rd7; + ld.shared.f32 %f50, [%rd42+8]; + setp.gt.ftz.f32 %p7, %f50, %f48; + @!%p7 bra $Lt_1_25602; + .loc 16 309 0 + rcp.approx.ftz.f32 %f51, %f48; + mul.ftz.f32 %f52, %f51, %f51; + mul.ftz.f32 %f53, %f51, %f52; + sqrt.approx.ftz.f32 %f54, %f53; + mul.ftz.f32 %f55, %f51, %f53; + ld.shared.v2.f32 {%f56,%f57}, [%rd42+0]; + mul.ftz.f32 %f58, %f56, %f54; + sub.ftz.f32 %f59, %f58, %f57; + mul.ftz.f32 %f60, %f55, %f59; + .loc 16 311 0 + fma.rn.ftz.f32 %f33, %f44, %f60, %f33; + .loc 16 312 0 + fma.rn.ftz.f32 %f32, %f43, %f60, %f32; + .loc 16 313 0 + fma.rn.ftz.f32 %f31, %f45, %f60, %f31; + ld.param.s32 %r42, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r43, 0; + setp.le.s32 %p8, %r42, %r43; + @%p8 bra $Lt_1_25090; + .loc 16 316 0 + add.u64 %rd43, %rd41, %rd13; + ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd43+0]; + mul.ftz.f32 %f64, %f61, %f54; + sub.ftz.f32 %f65, %f64, %f62; + mul.ftz.f32 %f66, %f53, %f65; + .loc 16 317 0 + shr.s32 %r44, %r32, 30; + and.b32 %r45, %r44, 3; + cvt.s64.s32 %rd44, %r45; + mul.wide.s32 %rd45, %r45, 4; + add.u64 %rd46, %rd1, %rd45; + ld.shared.f32 %f67, [%rd46+0]; + sub.ftz.f32 %f68, %f66, %f63; + fma.rn.ftz.f32 %f34, %f67, %f68, %f34; +$Lt_1_25090: + ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r47, 0; + setp.le.s32 %p9, %r46, %r47; + @%p9 bra $Lt_1_25602; + .loc 16 320 0 + mov.f32 %f69, %f11; + mul.ftz.f32 %f70, %f44, %f44; + fma.rn.ftz.f32 %f71, %f60, %f70, %f69; + mov.f32 %f11, %f71; + .loc 16 321 0 + mov.f32 %f72, %f13; + fma.rn.ftz.f32 %f73, %f60, %f46, %f72; + mov.f32 %f13, %f73; + .loc 16 322 0 + mov.f32 %f74, %f15; + mul.ftz.f32 %f75, %f45, %f45; + fma.rn.ftz.f32 %f76, %f60, %f75, %f74; + mov.f32 %f15, %f76; + .loc 16 323 0 + mov.f32 %f77, %f17; + mul.ftz.f32 %f78, %f43, %f44; + fma.rn.ftz.f32 %f79, %f60, %f78, %f77; + mov.f32 %f17, %f79; + .loc 16 324 0 + mov.f32 %f80, %f19; + 
mul.ftz.f32 %f81, %f44, %f45; + fma.rn.ftz.f32 %f82, %f60, %f81, %f80; + mov.f32 %f19, %f82; + .loc 16 325 0 + mul.ftz.f32 %f83, %f43, %f45; + fma.rn.ftz.f32 %f20, %f60, %f83, %f20; + mov.f32 %f21, %f20; +$Lt_1_25602: +$Lt_1_24578: + .loc 16 290 0 + mul.lo.u64 %rd47, %rd39, 4; + add.u64 %rd31, %rd31, %rd47; + setp.lt.u64 %p10, %rd31, %rd28; + @%p10 bra $Lt_1_24322; + bra.uni $Lt_1_22786; +$Lt_1_32002: + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 + bra.uni $Lt_1_22786; +$Lt_1_23042: + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 +$Lt_1_22786: + mov.u32 %r48, 1; + setp.le.s32 %p11, %r6, %r48; + @%p11 bra $Lt_1_28418; + .loc 16 336 0 + mov.u64 %rd48, __cuda___cuda_local_var_32737_35_non_const_red_acc7168; + cvt.s64.s32 %rd49, %r1; + mul.wide.s32 %rd50, %r1, 4; + add.u64 %rd51, %rd48, %rd50; + mov.f32 %f84, %f33; + st.shared.f32 [%rd51+0], %f84; + .loc 16 337 0 + mov.f32 %f85, %f32; + st.shared.f32 [%rd51+512], %f85; + .loc 16 338 0 + mov.f32 %f86, %f31; + st.shared.f32 [%rd51+1024], %f86; + .loc 16 339 0 + mov.f32 %f87, %f34; + st.shared.f32 [%rd51+1536], %f87; + .loc 16 341 0 + shr.s32 %r49, %r6, 31; + mov.s32 %r50, 1; + and.b32 %r51, %r49, %r50; + add.s32 %r52, %r51, %r6; + shr.s32 %r53, %r52, 1; + mov.s32 %r54, %r53; + mov.u32 %r55, 0; + setp.ne.u32 %p12, %r53, %r55; + @!%p12 bra $Lt_1_26882; +$Lt_1_27394: + setp.ge.u32 %p13, %r10, %r54; + @%p13 bra $Lt_1_27650; + .loc 16 344 0 + add.u32 %r56, %r1, %r54; + cvt.u64.u32 %rd52, %r56; + mul.wide.u32 %rd53, %r56, 4; + add.u64 %rd54, %rd48, %rd53; + ld.shared.f32 %f88, [%rd54+0]; + add.ftz.f32 %f84, %f88, %f84; + st.shared.f32 [%rd51+0], %f84; + ld.shared.f32 %f89, [%rd54+512]; + add.ftz.f32 %f85, %f89, %f85; + st.shared.f32 [%rd51+512], %f85; + ld.shared.f32 %f90, [%rd54+1024]; + add.ftz.f32 %f86, %f90, %f86; + st.shared.f32 [%rd51+1024], %f86; + ld.shared.f32 %f91, [%rd54+1536]; + add.ftz.f32 %f87, %f91, %f87; + st.shared.f32 [%rd51+1536], %f87; +$Lt_1_27650: + .loc 16 341 0 + shr.u32 %r54, %r54, 1; + mov.u32 %r57, 0; + setp.ne.u32 %p14, %r54, %r57; + @%p14 bra $Lt_1_27394; +$Lt_1_26882: + .loc 16 348 0 + mov.f32 %f33, %f84; + .loc 16 349 0 + mov.f32 %f32, %f85; + .loc 16 350 0 + mov.f32 %f31, %f86; + .loc 16 351 0 + mov.f32 %f34, %f87; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p15, %r58, %r59; + @%p15 bra $Lt_1_28418; + .loc 16 355 0 + mov.f32 %f84, %f11; + st.shared.f32 [%rd51+0], %f84; + mov.f32 %f85, %f13; + st.shared.f32 [%rd51+512], %f85; + mov.f32 %f86, %f15; + st.shared.f32 [%rd51+1024], %f86; + mov.f32 %f87, %f17; + st.shared.f32 [%rd51+1536], %f87; + mov.f32 %f92, %f19; + st.shared.f32 [%rd51+2048], %f92; + mov.f32 %f93, %f21; + st.shared.f32 [%rd51+2560], %f93; + .loc 16 357 0 + mov.s32 %r60, %r53; + @!%p12 bra $Lt_1_28930; +$Lt_1_29442: + setp.ge.u32 %p16, %r10, %r60; + @%p16 bra $Lt_1_29698; + .loc 16 360 0 + add.u32 %r61, %r1, %r60; + cvt.u64.u32 %rd55, %r61; + mul.wide.u32 %rd56, %r61, 4; + add.u64 %rd57, %rd48, %rd56; + ld.shared.f32 %f94, [%rd57+0]; + add.ftz.f32 %f84, %f94, %f84; + st.shared.f32 [%rd51+0], %f84; + ld.shared.f32 %f95, [%rd57+512]; + add.ftz.f32 %f85, %f95, %f85; + st.shared.f32 [%rd51+512], %f85; + ld.shared.f32 %f96, [%rd57+1024]; + add.ftz.f32 %f86, %f96, %f86; + st.shared.f32 [%rd51+1024], %f86; + ld.shared.f32 %f97, [%rd57+1536]; + add.ftz.f32 %f87, %f97, %f87; + st.shared.f32 
[%rd51+1536], %f87; + ld.shared.f32 %f98, [%rd57+2048]; + add.ftz.f32 %f92, %f98, %f92; + st.shared.f32 [%rd51+2048], %f92; + ld.shared.f32 %f99, [%rd57+2560]; + add.ftz.f32 %f93, %f99, %f93; + st.shared.f32 [%rd51+2560], %f93; +$Lt_1_29698: + .loc 16 357 0 + shr.u32 %r60, %r60, 1; + mov.u32 %r62, 0; + setp.ne.u32 %p17, %r60, %r62; + @%p17 bra $Lt_1_29442; +$Lt_1_28930: + .loc 16 365 0 + mov.f32 %f11, %f84; + mov.f32 %f13, %f85; + mov.f32 %f15, %f86; + mov.f32 %f17, %f87; + mov.f32 %f19, %f92; + mov.f32 %f21, %f93; +$Lt_1_28418: +$Lt_1_26370: + selp.s32 %r63, 1, 0, %p4; + mov.s32 %r64, 0; + set.eq.u32.s32 %r65, %r10, %r64; + neg.s32 %r66, %r65; + and.b32 %r67, %r63, %r66; + mov.u32 %r68, 0; + setp.eq.s32 %p18, %r67, %r68; + @%p18 bra $Lt_1_30466; + .loc 16 371 0 + cvt.s64.s32 %rd58, %r13; + ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd60, %r13, 4; + add.u64 %rd61, %rd59, %rd60; + ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r70, 0; + setp.le.s32 %p19, %r69, %r70; + @%p19 bra $Lt_1_30978; + .loc 16 373 0 + st.global.f32 [%rd61+0], %f34; + .loc 16 374 0 + cvt.s64.s32 %rd62, %r14; + mul.wide.s32 %rd63, %r14, 4; + add.u64 %rd61, %rd61, %rd63; +$Lt_1_30978: + ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r72, 0; + setp.le.s32 %p20, %r71, %r72; + @%p20 bra $Lt_1_31490; + .loc 16 378 0 + mov.f32 %f100, %f11; + st.global.f32 [%rd61+0], %f100; + .loc 16 379 0 + cvt.s64.s32 %rd64, %r14; + mul.wide.s32 %rd65, %r14, 4; + add.u64 %rd66, %rd65, %rd61; + .loc 16 378 0 + mov.f32 %f101, %f13; + st.global.f32 [%rd66+0], %f101; + .loc 16 379 0 + add.u64 %rd67, %rd65, %rd66; + .loc 16 378 0 + mov.f32 %f102, %f15; + st.global.f32 [%rd67+0], %f102; + .loc 16 379 0 + add.u64 %rd68, %rd65, %rd67; + .loc 16 378 0 + mov.f32 %f103, %f17; + st.global.f32 [%rd68+0], %f103; + .loc 16 379 0 + add.u64 %rd61, %rd65, %rd68; + .loc 16 378 0 + mov.f32 %f104, %f19; + st.global.f32 [%rd61+0], %f104; + mov.f32 %f105, %f21; + add.u64 %rd69, %rd65, %rd61; + st.global.f32 [%rd69+0], %f105; +$Lt_1_31490: + .loc 16 382 0 + ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd71, %rd58, 16; + add.u64 %rd72, %rd70, %rd71; + mov.f32 %f106, %f107; + st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106}; +$Lt_1_30466: + .loc 16 384 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/lj96_cut_gpu_ptx.h b/lib/gpu/lj96_cut_gpu_ptx.h new file mode 100644 index 000000000..b7b6b717b --- /dev/null +++ b/lib/gpu/lj96_cut_gpu_ptx.h @@ -0,0 +1,927 @@ +const char * lj96_cut_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<72>;\n" +" .reg .u64 %rd<62>;\n" +" .reg .f32 %f<103>;\n" +" .reg .pred %p<19>;\n" +" .shared .align 16 .b8 
__cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];\n" +" .loc 16 88 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 95 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 96 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 97 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 98 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 107 0\n" +" mov.f32 %f5, 0f00000000; \n" +" mov.f32 %f6, %f5;\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, %f7;\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_19202;\n" +" .loc 16 113 0\n" +" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd2, %r11;\n" +" mul.wide.s32 %rd3, %r11, 4;\n" +" cvt.s64.s32 %rd4, %r9;\n" +" mul.wide.s32 %rd5, %r9, 4;\n" +" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd7, %rd5, %rd6;\n" +" add.u64 %rd8, %rd3, %rd7;\n" +" ld.global.s32 %r12, [%rd8+0];\n" +" add.u64 %rd9, %rd3, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd6;\n" +" @%p2 bra $Lt_0_19714;\n" +" .loc 16 119 0\n" +" cvt.s32.s64 %r13, %rd2;\n" +" mul.lo.s32 %r14, %r13, %r12;\n" +" cvt.s64.s32 %rd11, %r14;\n" +" mul.wide.s32 %rd12, %r14, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 120 0\n" +" mul.lo.s32 %r15, %r6, %r13;\n" +" cvt.s64.s32 %rd14, %r15;\n" +" mul.wide.s32 %rd15, %r15, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 121 0\n" +" mul.lo.s32 %r16, %r13, %r1;\n" +" bra.uni $Lt_0_19458;\n" +"$Lt_0_19714:\n" +" .loc 16 123 0\n" +" ld.global.s32 %r17, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r17;\n" +" mul.wide.s32 %rd18, %r17, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 124 0\n" +" cvt.s64.s32 %rd20, %r12;\n" +" mul.wide.s32 %rd21, %r12, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 125 0\n" +" mov.s32 %r16, %r1;\n" +" .loc 16 126 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_19458:\n" +" .loc 16 129 0\n" +" ld.global.s32 %r18, [%rd7+0];\n" +" mov.u32 %r19, %r18;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f21, %f17;\n" +" mov.f32 %f22, %f18;\n" +" mov.f32 %f23, %f19;\n" +" mov.f32 %f24, %f20;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_28162;\n" +" cvt.rzi.ftz.s32.f32 %r26, %f24;\n" +" cvt.s64.s32 %rd24, %r16;\n" +" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r28, %r27, %r26;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" 
mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n" +"$Lt_0_20482:\n" +" .loc 16 135 0\n" +" ld.global.s32 %r29, [%rd16+0];\n" +" .loc 16 136 0\n" +" shr.s32 %r30, %r29, 30;\n" +" and.b32 %r31, %r30, 3;\n" +" cvt.s64.s32 %rd27, %r31;\n" +" mul.wide.s32 %rd28, %r31, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f29, [%rd29+0];\n" +" .loc 16 139 0\n" +" and.b32 %r32, %r29, 1073741823;\n" +" mov.u32 %r33, %r32;\n" +" mov.s32 %r34, 0;\n" +" mov.u32 %r35, %r34;\n" +" mov.s32 %r36, 0;\n" +" mov.u32 %r37, %r36;\n" +" mov.s32 %r38, 0;\n" +" mov.u32 %r39, %r38;\n" +" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n" +" mov.f32 %f34, %f30;\n" +" mov.f32 %f35, %f31;\n" +" mov.f32 %f36, %f32;\n" +" mov.f32 %f37, %f33;\n" +" cvt.rzi.ftz.s32.f32 %r40, %f37;\n" +" sub.ftz.f32 %f38, %f22, %f35;\n" +" sub.ftz.f32 %f39, %f21, %f34;\n" +" sub.ftz.f32 %f40, %f23, %f36;\n" +" mul.ftz.f32 %f41, %f38, %f38;\n" +" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" +" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" +" add.s32 %r41, %r40, %r28;\n" +" cvt.s64.s32 %rd30, %r41;\n" +" mul.wide.s32 %rd31, %r41, 16;\n" +" add.u64 %rd32, %rd31, %rd25;\n" +" ld.global.f32 %f44, [%rd32+8];\n" +" setp.gt.ftz.f32 %p4, %f44, %f43;\n" +" @!%p4 bra $Lt_0_21762;\n" +" .loc 16 154 0\n" +" rcp.approx.ftz.f32 %f45, %f43;\n" +" mul.ftz.f32 %f46, %f45, %f45;\n" +" mul.ftz.f32 %f47, %f45, %f46;\n" +" sqrt.approx.ftz.f32 %f48, %f47;\n" +" mul.ftz.f32 %f49, %f45, %f47;\n" +" ld.global.v2.f32 {%f50,%f51}, [%rd32+0];\n" +" mul.ftz.f32 %f52, %f50, %f48;\n" +" sub.ftz.f32 %f53, %f52, %f51;\n" +" mul.ftz.f32 %f54, %f49, %f53;\n" +" mul.ftz.f32 %f55, %f29, %f54;\n" +" .loc 16 156 0\n" +" fma.rn.ftz.f32 %f27, %f39, %f55, %f27;\n" +" .loc 16 157 0\n" +" fma.rn.ftz.f32 %f26, %f38, %f55, %f26;\n" +" .loc 16 158 0\n" +" fma.rn.ftz.f32 %f25, %f40, %f55, %f25;\n" +" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r43, 0;\n" +" setp.le.s32 %p5, %r42, %r43;\n" +" @%p5 bra $Lt_0_21250;\n" +" .loc 16 162 0\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd34, %rd33, %rd31;\n" +" ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd34+0];\n" +" mul.ftz.f32 %f59, %f56, %f48;\n" +" sub.ftz.f32 %f60, %f59, %f57;\n" +" mul.ftz.f32 %f61, %f47, %f60;\n" +" sub.ftz.f32 %f62, %f61, %f58;\n" +" fma.rn.ftz.f32 %f28, %f29, %f62, %f28;\n" +"$Lt_0_21250:\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p6, %r44, %r45;\n" +" @%p6 bra $Lt_0_21762;\n" +" .loc 16 165 0\n" +" mov.f32 %f63, %f6;\n" +" mul.ftz.f32 %f64, %f39, %f39;\n" +" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n" +" mov.f32 %f6, %f65;\n" +" .loc 16 166 0\n" +" mov.f32 %f66, %f8;\n" +" fma.rn.ftz.f32 %f67, %f55, %f41, %f66;\n" +" mov.f32 %f8, %f67;\n" +" .loc 16 167 0\n" +" mov.f32 %f68, %f10;\n" +" mul.ftz.f32 %f69, %f40, %f40;\n" +" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n" +" mov.f32 %f10, %f70;\n" +" .loc 16 168 0\n" +" mov.f32 %f71, %f12;\n" +" mul.ftz.f32 %f72, %f38, %f39;\n" +" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n" +" mov.f32 %f12, %f73;\n" +" .loc 16 169 0\n" +" mov.f32 %f74, %f14;\n" +" mul.ftz.f32 %f75, %f39, %f40;\n" +" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n" +" mov.f32 %f14, %f76;\n" +" .loc 16 170 0\n" +" mul.ftz.f32 %f77, %f38, %f40;\n" +" fma.rn.ftz.f32 %f15, %f55, %f77, %f15;\n" +" mov.f32 %f16, %f15;\n" +"$Lt_0_21762:\n" +"$Lt_0_20738:\n" +" .loc 16 133 0\n" +" mul.lo.u64 %rd35, %rd24, 4;\n" +" add.u64 
%rd16, %rd16, %rd35;\n" +" setp.lt.u64 %p7, %rd16, %rd13;\n" +" @%p7 bra $Lt_0_20482;\n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_28162:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_19202:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +"$Lt_0_18946:\n" +" mov.u32 %r46, 1;\n" +" setp.le.s32 %p8, %r1, %r46;\n" +" @%p8 bra $Lt_0_24578;\n" +" .loc 16 181 0\n" +" mov.u64 %rd36, __cuda___cuda_local_var_32582_35_non_const_red_acc108;\n" +" cvt.s64.s32 %rd37, %r2;\n" +" mul.wide.s32 %rd38, %r2, 4;\n" +" add.u64 %rd39, %rd36, %rd38;\n" +" mov.f32 %f78, %f27;\n" +" st.shared.f32 [%rd39+0], %f78;\n" +" .loc 16 182 0\n" +" mov.f32 %f79, %f26;\n" +" st.shared.f32 [%rd39+512], %f79;\n" +" .loc 16 183 0\n" +" mov.f32 %f80, %f25;\n" +" st.shared.f32 [%rd39+1024], %f80;\n" +" .loc 16 184 0\n" +" mov.f32 %f81, %f28;\n" +" st.shared.f32 [%rd39+1536], %f81;\n" +" .loc 16 186 0\n" +" shr.s32 %r47, %r1, 31;\n" +" mov.s32 %r48, 1;\n" +" and.b32 %r49, %r47, %r48;\n" +" add.s32 %r50, %r49, %r1;\n" +" shr.s32 %r51, %r50, 1;\n" +" mov.s32 %r52, %r51;\n" +" mov.u32 %r53, 0;\n" +" setp.ne.u32 %p9, %r51, %r53;\n" +" @!%p9 bra $Lt_0_23042;\n" +"$Lt_0_23554:\n" +" setp.ge.u32 %p10, %r6, %r52;\n" +" @%p10 bra $Lt_0_23810;\n" +" .loc 16 189 0\n" +" add.u32 %r54, %r2, %r52;\n" +" cvt.u64.u32 %rd40, %r54;\n" +" mul.wide.u32 %rd41, %r54, 4;\n" +" add.u64 %rd42, %rd36, %rd41;\n" +" ld.shared.f32 %f82, [%rd42+0];\n" +" add.ftz.f32 %f78, %f82, %f78;\n" +" st.shared.f32 [%rd39+0], %f78;\n" +" ld.shared.f32 %f83, [%rd42+512];\n" +" add.ftz.f32 %f79, %f83, %f79;\n" +" st.shared.f32 [%rd39+512], %f79;\n" +" ld.shared.f32 %f84, [%rd42+1024];\n" +" add.ftz.f32 %f80, %f84, %f80;\n" +" st.shared.f32 [%rd39+1024], %f80;\n" +" ld.shared.f32 %f85, [%rd42+1536];\n" +" add.ftz.f32 %f81, %f85, %f81;\n" +" st.shared.f32 [%rd39+1536], %f81;\n" +"$Lt_0_23810:\n" +" .loc 16 186 0\n" +" shr.u32 %r52, %r52, 1;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p11, %r52, %r55;\n" +" @%p11 bra $Lt_0_23554;\n" +"$Lt_0_23042:\n" +" .loc 16 193 0\n" +" mov.f32 %f27, %f78;\n" +" .loc 16 194 0\n" +" mov.f32 %f26, %f79;\n" +" .loc 16 195 0\n" +" mov.f32 %f25, %f80;\n" +" .loc 16 196 0\n" +" mov.f32 %f28, %f81;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p12, %r56, %r57;\n" +" @%p12 bra $Lt_0_24578;\n" +" .loc 16 200 0\n" +" mov.f32 %f78, %f6;\n" +" st.shared.f32 [%rd39+0], %f78;\n" +" mov.f32 %f79, %f8;\n" +" st.shared.f32 [%rd39+512], %f79;\n" +" mov.f32 %f80, %f10;\n" +" st.shared.f32 [%rd39+1024], %f80;\n" +" mov.f32 %f81, %f12;\n" +" st.shared.f32 [%rd39+1536], %f81;\n" +" mov.f32 %f86, %f14;\n" +" st.shared.f32 [%rd39+2048], %f86;\n" +" mov.f32 %f87, %f16;\n" +" st.shared.f32 [%rd39+2560], %f87;\n" +" .loc 16 202 0\n" +" mov.s32 %r58, %r51;\n" +" @!%p9 bra $Lt_0_25090;\n" +"$Lt_0_25602:\n" +" setp.ge.u32 %p13, %r6, %r58;\n" +" @%p13 bra $Lt_0_25858;\n" +" .loc 16 205 0\n" +" add.u32 %r59, %r2, %r58;\n" +" cvt.u64.u32 %rd43, %r59;\n" +" mul.wide.u32 %rd44, %r59, 4;\n" +" add.u64 %rd45, %rd36, %rd44;\n" +" ld.shared.f32 %f88, [%rd45+0];\n" +" add.ftz.f32 %f78, %f88, %f78;\n" +" st.shared.f32 [%rd39+0], %f78;\n" +" ld.shared.f32 %f89, [%rd45+512];\n" +" add.ftz.f32 %f79, %f89, %f79;\n" +" st.shared.f32 [%rd39+512], %f79;\n" +" ld.shared.f32 %f90, [%rd45+1024];\n" +" add.ftz.f32 %f80, %f90, %f80;\n" +" 
st.shared.f32 [%rd39+1024], %f80;\n" +" ld.shared.f32 %f91, [%rd45+1536];\n" +" add.ftz.f32 %f81, %f91, %f81;\n" +" st.shared.f32 [%rd39+1536], %f81;\n" +" ld.shared.f32 %f92, [%rd45+2048];\n" +" add.ftz.f32 %f86, %f92, %f86;\n" +" st.shared.f32 [%rd39+2048], %f86;\n" +" ld.shared.f32 %f93, [%rd45+2560];\n" +" add.ftz.f32 %f87, %f93, %f87;\n" +" st.shared.f32 [%rd39+2560], %f87;\n" +"$Lt_0_25858:\n" +" .loc 16 202 0\n" +" shr.u32 %r58, %r58, 1;\n" +" mov.u32 %r60, 0;\n" +" setp.ne.u32 %p14, %r58, %r60;\n" +" @%p14 bra $Lt_0_25602;\n" +"$Lt_0_25090:\n" +" .loc 16 210 0\n" +" mov.f32 %f6, %f78;\n" +" mov.f32 %f8, %f79;\n" +" mov.f32 %f10, %f80;\n" +" mov.f32 %f12, %f81;\n" +" mov.f32 %f14, %f86;\n" +" mov.f32 %f16, %f87;\n" +"$Lt_0_24578:\n" +"$Lt_0_22530:\n" +" selp.s32 %r61, 1, 0, %p1;\n" +" mov.s32 %r62, 0;\n" +" set.eq.u32.s32 %r63, %r6, %r62;\n" +" neg.s32 %r64, %r63;\n" +" and.b32 %r65, %r61, %r64;\n" +" mov.u32 %r66, 0;\n" +" setp.eq.s32 %p15, %r65, %r66;\n" +" @%p15 bra $Lt_0_26626;\n" +" .loc 16 216 0\n" +" cvt.s64.s32 %rd46, %r9;\n" +" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd48, %r9, 4;\n" +" add.u64 %rd49, %rd47, %rd48;\n" +" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r68, 0;\n" +" setp.le.s32 %p16, %r67, %r68;\n" +" @%p16 bra $Lt_0_27138;\n" +" .loc 16 218 0\n" +" st.global.f32 [%rd49+0], %f28;\n" +" .loc 16 219 0\n" +" cvt.s64.s32 %rd50, %r10;\n" +" mul.wide.s32 %rd51, %r10, 4;\n" +" add.u64 %rd49, %rd49, %rd51;\n" +"$Lt_0_27138:\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p17, %r69, %r70;\n" +" @%p17 bra $Lt_0_27650;\n" +" .loc 16 223 0\n" +" mov.f32 %f94, %f6;\n" +" st.global.f32 [%rd49+0], %f94;\n" +" .loc 16 224 0\n" +" cvt.s64.s32 %rd52, %r10;\n" +" mul.wide.s32 %rd53, %r10, 4;\n" +" add.u64 %rd54, %rd53, %rd49;\n" +" .loc 16 223 0\n" +" mov.f32 %f95, %f8;\n" +" st.global.f32 [%rd54+0], %f95;\n" +" .loc 16 224 0\n" +" add.u64 %rd55, %rd53, %rd54;\n" +" .loc 16 223 0\n" +" mov.f32 %f96, %f10;\n" +" st.global.f32 [%rd55+0], %f96;\n" +" .loc 16 224 0\n" +" add.u64 %rd56, %rd53, %rd55;\n" +" .loc 16 223 0\n" +" mov.f32 %f97, %f12;\n" +" st.global.f32 [%rd56+0], %f97;\n" +" .loc 16 224 0\n" +" add.u64 %rd49, %rd53, %rd56;\n" +" .loc 16 223 0\n" +" mov.f32 %f98, %f14;\n" +" st.global.f32 [%rd49+0], %f98;\n" +" mov.f32 %f99, %f16;\n" +" add.u64 %rd57, %rd53, %rd49;\n" +" st.global.f32 [%rd57+0], %f99;\n" +"$Lt_0_27650:\n" +" .loc 16 227 0\n" +" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd59, %rd46, 16;\n" +" add.u64 %rd60, %rd58, %rd59;\n" +" mov.f32 %f100, %f101;\n" +" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f100};\n" +"$Lt_0_26626:\n" +" .loc 16 229 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg 
.u32 %r<74>;\n" +" .reg .u64 %rd<74>;\n" +" .reg .f32 %f<109>;\n" +" .reg .pred %p<22>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj13296[1936];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32647_34_non_const_lj35232[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32737_35_non_const_red_acc7168[3072];\n" +" .loc 16 237 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 3;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_21250;\n" +" .loc 16 247 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_21250:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_21762;\n" +" .loc 16 249 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;\n" +" cvt.s64.s32 %rd8, %r1;\n" +" mul.wide.s32 %rd9, %r1, 16;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" add.u64 %rd11, %rd10, %rd9;\n" +" add.u64 %rd12, %rd9, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" +" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r5, 0;\n" +" setp.le.s32 %p3, %r4, %r5;\n" +" @%p3 bra $Lt_1_22274;\n" +" .loc 16 251 0\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd15, %rd14, %rd9;\n" +" add.u64 %rd16, %rd9, %rd13;\n" +" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" +" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" +"$Lt_1_22274:\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;\n" +"$Lt_1_21762:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;\n" +" .loc 16 261 0\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" mov.f32 %f20, 0f00000000; \n" +" mov.f32 %f21, %f20;\n" +" .loc 16 263 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r7, %r1, %r6;\n" +" cvt.s32.u32 %r8, %ntid.x;\n" +" div.s32 %r9, %r8, %r6;\n" +" rem.s32 %r10, %r1, %r6;\n" +" cvt.s32.u32 %r11, %ctaid.x;\n" +" mul.lo.s32 %r12, %r11, %r9;\n" +" add.s32 %r13, %r7, %r12;\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p4, %r13, %r14;\n" +" @!%p4 bra $Lt_1_23042;\n" +" .loc 16 269 0\n" +" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd17, %r15;\n" +" mul.wide.s32 %rd18, %r15, 4;\n" +" cvt.s64.s32 %rd19, %r13;\n" +" mul.wide.s32 %rd20, %r13, 4;\n" +" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd22, %rd20, %rd21;\n" +" add.u64 %rd23, %rd18, %rd22;\n" +" ld.global.s32 %r16, [%rd23+0];\n" +" add.u64 %rd24, %rd18, %rd23;\n" +" ld.param.u64 %rd25, 
[__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p5, %rd25, %rd21;\n" +" @%p5 bra $Lt_1_23554;\n" +" .loc 16 275 0\n" +" cvt.s32.s64 %r17, %rd17;\n" +" mul.lo.s32 %r18, %r17, %r16;\n" +" cvt.s64.s32 %rd26, %r18;\n" +" mul.wide.s32 %rd27, %r18, 4;\n" +" add.u64 %rd28, %rd24, %rd27;\n" +" .loc 16 276 0\n" +" mul.lo.s32 %r19, %r10, %r17;\n" +" cvt.s64.s32 %rd29, %r19;\n" +" mul.wide.s32 %rd30, %r19, 4;\n" +" add.u64 %rd31, %rd24, %rd30;\n" +" .loc 16 277 0\n" +" mul.lo.s32 %r20, %r17, %r6;\n" +" bra.uni $Lt_1_23298;\n" +"$Lt_1_23554:\n" +" .loc 16 279 0\n" +" ld.global.s32 %r21, [%rd24+0];\n" +" cvt.s64.s32 %rd32, %r21;\n" +" mul.wide.s32 %rd33, %r21, 4;\n" +" add.u64 %rd34, %rd25, %rd33;\n" +" .loc 16 280 0\n" +" cvt.s64.s32 %rd35, %r16;\n" +" mul.wide.s32 %rd36, %r16, 4;\n" +" add.u64 %rd28, %rd34, %rd36;\n" +" .loc 16 281 0\n" +" mov.s32 %r20, %r6;\n" +" .loc 16 282 0\n" +" cvt.s64.s32 %rd37, %r10;\n" +" mul.wide.s32 %rd38, %r10, 4;\n" +" add.u64 %rd31, %rd34, %rd38;\n" +"$Lt_1_23298:\n" +" .loc 16 285 0\n" +" ld.global.s32 %r22, [%rd22+0];\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" mov.f32 %f29, %f25;\n" +" setp.ge.u64 %p6, %rd31, %rd28;\n" +" @%p6 bra $Lt_1_32002;\n" +" cvt.rzi.ftz.s32.f32 %r30, %f29;\n" +" cvt.s64.s32 %rd39, %r20;\n" +" mul.lo.s32 %r31, %r30, 11;\n" +" cvt.rn.f32.s32 %f30, %r31;\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +"$Lt_1_24322:\n" +" .loc 16 292 0\n" +" ld.global.s32 %r32, [%rd31+0];\n" +" .loc 16 296 0\n" +" and.b32 %r33, %r32, 1073741823;\n" +" mov.u32 %r34, %r33;\n" +" mov.s32 %r35, 0;\n" +" mov.u32 %r36, %r35;\n" +" mov.s32 %r37, 0;\n" +" mov.u32 %r38, %r37;\n" +" mov.s32 %r39, 0;\n" +" mov.u32 %r40, %r39;\n" +" tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r34,%r36,%r38,%r40}];\n" +" mov.f32 %f39, %f35;\n" +" mov.f32 %f40, %f36;\n" +" mov.f32 %f41, %f37;\n" +" mov.f32 %f42, %f38;\n" +" sub.ftz.f32 %f43, %f27, %f40;\n" +" sub.ftz.f32 %f44, %f26, %f39;\n" +" sub.ftz.f32 %f45, %f28, %f41;\n" +" mul.ftz.f32 %f46, %f43, %f43;\n" +" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n" +" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" +" add.ftz.f32 %f49, %f30, %f42;\n" +" cvt.rzi.ftz.s32.f32 %r41, %f49;\n" +" cvt.s64.s32 %rd40, %r41;\n" +" mul.wide.s32 %rd41, %r41, 16;\n" +" add.u64 %rd42, %rd41, %rd7;\n" +" ld.shared.f32 %f50, [%rd42+8];\n" +" setp.gt.ftz.f32 %p7, %f50, %f48;\n" +" @!%p7 bra $Lt_1_25602;\n" +" .loc 16 309 0\n" +" rcp.approx.ftz.f32 %f51, %f48;\n" +" mul.ftz.f32 %f52, %f51, %f51;\n" +" mul.ftz.f32 %f53, %f51, %f52;\n" +" sqrt.approx.ftz.f32 %f54, %f53;\n" +" mul.ftz.f32 %f55, %f51, %f53;\n" +" ld.shared.v2.f32 {%f56,%f57}, [%rd42+0];\n" +" mul.ftz.f32 %f58, %f56, %f54;\n" +" sub.ftz.f32 %f59, %f58, %f57;\n" +" mul.ftz.f32 %f60, %f55, %f59;\n" +" .loc 16 311 0\n" +" fma.rn.ftz.f32 %f33, %f44, %f60, %f33;\n" +" .loc 16 312 0\n" +" fma.rn.ftz.f32 %f32, %f43, %f60, %f32;\n" +" .loc 16 313 0\n" +" fma.rn.ftz.f32 %f31, %f45, %f60, %f31;\n" +" ld.param.s32 %r42, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r43, 0;\n" +" setp.le.s32 %p8, %r42, %r43;\n" +" @%p8 bra $Lt_1_25090;\n" +" .loc 16 316 0\n" +" add.u64 %rd43, %rd41, %rd13;\n" +" ld.shared.v4.f32 
{%f61,%f62,%f63,_}, [%rd43+0];\n" +" mul.ftz.f32 %f64, %f61, %f54;\n" +" sub.ftz.f32 %f65, %f64, %f62;\n" +" mul.ftz.f32 %f66, %f53, %f65;\n" +" .loc 16 317 0\n" +" shr.s32 %r44, %r32, 30;\n" +" and.b32 %r45, %r44, 3;\n" +" cvt.s64.s32 %rd44, %r45;\n" +" mul.wide.s32 %rd45, %r45, 4;\n" +" add.u64 %rd46, %rd1, %rd45;\n" +" ld.shared.f32 %f67, [%rd46+0];\n" +" sub.ftz.f32 %f68, %f66, %f63;\n" +" fma.rn.ftz.f32 %f34, %f67, %f68, %f34;\n" +"$Lt_1_25090:\n" +" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r47, 0;\n" +" setp.le.s32 %p9, %r46, %r47;\n" +" @%p9 bra $Lt_1_25602;\n" +" .loc 16 320 0\n" +" mov.f32 %f69, %f11;\n" +" mul.ftz.f32 %f70, %f44, %f44;\n" +" fma.rn.ftz.f32 %f71, %f60, %f70, %f69;\n" +" mov.f32 %f11, %f71;\n" +" .loc 16 321 0\n" +" mov.f32 %f72, %f13;\n" +" fma.rn.ftz.f32 %f73, %f60, %f46, %f72;\n" +" mov.f32 %f13, %f73;\n" +" .loc 16 322 0\n" +" mov.f32 %f74, %f15;\n" +" mul.ftz.f32 %f75, %f45, %f45;\n" +" fma.rn.ftz.f32 %f76, %f60, %f75, %f74;\n" +" mov.f32 %f15, %f76;\n" +" .loc 16 323 0\n" +" mov.f32 %f77, %f17;\n" +" mul.ftz.f32 %f78, %f43, %f44;\n" +" fma.rn.ftz.f32 %f79, %f60, %f78, %f77;\n" +" mov.f32 %f17, %f79;\n" +" .loc 16 324 0\n" +" mov.f32 %f80, %f19;\n" +" mul.ftz.f32 %f81, %f44, %f45;\n" +" fma.rn.ftz.f32 %f82, %f60, %f81, %f80;\n" +" mov.f32 %f19, %f82;\n" +" .loc 16 325 0\n" +" mul.ftz.f32 %f83, %f43, %f45;\n" +" fma.rn.ftz.f32 %f20, %f60, %f83, %f20;\n" +" mov.f32 %f21, %f20;\n" +"$Lt_1_25602:\n" +"$Lt_1_24578:\n" +" .loc 16 290 0\n" +" mul.lo.u64 %rd47, %rd39, 4;\n" +" add.u64 %rd31, %rd31, %rd47;\n" +" setp.lt.u64 %p10, %rd31, %rd28;\n" +" @%p10 bra $Lt_1_24322;\n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_32002:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_23042:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +"$Lt_1_22786:\n" +" mov.u32 %r48, 1;\n" +" setp.le.s32 %p11, %r6, %r48;\n" +" @%p11 bra $Lt_1_28418;\n" +" .loc 16 336 0\n" +" mov.u64 %rd48, __cuda___cuda_local_var_32737_35_non_const_red_acc7168;\n" +" cvt.s64.s32 %rd49, %r1;\n" +" mul.wide.s32 %rd50, %r1, 4;\n" +" add.u64 %rd51, %rd48, %rd50;\n" +" mov.f32 %f84, %f33;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" .loc 16 337 0\n" +" mov.f32 %f85, %f32;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" .loc 16 338 0\n" +" mov.f32 %f86, %f31;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" .loc 16 339 0\n" +" mov.f32 %f87, %f34;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +" .loc 16 341 0\n" +" shr.s32 %r49, %r6, 31;\n" +" mov.s32 %r50, 1;\n" +" and.b32 %r51, %r49, %r50;\n" +" add.s32 %r52, %r51, %r6;\n" +" shr.s32 %r53, %r52, 1;\n" +" mov.s32 %r54, %r53;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p12, %r53, %r55;\n" +" @!%p12 bra $Lt_1_26882;\n" +"$Lt_1_27394:\n" +" setp.ge.u32 %p13, %r10, %r54;\n" +" @%p13 bra $Lt_1_27650;\n" +" .loc 16 344 0\n" +" add.u32 %r56, %r1, %r54;\n" +" cvt.u64.u32 %rd52, %r56;\n" +" mul.wide.u32 %rd53, %r56, 4;\n" +" add.u64 %rd54, %rd48, %rd53;\n" +" ld.shared.f32 %f88, [%rd54+0];\n" +" add.ftz.f32 %f84, %f88, %f84;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" ld.shared.f32 %f89, [%rd54+512];\n" +" add.ftz.f32 %f85, %f89, %f85;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" ld.shared.f32 %f90, [%rd54+1024];\n" +" add.ftz.f32 %f86, %f90, %f86;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" ld.shared.f32 %f91, [%rd54+1536];\n" +" add.ftz.f32 %f87, %f91, 
%f87;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +"$Lt_1_27650:\n" +" .loc 16 341 0\n" +" shr.u32 %r54, %r54, 1;\n" +" mov.u32 %r57, 0;\n" +" setp.ne.u32 %p14, %r54, %r57;\n" +" @%p14 bra $Lt_1_27394;\n" +"$Lt_1_26882:\n" +" .loc 16 348 0\n" +" mov.f32 %f33, %f84;\n" +" .loc 16 349 0\n" +" mov.f32 %f32, %f85;\n" +" .loc 16 350 0\n" +" mov.f32 %f31, %f86;\n" +" .loc 16 351 0\n" +" mov.f32 %f34, %f87;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p15, %r58, %r59;\n" +" @%p15 bra $Lt_1_28418;\n" +" .loc 16 355 0\n" +" mov.f32 %f84, %f11;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" mov.f32 %f85, %f13;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" mov.f32 %f86, %f15;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" mov.f32 %f87, %f17;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +" mov.f32 %f92, %f19;\n" +" st.shared.f32 [%rd51+2048], %f92;\n" +" mov.f32 %f93, %f21;\n" +" st.shared.f32 [%rd51+2560], %f93;\n" +" .loc 16 357 0\n" +" mov.s32 %r60, %r53;\n" +" @!%p12 bra $Lt_1_28930;\n" +"$Lt_1_29442:\n" +" setp.ge.u32 %p16, %r10, %r60;\n" +" @%p16 bra $Lt_1_29698;\n" +" .loc 16 360 0\n" +" add.u32 %r61, %r1, %r60;\n" +" cvt.u64.u32 %rd55, %r61;\n" +" mul.wide.u32 %rd56, %r61, 4;\n" +" add.u64 %rd57, %rd48, %rd56;\n" +" ld.shared.f32 %f94, [%rd57+0];\n" +" add.ftz.f32 %f84, %f94, %f84;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" ld.shared.f32 %f95, [%rd57+512];\n" +" add.ftz.f32 %f85, %f95, %f85;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" ld.shared.f32 %f96, [%rd57+1024];\n" +" add.ftz.f32 %f86, %f96, %f86;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" ld.shared.f32 %f97, [%rd57+1536];\n" +" add.ftz.f32 %f87, %f97, %f87;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +" ld.shared.f32 %f98, [%rd57+2048];\n" +" add.ftz.f32 %f92, %f98, %f92;\n" +" st.shared.f32 [%rd51+2048], %f92;\n" +" ld.shared.f32 %f99, [%rd57+2560];\n" +" add.ftz.f32 %f93, %f99, %f93;\n" +" st.shared.f32 [%rd51+2560], %f93;\n" +"$Lt_1_29698:\n" +" .loc 16 357 0\n" +" shr.u32 %r60, %r60, 1;\n" +" mov.u32 %r62, 0;\n" +" setp.ne.u32 %p17, %r60, %r62;\n" +" @%p17 bra $Lt_1_29442;\n" +"$Lt_1_28930:\n" +" .loc 16 365 0\n" +" mov.f32 %f11, %f84;\n" +" mov.f32 %f13, %f85;\n" +" mov.f32 %f15, %f86;\n" +" mov.f32 %f17, %f87;\n" +" mov.f32 %f19, %f92;\n" +" mov.f32 %f21, %f93;\n" +"$Lt_1_28418:\n" +"$Lt_1_26370:\n" +" selp.s32 %r63, 1, 0, %p4;\n" +" mov.s32 %r64, 0;\n" +" set.eq.u32.s32 %r65, %r10, %r64;\n" +" neg.s32 %r66, %r65;\n" +" and.b32 %r67, %r63, %r66;\n" +" mov.u32 %r68, 0;\n" +" setp.eq.s32 %p18, %r67, %r68;\n" +" @%p18 bra $Lt_1_30466;\n" +" .loc 16 371 0\n" +" cvt.s64.s32 %rd58, %r13;\n" +" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd60, %r13, 4;\n" +" add.u64 %rd61, %rd59, %rd60;\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p19, %r69, %r70;\n" +" @%p19 bra $Lt_1_30978;\n" +" .loc 16 373 0\n" +" st.global.f32 [%rd61+0], %f34;\n" +" .loc 16 374 0\n" +" cvt.s64.s32 %rd62, %r14;\n" +" mul.wide.s32 %rd63, %r14, 4;\n" +" add.u64 %rd61, %rd61, %rd63;\n" +"$Lt_1_30978:\n" +" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r72, 0;\n" +" setp.le.s32 %p20, %r71, %r72;\n" +" @%p20 bra $Lt_1_31490;\n" +" .loc 16 378 0\n" +" mov.f32 %f100, %f11;\n" +" st.global.f32 [%rd61+0], %f100;\n" +" .loc 16 379 0\n" +" cvt.s64.s32 %rd64, %r14;\n" +" mul.wide.s32 %rd65, %r14, 4;\n" +" add.u64 %rd66, %rd65, %rd61;\n" +" .loc 16 378 0\n" +" mov.f32 %f101, %f13;\n" +" st.global.f32 [%rd66+0], 
%f101;\n" +" .loc 16 379 0\n" +" add.u64 %rd67, %rd65, %rd66;\n" +" .loc 16 378 0\n" +" mov.f32 %f102, %f15;\n" +" st.global.f32 [%rd67+0], %f102;\n" +" .loc 16 379 0\n" +" add.u64 %rd68, %rd65, %rd67;\n" +" .loc 16 378 0\n" +" mov.f32 %f103, %f17;\n" +" st.global.f32 [%rd68+0], %f103;\n" +" .loc 16 379 0\n" +" add.u64 %rd61, %rd65, %rd68;\n" +" .loc 16 378 0\n" +" mov.f32 %f104, %f19;\n" +" st.global.f32 [%rd61+0], %f104;\n" +" mov.f32 %f105, %f21;\n" +" add.u64 %rd69, %rd65, %rd61;\n" +" st.global.f32 [%rd69+0], %f105;\n" +"$Lt_1_31490:\n" +" .loc 16 382 0\n" +" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd71, %rd58, 16;\n" +" add.u64 %rd72, %rd70, %rd71;\n" +" mov.f32 %f106, %f107;\n" +" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};\n" +"$Lt_1_30466:\n" +" .loc 16 384 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/lj_cut_gpu_kernel.ptx b/lib/gpu/lj_cut_gpu_kernel.ptx new file mode 100644 index 000000000..a2bdb5c3d --- /dev/null +++ b/lib/gpu/lj_cut_gpu_kernel.ptx @@ -0,0 +1,979 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bd91_00000000-9_lj_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.gvU1PY) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bd91_00000000-8_lj_cut_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "lj_cut_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 
__cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<72>; + .reg .u64 %rd<62>; + .reg .f32 %f<102>; + .reg .pred %p<19>; + .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16]; + .shared .align 4 .b8 __cuda___cuda_local_var_32581_35_non_const_red_acc108[3072]; + // __cuda_local_var_32504_10_non_const_f = 48 + // __cuda_local_var_32508_9_non_const_virial = 16 + .loc 16 88 0 +$LDWbegin_kernel_pair: + .loc 16 95 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 96 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 97 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 98 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; + .loc 16 107 0 + mov.f32 %f5, 0f00000000; // 0 + mov.f32 %f6, %f5; + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, %f7; + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_19202; + .loc 16 113 0 + ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd2, %r11; + mul.wide.s32 %rd3, %r11, 4; + cvt.s64.s32 %rd4, %r9; + mul.wide.s32 %rd5, %r9, 4; + ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd7, %rd5, %rd6; + add.u64 %rd8, %rd3, %rd7; + ld.global.s32 %r12, [%rd8+0]; + add.u64 %rd9, %rd3, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd6; + @%p2 bra $Lt_0_19714; + .loc 16 119 0 + cvt.s32.s64 %r13, %rd2; + mul.lo.s32 %r14, %r13, %r12; + cvt.s64.s32 %rd11, %r14; + mul.wide.s32 %rd12, %r14, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 120 0 + mul.lo.s32 %r15, %r6, %r13; + cvt.s64.s32 %rd14, %r15; + mul.wide.s32 %rd15, %r15, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 121 0 + mul.lo.s32 %r16, %r13, %r1; + bra.uni $Lt_0_19458; +$Lt_0_19714: + .loc 16 123 0 + ld.global.s32 %r17, [%rd9+0]; + cvt.s64.s32 %rd17, %r17; + mul.wide.s32 %rd18, %r17, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 124 0 + cvt.s64.s32 %rd20, %r12; + mul.wide.s32 %rd21, %r12, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 125 0 + mov.s32 %r16, %r1; + .loc 16 126 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_19458: + .loc 16 129 0 + ld.global.s32 %r18, [%rd7+0]; + mov.u32 %r19, %r18; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f21, %f17; + mov.f32 %f22, %f18; + mov.f32 %f23, %f19; + mov.f32 %f24, %f20; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_28162; + 
cvt.rzi.ftz.s32.f32 %r26, %f24; + cvt.s64.s32 %rd24, %r16; + ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r28, %r27, %r26; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1]; + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92; +$Lt_0_20482: + //<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown + .loc 16 135 0 + ld.global.s32 %r29, [%rd16+0]; + .loc 16 136 0 + shr.s32 %r30, %r29, 30; + and.b32 %r31, %r30, 3; + cvt.s64.s32 %rd27, %r31; + mul.wide.s32 %rd28, %r31, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f29, [%rd29+0]; + .loc 16 139 0 + and.b32 %r32, %r29, 1073741823; + mov.u32 %r33, %r32; + mov.s32 %r34, 0; + mov.u32 %r35, %r34; + mov.s32 %r36, 0; + mov.u32 %r37, %r36; + mov.s32 %r38, 0; + mov.u32 %r39, %r38; + tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}]; + mov.f32 %f34, %f30; + mov.f32 %f35, %f31; + mov.f32 %f36, %f32; + mov.f32 %f37, %f33; + cvt.rzi.ftz.s32.f32 %r40, %f37; + sub.ftz.f32 %f38, %f22, %f35; + sub.ftz.f32 %f39, %f21, %f34; + sub.ftz.f32 %f40, %f23, %f36; + mul.ftz.f32 %f41, %f38, %f38; + fma.rn.ftz.f32 %f42, %f39, %f39, %f41; + fma.rn.ftz.f32 %f43, %f40, %f40, %f42; + add.s32 %r41, %r40, %r28; + cvt.s64.s32 %rd30, %r41; + mul.wide.s32 %rd31, %r41, 16; + add.u64 %rd32, %rd31, %rd25; + ld.global.f32 %f44, [%rd32+8]; + setp.gt.ftz.f32 %p4, %f44, %f43; + @!%p4 bra $Lt_0_21762; + .loc 16 153 0 + rcp.approx.ftz.f32 %f45, %f43; + mul.ftz.f32 %f46, %f45, %f45; + mul.ftz.f32 %f47, %f45, %f46; + mul.ftz.f32 %f48, %f45, %f47; + ld.global.v2.f32 {%f49,%f50}, [%rd32+0]; + mul.ftz.f32 %f51, %f49, %f47; + sub.ftz.f32 %f52, %f51, %f50; + mul.ftz.f32 %f53, %f48, %f52; + mul.ftz.f32 %f54, %f29, %f53; + .loc 16 155 0 + fma.rn.ftz.f32 %f27, %f39, %f54, %f27; + .loc 16 156 0 + fma.rn.ftz.f32 %f26, %f38, %f54, %f26; + .loc 16 157 0 + fma.rn.ftz.f32 %f25, %f40, %f54, %f25; + ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r43, 0; + setp.le.s32 %p5, %r42, %r43; + @%p5 bra $Lt_0_21250; + .loc 16 161 0 + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd34, %rd33, %rd31; + ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd34+0]; + mul.ftz.f32 %f58, %f55, %f47; + sub.ftz.f32 %f59, %f58, %f56; + mul.ftz.f32 %f60, %f47, %f59; + sub.ftz.f32 %f61, %f60, %f57; + fma.rn.ftz.f32 %f28, %f29, %f61, %f28; +$Lt_0_21250: + ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r45, 0; + setp.le.s32 %p6, %r44, %r45; + @%p6 bra $Lt_0_21762; + .loc 16 164 0 + mov.f32 %f62, %f6; + mul.ftz.f32 %f63, %f39, %f39; + fma.rn.ftz.f32 %f64, %f54, %f63, %f62; + mov.f32 %f6, %f64; + .loc 16 165 0 + mov.f32 %f65, %f8; + fma.rn.ftz.f32 %f66, %f54, %f41, %f65; + mov.f32 %f8, %f66; + .loc 16 166 0 + mov.f32 %f67, %f10; + mul.ftz.f32 %f68, %f40, %f40; + fma.rn.ftz.f32 %f69, %f54, %f68, %f67; + mov.f32 %f10, %f69; + .loc 16 167 0 + mov.f32 %f70, %f12; + mul.ftz.f32 %f71, %f38, %f39; + fma.rn.ftz.f32 %f72, %f54, %f71, %f70; + mov.f32 %f12, %f72; + .loc 16 168 0 + mov.f32 %f73, %f14; + mul.ftz.f32 %f74, %f39, %f40; + fma.rn.ftz.f32 %f75, %f54, %f74, %f73; + mov.f32 %f14, %f75; + .loc 16 169 0 + mul.ftz.f32 %f76, %f38, %f40; + fma.rn.ftz.f32 %f15, %f54, %f76, %f15; + mov.f32 %f16, %f15; +$Lt_0_21762: +$Lt_0_20738: + .loc 16 133 0 + mul.lo.u64 %rd35, %rd24, 4; + add.u64 %rd16, %rd16, %rd35; + setp.lt.u64 %p7, %rd16, %rd13; + @%p7 bra $Lt_0_20482; + bra.uni $Lt_0_18946; +$Lt_0_28162: 
+ mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + bra.uni $Lt_0_18946; +$Lt_0_19202: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 +$Lt_0_18946: + mov.u32 %r46, 1; + setp.le.s32 %p8, %r1, %r46; + @%p8 bra $Lt_0_24578; + .loc 16 180 0 + mov.u64 %rd36, __cuda___cuda_local_var_32581_35_non_const_red_acc108; + cvt.s64.s32 %rd37, %r2; + mul.wide.s32 %rd38, %r2, 4; + add.u64 %rd39, %rd36, %rd38; + mov.f32 %f77, %f27; + st.shared.f32 [%rd39+0], %f77; + .loc 16 181 0 + mov.f32 %f78, %f26; + st.shared.f32 [%rd39+512], %f78; + .loc 16 182 0 + mov.f32 %f79, %f25; + st.shared.f32 [%rd39+1024], %f79; + .loc 16 183 0 + mov.f32 %f80, %f28; + st.shared.f32 [%rd39+1536], %f80; + .loc 16 185 0 + shr.s32 %r47, %r1, 31; + mov.s32 %r48, 1; + and.b32 %r49, %r47, %r48; + add.s32 %r50, %r49, %r1; + shr.s32 %r51, %r50, 1; + mov.s32 %r52, %r51; + mov.u32 %r53, 0; + setp.ne.u32 %p9, %r51, %r53; + @!%p9 bra $Lt_0_23042; +$Lt_0_23554: + setp.ge.u32 %p10, %r6, %r52; + @%p10 bra $Lt_0_23810; + .loc 16 188 0 + add.u32 %r54, %r2, %r52; + cvt.u64.u32 %rd40, %r54; + mul.wide.u32 %rd41, %r54, 4; + add.u64 %rd42, %rd36, %rd41; + ld.shared.f32 %f81, [%rd42+0]; + add.ftz.f32 %f77, %f81, %f77; + st.shared.f32 [%rd39+0], %f77; + ld.shared.f32 %f82, [%rd42+512]; + add.ftz.f32 %f78, %f82, %f78; + st.shared.f32 [%rd39+512], %f78; + ld.shared.f32 %f83, [%rd42+1024]; + add.ftz.f32 %f79, %f83, %f79; + st.shared.f32 [%rd39+1024], %f79; + ld.shared.f32 %f84, [%rd42+1536]; + add.ftz.f32 %f80, %f84, %f80; + st.shared.f32 [%rd39+1536], %f80; +$Lt_0_23810: + .loc 16 185 0 + shr.u32 %r52, %r52, 1; + mov.u32 %r55, 0; + setp.ne.u32 %p11, %r52, %r55; + @%p11 bra $Lt_0_23554; +$Lt_0_23042: + .loc 16 192 0 + mov.f32 %f27, %f77; + .loc 16 193 0 + mov.f32 %f26, %f78; + .loc 16 194 0 + mov.f32 %f25, %f79; + .loc 16 195 0 + mov.f32 %f28, %f80; + ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r57, 0; + setp.le.s32 %p12, %r56, %r57; + @%p12 bra $Lt_0_24578; + .loc 16 199 0 + mov.f32 %f77, %f6; + st.shared.f32 [%rd39+0], %f77; + mov.f32 %f78, %f8; + st.shared.f32 [%rd39+512], %f78; + mov.f32 %f79, %f10; + st.shared.f32 [%rd39+1024], %f79; + mov.f32 %f80, %f12; + st.shared.f32 [%rd39+1536], %f80; + mov.f32 %f85, %f14; + st.shared.f32 [%rd39+2048], %f85; + mov.f32 %f86, %f16; + st.shared.f32 [%rd39+2560], %f86; + .loc 16 201 0 + mov.s32 %r58, %r51; + @!%p9 bra $Lt_0_25090; +$Lt_0_25602: + setp.ge.u32 %p13, %r6, %r58; + @%p13 bra $Lt_0_25858; + .loc 16 204 0 + add.u32 %r59, %r2, %r58; + cvt.u64.u32 %rd43, %r59; + mul.wide.u32 %rd44, %r59, 4; + add.u64 %rd45, %rd36, %rd44; + ld.shared.f32 %f87, [%rd45+0]; + add.ftz.f32 %f77, %f87, %f77; + st.shared.f32 [%rd39+0], %f77; + ld.shared.f32 %f88, [%rd45+512]; + add.ftz.f32 %f78, %f88, %f78; + st.shared.f32 [%rd39+512], %f78; + ld.shared.f32 %f89, [%rd45+1024]; + add.ftz.f32 %f79, %f89, %f79; + st.shared.f32 [%rd39+1024], %f79; + ld.shared.f32 %f90, [%rd45+1536]; + add.ftz.f32 %f80, %f90, %f80; + st.shared.f32 [%rd39+1536], %f80; + ld.shared.f32 %f91, [%rd45+2048]; + add.ftz.f32 %f85, %f91, %f85; + st.shared.f32 [%rd39+2048], %f85; + ld.shared.f32 %f92, [%rd45+2560]; + add.ftz.f32 %f86, %f92, %f86; + st.shared.f32 [%rd39+2560], %f86; +$Lt_0_25858: + .loc 16 201 0 + shr.u32 %r58, %r58, 1; + mov.u32 %r60, 0; + setp.ne.u32 %p14, %r58, %r60; + @%p14 bra $Lt_0_25602; +$Lt_0_25090: + .loc 16 209 0 + mov.f32 %f6, %f77; + mov.f32 %f8, %f78; 
+ mov.f32 %f10, %f79; + mov.f32 %f12, %f80; + mov.f32 %f14, %f85; + mov.f32 %f16, %f86; +$Lt_0_24578: +$Lt_0_22530: + selp.s32 %r61, 1, 0, %p1; + mov.s32 %r62, 0; + set.eq.u32.s32 %r63, %r6, %r62; + neg.s32 %r64, %r63; + and.b32 %r65, %r61, %r64; + mov.u32 %r66, 0; + setp.eq.s32 %p15, %r65, %r66; + @%p15 bra $Lt_0_26626; + .loc 16 215 0 + cvt.s64.s32 %rd46, %r9; + ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd48, %r9, 4; + add.u64 %rd49, %rd47, %rd48; + ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r68, 0; + setp.le.s32 %p16, %r67, %r68; + @%p16 bra $Lt_0_27138; + .loc 16 217 0 + st.global.f32 [%rd49+0], %f28; + .loc 16 218 0 + cvt.s64.s32 %rd50, %r10; + mul.wide.s32 %rd51, %r10, 4; + add.u64 %rd49, %rd49, %rd51; +$Lt_0_27138: + ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r70, 0; + setp.le.s32 %p17, %r69, %r70; + @%p17 bra $Lt_0_27650; + .loc 16 222 0 + mov.f32 %f93, %f6; + st.global.f32 [%rd49+0], %f93; + .loc 16 223 0 + cvt.s64.s32 %rd52, %r10; + mul.wide.s32 %rd53, %r10, 4; + add.u64 %rd54, %rd53, %rd49; + .loc 16 222 0 + mov.f32 %f94, %f8; + st.global.f32 [%rd54+0], %f94; + .loc 16 223 0 + add.u64 %rd55, %rd53, %rd54; + .loc 16 222 0 + mov.f32 %f95, %f10; + st.global.f32 [%rd55+0], %f95; + .loc 16 223 0 + add.u64 %rd56, %rd53, %rd55; + .loc 16 222 0 + mov.f32 %f96, %f12; + st.global.f32 [%rd56+0], %f96; + .loc 16 223 0 + add.u64 %rd49, %rd53, %rd56; + .loc 16 222 0 + mov.f32 %f97, %f14; + st.global.f32 [%rd49+0], %f97; + mov.f32 %f98, %f16; + add.u64 %rd57, %rd53, %rd49; + st.global.f32 [%rd57+0], %f98; +$Lt_0_27650: + .loc 16 226 0 + ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd59, %rd46, 16; + add.u64 %rd60, %rd58, %rd59; + mov.f32 %f99, %f100; + st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f99}; +$Lt_0_26626: + .loc 16 228 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<74>; + .reg .u64 %rd<74>; + .reg .f32 %f<109>; + .reg .pred %p<22>; + .shared .align 4 .b8 __cuda___cuda_local_var_32647_33_non_const_sp_lj3268[16]; + .shared .align 16 .b8 __cuda___cuda_local_var_32645_34_non_const_lj13296[1936]; + .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj35232[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32735_35_non_const_red_acc7168[3072]; + // __cuda_local_var_32657_10_non_const_f = 48 + // __cuda_local_var_32661_9_non_const_virial = 16 + .loc 16 236 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 3; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_21250; + .loc 16 246 0 + mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; 
+$Lt_1_21250: + mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_21762; + .loc 16 248 0 + mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296; + cvt.s64.s32 %rd8, %r1; + mul.wide.s32 %rd9, %r1, 16; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd11, %rd10, %rd9; + add.u64 %rd12, %rd9, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; + st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r5, 0; + setp.le.s32 %p3, %r4, %r5; + @%p3 bra $Lt_1_22274; + .loc 16 250 0 + mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232; + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd15, %rd14, %rd9; + add.u64 %rd16, %rd9, %rd13; + ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; + st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; +$Lt_1_22274: + mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232; +$Lt_1_21762: + mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296; + mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232; + .loc 16 260 0 + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + mov.f32 %f20, 0f00000000; // 0 + mov.f32 %f21, %f20; + .loc 16 262 0 + bar.sync 0; + ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r7, %r1, %r6; + cvt.s32.u32 %r8, %ntid.x; + div.s32 %r9, %r8, %r6; + rem.s32 %r10, %r1, %r6; + cvt.s32.u32 %r11, %ctaid.x; + mul.lo.s32 %r12, %r11, %r9; + add.s32 %r13, %r7, %r12; + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p4, %r13, %r14; + @!%p4 bra $Lt_1_23042; + .loc 16 268 0 + ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd17, %r15; + mul.wide.s32 %rd18, %r15, 4; + cvt.s64.s32 %rd19, %r13; + mul.wide.s32 %rd20, %r13, 4; + ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd22, %rd20, %rd21; + add.u64 %rd23, %rd18, %rd22; + ld.global.s32 %r16, [%rd23+0]; + add.u64 %rd24, %rd18, %rd23; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p5, %rd25, %rd21; + @%p5 bra $Lt_1_23554; + .loc 16 274 0 + cvt.s32.s64 %r17, %rd17; + mul.lo.s32 %r18, %r17, %r16; + cvt.s64.s32 %rd26, %r18; + mul.wide.s32 %rd27, %r18, 4; + add.u64 %rd28, %rd24, %rd27; + .loc 16 275 0 + mul.lo.s32 %r19, %r10, %r17; + cvt.s64.s32 %rd29, %r19; + mul.wide.s32 %rd30, %r19, 4; + add.u64 %rd31, %rd24, %rd30; + .loc 16 276 0 + mul.lo.s32 %r20, %r17, %r6; + bra.uni $Lt_1_23298; +$Lt_1_23554: + .loc 16 278 0 + ld.global.s32 %r21, [%rd24+0]; + cvt.s64.s32 %rd32, %r21; + mul.wide.s32 %rd33, %r21, 4; + add.u64 %rd34, %rd25, %rd33; + .loc 16 279 0 + cvt.s64.s32 %rd35, %r16; + mul.wide.s32 %rd36, %r16, 4; + add.u64 %rd28, %rd34, %rd36; + .loc 16 280 0 + mov.s32 %r20, %r6; + .loc 16 281 0 + cvt.s64.s32 %rd37, %r10; + mul.wide.s32 %rd38, %r10, 4; + add.u64 %rd31, %rd34, %rd38; +$Lt_1_23298: + .loc 16 284 0 + ld.global.s32 %r22, [%rd22+0]; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + mov.s32 %r28, 0; + mov.u32 %r29, %r28; + tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}]; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + mov.f32 
%f29, %f25; + setp.ge.u64 %p6, %rd31, %rd28; + @%p6 bra $Lt_1_32002; + cvt.rzi.ftz.s32.f32 %r30, %f29; + cvt.s64.s32 %rd39, %r20; + mul.lo.s32 %r31, %r30, 11; + cvt.rn.f32.s32 %f30, %r31; + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 +$Lt_1_24322: + //<loop> Loop body line 284, nesting depth: 1, estimated iterations: unknown + .loc 16 291 0 + ld.global.s32 %r32, [%rd31+0]; + .loc 16 292 0 + shr.s32 %r33, %r32, 30; + and.b32 %r34, %r33, 3; + cvt.s64.s32 %rd40, %r34; + mul.wide.s32 %rd41, %r34, 4; + add.u64 %rd42, %rd1, %rd41; + ld.shared.f32 %f35, [%rd42+0]; + .loc 16 295 0 + and.b32 %r35, %r32, 1073741823; + mov.u32 %r36, %r35; + mov.s32 %r37, 0; + mov.u32 %r38, %r37; + mov.s32 %r39, 0; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}]; + mov.f32 %f40, %f36; + mov.f32 %f41, %f37; + mov.f32 %f42, %f38; + mov.f32 %f43, %f39; + sub.ftz.f32 %f44, %f27, %f41; + sub.ftz.f32 %f45, %f26, %f40; + sub.ftz.f32 %f46, %f28, %f42; + mul.ftz.f32 %f47, %f44, %f44; + fma.rn.ftz.f32 %f48, %f45, %f45, %f47; + fma.rn.ftz.f32 %f49, %f46, %f46, %f48; + add.ftz.f32 %f50, %f30, %f43; + cvt.rzi.ftz.s32.f32 %r43, %f50; + cvt.s64.s32 %rd43, %r43; + mul.wide.s32 %rd44, %r43, 16; + add.u64 %rd45, %rd44, %rd7; + ld.shared.f32 %f51, [%rd45+8]; + setp.gt.ftz.f32 %p7, %f51, %f49; + @!%p7 bra $Lt_1_25602; + .loc 16 307 0 + rcp.approx.ftz.f32 %f52, %f49; + mul.ftz.f32 %f53, %f52, %f52; + mul.ftz.f32 %f54, %f52, %f53; + mul.ftz.f32 %f55, %f52, %f35; + mul.ftz.f32 %f56, %f54, %f55; + ld.shared.v2.f32 {%f57,%f58}, [%rd45+0]; + mul.ftz.f32 %f59, %f57, %f54; + sub.ftz.f32 %f60, %f59, %f58; + mul.ftz.f32 %f61, %f56, %f60; + .loc 16 309 0 + fma.rn.ftz.f32 %f33, %f45, %f61, %f33; + .loc 16 310 0 + fma.rn.ftz.f32 %f32, %f44, %f61, %f32; + .loc 16 311 0 + fma.rn.ftz.f32 %f31, %f46, %f61, %f31; + ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r45, 0; + setp.le.s32 %p8, %r44, %r45; + @%p8 bra $Lt_1_25090; + .loc 16 314 0 + add.u64 %rd46, %rd44, %rd13; + ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd46+0]; + mul.ftz.f32 %f65, %f62, %f54; + sub.ftz.f32 %f66, %f65, %f63; + mul.ftz.f32 %f67, %f54, %f66; + .loc 16 315 0 + sub.ftz.f32 %f68, %f67, %f64; + fma.rn.ftz.f32 %f34, %f35, %f68, %f34; +$Lt_1_25090: + ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r47, 0; + setp.le.s32 %p9, %r46, %r47; + @%p9 bra $Lt_1_25602; + .loc 16 318 0 + mov.f32 %f69, %f11; + mul.ftz.f32 %f70, %f45, %f45; + fma.rn.ftz.f32 %f71, %f61, %f70, %f69; + mov.f32 %f11, %f71; + .loc 16 319 0 + mov.f32 %f72, %f13; + fma.rn.ftz.f32 %f73, %f61, %f47, %f72; + mov.f32 %f13, %f73; + .loc 16 320 0 + mov.f32 %f74, %f15; + mul.ftz.f32 %f75, %f46, %f46; + fma.rn.ftz.f32 %f76, %f61, %f75, %f74; + mov.f32 %f15, %f76; + .loc 16 321 0 + mov.f32 %f77, %f17; + mul.ftz.f32 %f78, %f44, %f45; + fma.rn.ftz.f32 %f79, %f61, %f78, %f77; + mov.f32 %f17, %f79; + .loc 16 322 0 + mov.f32 %f80, %f19; + mul.ftz.f32 %f81, %f45, %f46; + fma.rn.ftz.f32 %f82, %f61, %f81, %f80; + mov.f32 %f19, %f82; + .loc 16 323 0 + mul.ftz.f32 %f83, %f44, %f46; + fma.rn.ftz.f32 %f20, %f61, %f83, %f20; + mov.f32 %f21, %f20; +$Lt_1_25602: +$Lt_1_24578: + .loc 16 289 0 + mul.lo.u64 %rd47, %rd39, 4; + add.u64 %rd31, %rd31, %rd47; + setp.lt.u64 %p10, %rd31, %rd28; + @%p10 bra $Lt_1_24322; + bra.uni $Lt_1_22786; +$Lt_1_32002: + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 
0 + mov.f32 %f34, 0f00000000; // 0 + bra.uni $Lt_1_22786; +$Lt_1_23042: + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 +$Lt_1_22786: + mov.u32 %r48, 1; + setp.le.s32 %p11, %r6, %r48; + @%p11 bra $Lt_1_28418; + .loc 16 334 0 + mov.u64 %rd48, __cuda___cuda_local_var_32735_35_non_const_red_acc7168; + cvt.s64.s32 %rd49, %r1; + mul.wide.s32 %rd50, %r1, 4; + add.u64 %rd51, %rd48, %rd50; + mov.f32 %f84, %f33; + st.shared.f32 [%rd51+0], %f84; + .loc 16 335 0 + mov.f32 %f85, %f32; + st.shared.f32 [%rd51+512], %f85; + .loc 16 336 0 + mov.f32 %f86, %f31; + st.shared.f32 [%rd51+1024], %f86; + .loc 16 337 0 + mov.f32 %f87, %f34; + st.shared.f32 [%rd51+1536], %f87; + .loc 16 339 0 + shr.s32 %r49, %r6, 31; + mov.s32 %r50, 1; + and.b32 %r51, %r49, %r50; + add.s32 %r52, %r51, %r6; + shr.s32 %r53, %r52, 1; + mov.s32 %r54, %r53; + mov.u32 %r55, 0; + setp.ne.u32 %p12, %r53, %r55; + @!%p12 bra $Lt_1_26882; +$Lt_1_27394: + setp.ge.u32 %p13, %r10, %r54; + @%p13 bra $Lt_1_27650; + .loc 16 342 0 + add.u32 %r56, %r1, %r54; + cvt.u64.u32 %rd52, %r56; + mul.wide.u32 %rd53, %r56, 4; + add.u64 %rd54, %rd48, %rd53; + ld.shared.f32 %f88, [%rd54+0]; + add.ftz.f32 %f84, %f88, %f84; + st.shared.f32 [%rd51+0], %f84; + ld.shared.f32 %f89, [%rd54+512]; + add.ftz.f32 %f85, %f89, %f85; + st.shared.f32 [%rd51+512], %f85; + ld.shared.f32 %f90, [%rd54+1024]; + add.ftz.f32 %f86, %f90, %f86; + st.shared.f32 [%rd51+1024], %f86; + ld.shared.f32 %f91, [%rd54+1536]; + add.ftz.f32 %f87, %f91, %f87; + st.shared.f32 [%rd51+1536], %f87; +$Lt_1_27650: + .loc 16 339 0 + shr.u32 %r54, %r54, 1; + mov.u32 %r57, 0; + setp.ne.u32 %p14, %r54, %r57; + @%p14 bra $Lt_1_27394; +$Lt_1_26882: + .loc 16 346 0 + mov.f32 %f33, %f84; + .loc 16 347 0 + mov.f32 %f32, %f85; + .loc 16 348 0 + mov.f32 %f31, %f86; + .loc 16 349 0 + mov.f32 %f34, %f87; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p15, %r58, %r59; + @%p15 bra $Lt_1_28418; + .loc 16 353 0 + mov.f32 %f84, %f11; + st.shared.f32 [%rd51+0], %f84; + mov.f32 %f85, %f13; + st.shared.f32 [%rd51+512], %f85; + mov.f32 %f86, %f15; + st.shared.f32 [%rd51+1024], %f86; + mov.f32 %f87, %f17; + st.shared.f32 [%rd51+1536], %f87; + mov.f32 %f92, %f19; + st.shared.f32 [%rd51+2048], %f92; + mov.f32 %f93, %f21; + st.shared.f32 [%rd51+2560], %f93; + .loc 16 355 0 + mov.s32 %r60, %r53; + @!%p12 bra $Lt_1_28930; +$Lt_1_29442: + setp.ge.u32 %p16, %r10, %r60; + @%p16 bra $Lt_1_29698; + .loc 16 358 0 + add.u32 %r61, %r1, %r60; + cvt.u64.u32 %rd55, %r61; + mul.wide.u32 %rd56, %r61, 4; + add.u64 %rd57, %rd48, %rd56; + ld.shared.f32 %f94, [%rd57+0]; + add.ftz.f32 %f84, %f94, %f84; + st.shared.f32 [%rd51+0], %f84; + ld.shared.f32 %f95, [%rd57+512]; + add.ftz.f32 %f85, %f95, %f85; + st.shared.f32 [%rd51+512], %f85; + ld.shared.f32 %f96, [%rd57+1024]; + add.ftz.f32 %f86, %f96, %f86; + st.shared.f32 [%rd51+1024], %f86; + ld.shared.f32 %f97, [%rd57+1536]; + add.ftz.f32 %f87, %f97, %f87; + st.shared.f32 [%rd51+1536], %f87; + ld.shared.f32 %f98, [%rd57+2048]; + add.ftz.f32 %f92, %f98, %f92; + st.shared.f32 [%rd51+2048], %f92; + ld.shared.f32 %f99, [%rd57+2560]; + add.ftz.f32 %f93, %f99, %f93; + st.shared.f32 [%rd51+2560], %f93; +$Lt_1_29698: + .loc 16 355 0 + shr.u32 %r60, %r60, 1; + mov.u32 %r62, 0; + setp.ne.u32 %p17, %r60, %r62; + @%p17 bra $Lt_1_29442; +$Lt_1_28930: + .loc 16 363 0 + mov.f32 %f11, %f84; + mov.f32 %f13, %f85; + mov.f32 %f15, %f86; + mov.f32 %f17, %f87; + mov.f32 %f19, %f92; + mov.f32 
%f21, %f93; +$Lt_1_28418: +$Lt_1_26370: + selp.s32 %r63, 1, 0, %p4; + mov.s32 %r64, 0; + set.eq.u32.s32 %r65, %r10, %r64; + neg.s32 %r66, %r65; + and.b32 %r67, %r63, %r66; + mov.u32 %r68, 0; + setp.eq.s32 %p18, %r67, %r68; + @%p18 bra $Lt_1_30466; + .loc 16 369 0 + cvt.s64.s32 %rd58, %r13; + ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd60, %r13, 4; + add.u64 %rd61, %rd59, %rd60; + ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r70, 0; + setp.le.s32 %p19, %r69, %r70; + @%p19 bra $Lt_1_30978; + .loc 16 371 0 + st.global.f32 [%rd61+0], %f34; + .loc 16 372 0 + cvt.s64.s32 %rd62, %r14; + mul.wide.s32 %rd63, %r14, 4; + add.u64 %rd61, %rd61, %rd63; +$Lt_1_30978: + ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r72, 0; + setp.le.s32 %p20, %r71, %r72; + @%p20 bra $Lt_1_31490; + .loc 16 376 0 + mov.f32 %f100, %f11; + st.global.f32 [%rd61+0], %f100; + .loc 16 377 0 + cvt.s64.s32 %rd64, %r14; + mul.wide.s32 %rd65, %r14, 4; + add.u64 %rd66, %rd65, %rd61; + .loc 16 376 0 + mov.f32 %f101, %f13; + st.global.f32 [%rd66+0], %f101; + .loc 16 377 0 + add.u64 %rd67, %rd65, %rd66; + .loc 16 376 0 + mov.f32 %f102, %f15; + st.global.f32 [%rd67+0], %f102; + .loc 16 377 0 + add.u64 %rd68, %rd65, %rd67; + .loc 16 376 0 + mov.f32 %f103, %f17; + st.global.f32 [%rd68+0], %f103; + .loc 16 377 0 + add.u64 %rd61, %rd65, %rd68; + .loc 16 376 0 + mov.f32 %f104, %f19; + st.global.f32 [%rd61+0], %f104; + mov.f32 %f105, %f21; + add.u64 %rd69, %rd65, %rd61; + st.global.f32 [%rd69+0], %f105; +$Lt_1_31490: + .loc 16 380 0 + ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd71, %rd58, 16; + add.u64 %rd72, %rd70, %rd71; + mov.f32 %f106, %f107; + st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106}; +$Lt_1_30466: + .loc 16 382 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/lj_cut_gpu_ptx.h b/lib/gpu/lj_cut_gpu_ptx.h new file mode 100644 index 000000000..c78094dd8 --- /dev/null +++ b/lib/gpu/lj_cut_gpu_ptx.h @@ -0,0 +1,927 @@ +const char * lj_cut_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<72>;\n" +" .reg .u64 %rd<62>;\n" +" .reg .f32 %f<102>;\n" +" .reg .pred %p<19>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32581_35_non_const_red_acc108[3072];\n" +" .loc 16 88 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 95 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 96 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 97 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 98 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 
[__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 107 0\n" +" mov.f32 %f5, 0f00000000; \n" +" mov.f32 %f6, %f5;\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, %f7;\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_19202;\n" +" .loc 16 113 0\n" +" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd2, %r11;\n" +" mul.wide.s32 %rd3, %r11, 4;\n" +" cvt.s64.s32 %rd4, %r9;\n" +" mul.wide.s32 %rd5, %r9, 4;\n" +" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd7, %rd5, %rd6;\n" +" add.u64 %rd8, %rd3, %rd7;\n" +" ld.global.s32 %r12, [%rd8+0];\n" +" add.u64 %rd9, %rd3, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd6;\n" +" @%p2 bra $Lt_0_19714;\n" +" .loc 16 119 0\n" +" cvt.s32.s64 %r13, %rd2;\n" +" mul.lo.s32 %r14, %r13, %r12;\n" +" cvt.s64.s32 %rd11, %r14;\n" +" mul.wide.s32 %rd12, %r14, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 120 0\n" +" mul.lo.s32 %r15, %r6, %r13;\n" +" cvt.s64.s32 %rd14, %r15;\n" +" mul.wide.s32 %rd15, %r15, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 121 0\n" +" mul.lo.s32 %r16, %r13, %r1;\n" +" bra.uni $Lt_0_19458;\n" +"$Lt_0_19714:\n" +" .loc 16 123 0\n" +" ld.global.s32 %r17, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r17;\n" +" mul.wide.s32 %rd18, %r17, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 124 0\n" +" cvt.s64.s32 %rd20, %r12;\n" +" mul.wide.s32 %rd21, %r12, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 125 0\n" +" mov.s32 %r16, %r1;\n" +" .loc 16 126 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_19458:\n" +" .loc 16 129 0\n" +" ld.global.s32 %r18, [%rd7+0];\n" +" mov.u32 %r19, %r18;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f21, %f17;\n" +" mov.f32 %f22, %f18;\n" +" mov.f32 %f23, %f19;\n" +" mov.f32 %f24, %f20;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_28162;\n" +" cvt.rzi.ftz.s32.f32 %r26, %f24;\n" +" cvt.s64.s32 %rd24, %r16;\n" +" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r28, %r27, %r26;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n" +"$Lt_0_20482:\n" +" .loc 16 135 0\n" +" ld.global.s32 %r29, [%rd16+0];\n" +" .loc 16 136 0\n" +" shr.s32 %r30, %r29, 30;\n" +" and.b32 %r31, %r30, 3;\n" +" cvt.s64.s32 %rd27, %r31;\n" +" mul.wide.s32 %rd28, %r31, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f29, [%rd29+0];\n" +" .loc 16 139 0\n" +" and.b32 %r32, %r29, 1073741823;\n" +" 
mov.u32 %r33, %r32;\n" +" mov.s32 %r34, 0;\n" +" mov.u32 %r35, %r34;\n" +" mov.s32 %r36, 0;\n" +" mov.u32 %r37, %r36;\n" +" mov.s32 %r38, 0;\n" +" mov.u32 %r39, %r38;\n" +" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n" +" mov.f32 %f34, %f30;\n" +" mov.f32 %f35, %f31;\n" +" mov.f32 %f36, %f32;\n" +" mov.f32 %f37, %f33;\n" +" cvt.rzi.ftz.s32.f32 %r40, %f37;\n" +" sub.ftz.f32 %f38, %f22, %f35;\n" +" sub.ftz.f32 %f39, %f21, %f34;\n" +" sub.ftz.f32 %f40, %f23, %f36;\n" +" mul.ftz.f32 %f41, %f38, %f38;\n" +" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" +" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" +" add.s32 %r41, %r40, %r28;\n" +" cvt.s64.s32 %rd30, %r41;\n" +" mul.wide.s32 %rd31, %r41, 16;\n" +" add.u64 %rd32, %rd31, %rd25;\n" +" ld.global.f32 %f44, [%rd32+8];\n" +" setp.gt.ftz.f32 %p4, %f44, %f43;\n" +" @!%p4 bra $Lt_0_21762;\n" +" .loc 16 153 0\n" +" rcp.approx.ftz.f32 %f45, %f43;\n" +" mul.ftz.f32 %f46, %f45, %f45;\n" +" mul.ftz.f32 %f47, %f45, %f46;\n" +" mul.ftz.f32 %f48, %f45, %f47;\n" +" ld.global.v2.f32 {%f49,%f50}, [%rd32+0];\n" +" mul.ftz.f32 %f51, %f49, %f47;\n" +" sub.ftz.f32 %f52, %f51, %f50;\n" +" mul.ftz.f32 %f53, %f48, %f52;\n" +" mul.ftz.f32 %f54, %f29, %f53;\n" +" .loc 16 155 0\n" +" fma.rn.ftz.f32 %f27, %f39, %f54, %f27;\n" +" .loc 16 156 0\n" +" fma.rn.ftz.f32 %f26, %f38, %f54, %f26;\n" +" .loc 16 157 0\n" +" fma.rn.ftz.f32 %f25, %f40, %f54, %f25;\n" +" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r43, 0;\n" +" setp.le.s32 %p5, %r42, %r43;\n" +" @%p5 bra $Lt_0_21250;\n" +" .loc 16 161 0\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd34, %rd33, %rd31;\n" +" ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd34+0];\n" +" mul.ftz.f32 %f58, %f55, %f47;\n" +" sub.ftz.f32 %f59, %f58, %f56;\n" +" mul.ftz.f32 %f60, %f47, %f59;\n" +" sub.ftz.f32 %f61, %f60, %f57;\n" +" fma.rn.ftz.f32 %f28, %f29, %f61, %f28;\n" +"$Lt_0_21250:\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p6, %r44, %r45;\n" +" @%p6 bra $Lt_0_21762;\n" +" .loc 16 164 0\n" +" mov.f32 %f62, %f6;\n" +" mul.ftz.f32 %f63, %f39, %f39;\n" +" fma.rn.ftz.f32 %f64, %f54, %f63, %f62;\n" +" mov.f32 %f6, %f64;\n" +" .loc 16 165 0\n" +" mov.f32 %f65, %f8;\n" +" fma.rn.ftz.f32 %f66, %f54, %f41, %f65;\n" +" mov.f32 %f8, %f66;\n" +" .loc 16 166 0\n" +" mov.f32 %f67, %f10;\n" +" mul.ftz.f32 %f68, %f40, %f40;\n" +" fma.rn.ftz.f32 %f69, %f54, %f68, %f67;\n" +" mov.f32 %f10, %f69;\n" +" .loc 16 167 0\n" +" mov.f32 %f70, %f12;\n" +" mul.ftz.f32 %f71, %f38, %f39;\n" +" fma.rn.ftz.f32 %f72, %f54, %f71, %f70;\n" +" mov.f32 %f12, %f72;\n" +" .loc 16 168 0\n" +" mov.f32 %f73, %f14;\n" +" mul.ftz.f32 %f74, %f39, %f40;\n" +" fma.rn.ftz.f32 %f75, %f54, %f74, %f73;\n" +" mov.f32 %f14, %f75;\n" +" .loc 16 169 0\n" +" mul.ftz.f32 %f76, %f38, %f40;\n" +" fma.rn.ftz.f32 %f15, %f54, %f76, %f15;\n" +" mov.f32 %f16, %f15;\n" +"$Lt_0_21762:\n" +"$Lt_0_20738:\n" +" .loc 16 133 0\n" +" mul.lo.u64 %rd35, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd35;\n" +" setp.lt.u64 %p7, %rd16, %rd13;\n" +" @%p7 bra $Lt_0_20482;\n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_28162:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_19202:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +"$Lt_0_18946:\n" +" mov.u32 %r46, 1;\n" +" setp.le.s32 %p8, %r1, %r46;\n" +" @%p8 
bra $Lt_0_24578;\n" +" .loc 16 180 0\n" +" mov.u64 %rd36, __cuda___cuda_local_var_32581_35_non_const_red_acc108;\n" +" cvt.s64.s32 %rd37, %r2;\n" +" mul.wide.s32 %rd38, %r2, 4;\n" +" add.u64 %rd39, %rd36, %rd38;\n" +" mov.f32 %f77, %f27;\n" +" st.shared.f32 [%rd39+0], %f77;\n" +" .loc 16 181 0\n" +" mov.f32 %f78, %f26;\n" +" st.shared.f32 [%rd39+512], %f78;\n" +" .loc 16 182 0\n" +" mov.f32 %f79, %f25;\n" +" st.shared.f32 [%rd39+1024], %f79;\n" +" .loc 16 183 0\n" +" mov.f32 %f80, %f28;\n" +" st.shared.f32 [%rd39+1536], %f80;\n" +" .loc 16 185 0\n" +" shr.s32 %r47, %r1, 31;\n" +" mov.s32 %r48, 1;\n" +" and.b32 %r49, %r47, %r48;\n" +" add.s32 %r50, %r49, %r1;\n" +" shr.s32 %r51, %r50, 1;\n" +" mov.s32 %r52, %r51;\n" +" mov.u32 %r53, 0;\n" +" setp.ne.u32 %p9, %r51, %r53;\n" +" @!%p9 bra $Lt_0_23042;\n" +"$Lt_0_23554:\n" +" setp.ge.u32 %p10, %r6, %r52;\n" +" @%p10 bra $Lt_0_23810;\n" +" .loc 16 188 0\n" +" add.u32 %r54, %r2, %r52;\n" +" cvt.u64.u32 %rd40, %r54;\n" +" mul.wide.u32 %rd41, %r54, 4;\n" +" add.u64 %rd42, %rd36, %rd41;\n" +" ld.shared.f32 %f81, [%rd42+0];\n" +" add.ftz.f32 %f77, %f81, %f77;\n" +" st.shared.f32 [%rd39+0], %f77;\n" +" ld.shared.f32 %f82, [%rd42+512];\n" +" add.ftz.f32 %f78, %f82, %f78;\n" +" st.shared.f32 [%rd39+512], %f78;\n" +" ld.shared.f32 %f83, [%rd42+1024];\n" +" add.ftz.f32 %f79, %f83, %f79;\n" +" st.shared.f32 [%rd39+1024], %f79;\n" +" ld.shared.f32 %f84, [%rd42+1536];\n" +" add.ftz.f32 %f80, %f84, %f80;\n" +" st.shared.f32 [%rd39+1536], %f80;\n" +"$Lt_0_23810:\n" +" .loc 16 185 0\n" +" shr.u32 %r52, %r52, 1;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p11, %r52, %r55;\n" +" @%p11 bra $Lt_0_23554;\n" +"$Lt_0_23042:\n" +" .loc 16 192 0\n" +" mov.f32 %f27, %f77;\n" +" .loc 16 193 0\n" +" mov.f32 %f26, %f78;\n" +" .loc 16 194 0\n" +" mov.f32 %f25, %f79;\n" +" .loc 16 195 0\n" +" mov.f32 %f28, %f80;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p12, %r56, %r57;\n" +" @%p12 bra $Lt_0_24578;\n" +" .loc 16 199 0\n" +" mov.f32 %f77, %f6;\n" +" st.shared.f32 [%rd39+0], %f77;\n" +" mov.f32 %f78, %f8;\n" +" st.shared.f32 [%rd39+512], %f78;\n" +" mov.f32 %f79, %f10;\n" +" st.shared.f32 [%rd39+1024], %f79;\n" +" mov.f32 %f80, %f12;\n" +" st.shared.f32 [%rd39+1536], %f80;\n" +" mov.f32 %f85, %f14;\n" +" st.shared.f32 [%rd39+2048], %f85;\n" +" mov.f32 %f86, %f16;\n" +" st.shared.f32 [%rd39+2560], %f86;\n" +" .loc 16 201 0\n" +" mov.s32 %r58, %r51;\n" +" @!%p9 bra $Lt_0_25090;\n" +"$Lt_0_25602:\n" +" setp.ge.u32 %p13, %r6, %r58;\n" +" @%p13 bra $Lt_0_25858;\n" +" .loc 16 204 0\n" +" add.u32 %r59, %r2, %r58;\n" +" cvt.u64.u32 %rd43, %r59;\n" +" mul.wide.u32 %rd44, %r59, 4;\n" +" add.u64 %rd45, %rd36, %rd44;\n" +" ld.shared.f32 %f87, [%rd45+0];\n" +" add.ftz.f32 %f77, %f87, %f77;\n" +" st.shared.f32 [%rd39+0], %f77;\n" +" ld.shared.f32 %f88, [%rd45+512];\n" +" add.ftz.f32 %f78, %f88, %f78;\n" +" st.shared.f32 [%rd39+512], %f78;\n" +" ld.shared.f32 %f89, [%rd45+1024];\n" +" add.ftz.f32 %f79, %f89, %f79;\n" +" st.shared.f32 [%rd39+1024], %f79;\n" +" ld.shared.f32 %f90, [%rd45+1536];\n" +" add.ftz.f32 %f80, %f90, %f80;\n" +" st.shared.f32 [%rd39+1536], %f80;\n" +" ld.shared.f32 %f91, [%rd45+2048];\n" +" add.ftz.f32 %f85, %f91, %f85;\n" +" st.shared.f32 [%rd39+2048], %f85;\n" +" ld.shared.f32 %f92, [%rd45+2560];\n" +" add.ftz.f32 %f86, %f92, %f86;\n" +" st.shared.f32 [%rd39+2560], %f86;\n" +"$Lt_0_25858:\n" +" .loc 16 201 0\n" +" shr.u32 %r58, %r58, 1;\n" +" mov.u32 %r60, 0;\n" +" setp.ne.u32 %p14, %r58, %r60;\n" +" @%p14 bra 
$Lt_0_25602;\n" +"$Lt_0_25090:\n" +" .loc 16 209 0\n" +" mov.f32 %f6, %f77;\n" +" mov.f32 %f8, %f78;\n" +" mov.f32 %f10, %f79;\n" +" mov.f32 %f12, %f80;\n" +" mov.f32 %f14, %f85;\n" +" mov.f32 %f16, %f86;\n" +"$Lt_0_24578:\n" +"$Lt_0_22530:\n" +" selp.s32 %r61, 1, 0, %p1;\n" +" mov.s32 %r62, 0;\n" +" set.eq.u32.s32 %r63, %r6, %r62;\n" +" neg.s32 %r64, %r63;\n" +" and.b32 %r65, %r61, %r64;\n" +" mov.u32 %r66, 0;\n" +" setp.eq.s32 %p15, %r65, %r66;\n" +" @%p15 bra $Lt_0_26626;\n" +" .loc 16 215 0\n" +" cvt.s64.s32 %rd46, %r9;\n" +" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd48, %r9, 4;\n" +" add.u64 %rd49, %rd47, %rd48;\n" +" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r68, 0;\n" +" setp.le.s32 %p16, %r67, %r68;\n" +" @%p16 bra $Lt_0_27138;\n" +" .loc 16 217 0\n" +" st.global.f32 [%rd49+0], %f28;\n" +" .loc 16 218 0\n" +" cvt.s64.s32 %rd50, %r10;\n" +" mul.wide.s32 %rd51, %r10, 4;\n" +" add.u64 %rd49, %rd49, %rd51;\n" +"$Lt_0_27138:\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p17, %r69, %r70;\n" +" @%p17 bra $Lt_0_27650;\n" +" .loc 16 222 0\n" +" mov.f32 %f93, %f6;\n" +" st.global.f32 [%rd49+0], %f93;\n" +" .loc 16 223 0\n" +" cvt.s64.s32 %rd52, %r10;\n" +" mul.wide.s32 %rd53, %r10, 4;\n" +" add.u64 %rd54, %rd53, %rd49;\n" +" .loc 16 222 0\n" +" mov.f32 %f94, %f8;\n" +" st.global.f32 [%rd54+0], %f94;\n" +" .loc 16 223 0\n" +" add.u64 %rd55, %rd53, %rd54;\n" +" .loc 16 222 0\n" +" mov.f32 %f95, %f10;\n" +" st.global.f32 [%rd55+0], %f95;\n" +" .loc 16 223 0\n" +" add.u64 %rd56, %rd53, %rd55;\n" +" .loc 16 222 0\n" +" mov.f32 %f96, %f12;\n" +" st.global.f32 [%rd56+0], %f96;\n" +" .loc 16 223 0\n" +" add.u64 %rd49, %rd53, %rd56;\n" +" .loc 16 222 0\n" +" mov.f32 %f97, %f14;\n" +" st.global.f32 [%rd49+0], %f97;\n" +" mov.f32 %f98, %f16;\n" +" add.u64 %rd57, %rd53, %rd49;\n" +" st.global.f32 [%rd57+0], %f98;\n" +"$Lt_0_27650:\n" +" .loc 16 226 0\n" +" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd59, %rd46, 16;\n" +" add.u64 %rd60, %rd58, %rd59;\n" +" mov.f32 %f99, %f100;\n" +" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f99};\n" +"$Lt_0_26626:\n" +" .loc 16 228 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<74>;\n" +" .reg .u64 %rd<74>;\n" +" .reg .f32 %f<109>;\n" +" .reg .pred %p<22>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32647_33_non_const_sp_lj3268[16];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32645_34_non_const_lj13296[1936];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj35232[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32735_35_non_const_red_acc7168[3072];\n" +" .loc 16 236 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" 
mov.u32 %r2, 3;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_21250;\n" +" .loc 16 246 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_21250:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_21762;\n" +" .loc 16 248 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;\n" +" cvt.s64.s32 %rd8, %r1;\n" +" mul.wide.s32 %rd9, %r1, 16;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" add.u64 %rd11, %rd10, %rd9;\n" +" add.u64 %rd12, %rd9, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" +" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r5, 0;\n" +" setp.le.s32 %p3, %r4, %r5;\n" +" @%p3 bra $Lt_1_22274;\n" +" .loc 16 250 0\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd15, %rd14, %rd9;\n" +" add.u64 %rd16, %rd9, %rd13;\n" +" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" +" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" +"$Lt_1_22274:\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;\n" +"$Lt_1_21762:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;\n" +" .loc 16 260 0\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" mov.f32 %f20, 0f00000000; \n" +" mov.f32 %f21, %f20;\n" +" .loc 16 262 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r7, %r1, %r6;\n" +" cvt.s32.u32 %r8, %ntid.x;\n" +" div.s32 %r9, %r8, %r6;\n" +" rem.s32 %r10, %r1, %r6;\n" +" cvt.s32.u32 %r11, %ctaid.x;\n" +" mul.lo.s32 %r12, %r11, %r9;\n" +" add.s32 %r13, %r7, %r12;\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p4, %r13, %r14;\n" +" @!%p4 bra $Lt_1_23042;\n" +" .loc 16 268 0\n" +" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd17, %r15;\n" +" mul.wide.s32 %rd18, %r15, 4;\n" +" cvt.s64.s32 %rd19, %r13;\n" +" mul.wide.s32 %rd20, %r13, 4;\n" +" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd22, %rd20, %rd21;\n" +" add.u64 %rd23, %rd18, %rd22;\n" +" ld.global.s32 %r16, [%rd23+0];\n" +" add.u64 %rd24, %rd18, %rd23;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p5, %rd25, %rd21;\n" +" @%p5 bra $Lt_1_23554;\n" +" .loc 16 274 0\n" +" cvt.s32.s64 %r17, %rd17;\n" +" mul.lo.s32 %r18, %r17, %r16;\n" +" cvt.s64.s32 %rd26, %r18;\n" +" mul.wide.s32 %rd27, %r18, 4;\n" +" add.u64 %rd28, %rd24, %rd27;\n" +" .loc 16 275 0\n" +" mul.lo.s32 %r19, %r10, %r17;\n" +" cvt.s64.s32 %rd29, %r19;\n" +" mul.wide.s32 %rd30, %r19, 4;\n" +" add.u64 %rd31, %rd24, %rd30;\n" +" .loc 16 276 0\n" +" mul.lo.s32 %r20, %r17, %r6;\n" +" bra.uni $Lt_1_23298;\n" 
+"$Lt_1_23554:\n" +" .loc 16 278 0\n" +" ld.global.s32 %r21, [%rd24+0];\n" +" cvt.s64.s32 %rd32, %r21;\n" +" mul.wide.s32 %rd33, %r21, 4;\n" +" add.u64 %rd34, %rd25, %rd33;\n" +" .loc 16 279 0\n" +" cvt.s64.s32 %rd35, %r16;\n" +" mul.wide.s32 %rd36, %r16, 4;\n" +" add.u64 %rd28, %rd34, %rd36;\n" +" .loc 16 280 0\n" +" mov.s32 %r20, %r6;\n" +" .loc 16 281 0\n" +" cvt.s64.s32 %rd37, %r10;\n" +" mul.wide.s32 %rd38, %r10, 4;\n" +" add.u64 %rd31, %rd34, %rd38;\n" +"$Lt_1_23298:\n" +" .loc 16 284 0\n" +" ld.global.s32 %r22, [%rd22+0];\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" mov.f32 %f29, %f25;\n" +" setp.ge.u64 %p6, %rd31, %rd28;\n" +" @%p6 bra $Lt_1_32002;\n" +" cvt.rzi.ftz.s32.f32 %r30, %f29;\n" +" cvt.s64.s32 %rd39, %r20;\n" +" mul.lo.s32 %r31, %r30, 11;\n" +" cvt.rn.f32.s32 %f30, %r31;\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +"$Lt_1_24322:\n" +" .loc 16 291 0\n" +" ld.global.s32 %r32, [%rd31+0];\n" +" .loc 16 292 0\n" +" shr.s32 %r33, %r32, 30;\n" +" and.b32 %r34, %r33, 3;\n" +" cvt.s64.s32 %rd40, %r34;\n" +" mul.wide.s32 %rd41, %r34, 4;\n" +" add.u64 %rd42, %rd1, %rd41;\n" +" ld.shared.f32 %f35, [%rd42+0];\n" +" .loc 16 295 0\n" +" and.b32 %r35, %r32, 1073741823;\n" +" mov.u32 %r36, %r35;\n" +" mov.s32 %r37, 0;\n" +" mov.u32 %r38, %r37;\n" +" mov.s32 %r39, 0;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];\n" +" mov.f32 %f40, %f36;\n" +" mov.f32 %f41, %f37;\n" +" mov.f32 %f42, %f38;\n" +" mov.f32 %f43, %f39;\n" +" sub.ftz.f32 %f44, %f27, %f41;\n" +" sub.ftz.f32 %f45, %f26, %f40;\n" +" sub.ftz.f32 %f46, %f28, %f42;\n" +" mul.ftz.f32 %f47, %f44, %f44;\n" +" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" +" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" +" add.ftz.f32 %f50, %f30, %f43;\n" +" cvt.rzi.ftz.s32.f32 %r43, %f50;\n" +" cvt.s64.s32 %rd43, %r43;\n" +" mul.wide.s32 %rd44, %r43, 16;\n" +" add.u64 %rd45, %rd44, %rd7;\n" +" ld.shared.f32 %f51, [%rd45+8];\n" +" setp.gt.ftz.f32 %p7, %f51, %f49;\n" +" @!%p7 bra $Lt_1_25602;\n" +" .loc 16 307 0\n" +" rcp.approx.ftz.f32 %f52, %f49;\n" +" mul.ftz.f32 %f53, %f52, %f52;\n" +" mul.ftz.f32 %f54, %f52, %f53;\n" +" mul.ftz.f32 %f55, %f52, %f35;\n" +" mul.ftz.f32 %f56, %f54, %f55;\n" +" ld.shared.v2.f32 {%f57,%f58}, [%rd45+0];\n" +" mul.ftz.f32 %f59, %f57, %f54;\n" +" sub.ftz.f32 %f60, %f59, %f58;\n" +" mul.ftz.f32 %f61, %f56, %f60;\n" +" .loc 16 309 0\n" +" fma.rn.ftz.f32 %f33, %f45, %f61, %f33;\n" +" .loc 16 310 0\n" +" fma.rn.ftz.f32 %f32, %f44, %f61, %f32;\n" +" .loc 16 311 0\n" +" fma.rn.ftz.f32 %f31, %f46, %f61, %f31;\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p8, %r44, %r45;\n" +" @%p8 bra $Lt_1_25090;\n" +" .loc 16 314 0\n" +" add.u64 %rd46, %rd44, %rd13;\n" +" ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd46+0];\n" +" mul.ftz.f32 %f65, %f62, %f54;\n" +" sub.ftz.f32 %f66, %f65, %f63;\n" +" mul.ftz.f32 %f67, %f54, %f66;\n" +" .loc 16 315 0\n" +" sub.ftz.f32 %f68, %f67, %f64;\n" +" fma.rn.ftz.f32 %f34, %f35, %f68, %f34;\n" +"$Lt_1_25090:\n" +" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 
%r47, 0;\n" +" setp.le.s32 %p9, %r46, %r47;\n" +" @%p9 bra $Lt_1_25602;\n" +" .loc 16 318 0\n" +" mov.f32 %f69, %f11;\n" +" mul.ftz.f32 %f70, %f45, %f45;\n" +" fma.rn.ftz.f32 %f71, %f61, %f70, %f69;\n" +" mov.f32 %f11, %f71;\n" +" .loc 16 319 0\n" +" mov.f32 %f72, %f13;\n" +" fma.rn.ftz.f32 %f73, %f61, %f47, %f72;\n" +" mov.f32 %f13, %f73;\n" +" .loc 16 320 0\n" +" mov.f32 %f74, %f15;\n" +" mul.ftz.f32 %f75, %f46, %f46;\n" +" fma.rn.ftz.f32 %f76, %f61, %f75, %f74;\n" +" mov.f32 %f15, %f76;\n" +" .loc 16 321 0\n" +" mov.f32 %f77, %f17;\n" +" mul.ftz.f32 %f78, %f44, %f45;\n" +" fma.rn.ftz.f32 %f79, %f61, %f78, %f77;\n" +" mov.f32 %f17, %f79;\n" +" .loc 16 322 0\n" +" mov.f32 %f80, %f19;\n" +" mul.ftz.f32 %f81, %f45, %f46;\n" +" fma.rn.ftz.f32 %f82, %f61, %f81, %f80;\n" +" mov.f32 %f19, %f82;\n" +" .loc 16 323 0\n" +" mul.ftz.f32 %f83, %f44, %f46;\n" +" fma.rn.ftz.f32 %f20, %f61, %f83, %f20;\n" +" mov.f32 %f21, %f20;\n" +"$Lt_1_25602:\n" +"$Lt_1_24578:\n" +" .loc 16 289 0\n" +" mul.lo.u64 %rd47, %rd39, 4;\n" +" add.u64 %rd31, %rd31, %rd47;\n" +" setp.lt.u64 %p10, %rd31, %rd28;\n" +" @%p10 bra $Lt_1_24322;\n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_32002:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_23042:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +"$Lt_1_22786:\n" +" mov.u32 %r48, 1;\n" +" setp.le.s32 %p11, %r6, %r48;\n" +" @%p11 bra $Lt_1_28418;\n" +" .loc 16 334 0\n" +" mov.u64 %rd48, __cuda___cuda_local_var_32735_35_non_const_red_acc7168;\n" +" cvt.s64.s32 %rd49, %r1;\n" +" mul.wide.s32 %rd50, %r1, 4;\n" +" add.u64 %rd51, %rd48, %rd50;\n" +" mov.f32 %f84, %f33;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" .loc 16 335 0\n" +" mov.f32 %f85, %f32;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" .loc 16 336 0\n" +" mov.f32 %f86, %f31;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" .loc 16 337 0\n" +" mov.f32 %f87, %f34;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +" .loc 16 339 0\n" +" shr.s32 %r49, %r6, 31;\n" +" mov.s32 %r50, 1;\n" +" and.b32 %r51, %r49, %r50;\n" +" add.s32 %r52, %r51, %r6;\n" +" shr.s32 %r53, %r52, 1;\n" +" mov.s32 %r54, %r53;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p12, %r53, %r55;\n" +" @!%p12 bra $Lt_1_26882;\n" +"$Lt_1_27394:\n" +" setp.ge.u32 %p13, %r10, %r54;\n" +" @%p13 bra $Lt_1_27650;\n" +" .loc 16 342 0\n" +" add.u32 %r56, %r1, %r54;\n" +" cvt.u64.u32 %rd52, %r56;\n" +" mul.wide.u32 %rd53, %r56, 4;\n" +" add.u64 %rd54, %rd48, %rd53;\n" +" ld.shared.f32 %f88, [%rd54+0];\n" +" add.ftz.f32 %f84, %f88, %f84;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" ld.shared.f32 %f89, [%rd54+512];\n" +" add.ftz.f32 %f85, %f89, %f85;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" ld.shared.f32 %f90, [%rd54+1024];\n" +" add.ftz.f32 %f86, %f90, %f86;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" ld.shared.f32 %f91, [%rd54+1536];\n" +" add.ftz.f32 %f87, %f91, %f87;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +"$Lt_1_27650:\n" +" .loc 16 339 0\n" +" shr.u32 %r54, %r54, 1;\n" +" mov.u32 %r57, 0;\n" +" setp.ne.u32 %p14, %r54, %r57;\n" +" @%p14 bra $Lt_1_27394;\n" +"$Lt_1_26882:\n" +" .loc 16 346 0\n" +" mov.f32 %f33, %f84;\n" +" .loc 16 347 0\n" +" mov.f32 %f32, %f85;\n" +" .loc 16 348 0\n" +" mov.f32 %f31, %f86;\n" +" .loc 16 349 0\n" +" mov.f32 %f34, %f87;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p15, %r58, %r59;\n" +" @%p15 
bra $Lt_1_28418;\n" +" .loc 16 353 0\n" +" mov.f32 %f84, %f11;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" mov.f32 %f85, %f13;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" mov.f32 %f86, %f15;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" mov.f32 %f87, %f17;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +" mov.f32 %f92, %f19;\n" +" st.shared.f32 [%rd51+2048], %f92;\n" +" mov.f32 %f93, %f21;\n" +" st.shared.f32 [%rd51+2560], %f93;\n" +" .loc 16 355 0\n" +" mov.s32 %r60, %r53;\n" +" @!%p12 bra $Lt_1_28930;\n" +"$Lt_1_29442:\n" +" setp.ge.u32 %p16, %r10, %r60;\n" +" @%p16 bra $Lt_1_29698;\n" +" .loc 16 358 0\n" +" add.u32 %r61, %r1, %r60;\n" +" cvt.u64.u32 %rd55, %r61;\n" +" mul.wide.u32 %rd56, %r61, 4;\n" +" add.u64 %rd57, %rd48, %rd56;\n" +" ld.shared.f32 %f94, [%rd57+0];\n" +" add.ftz.f32 %f84, %f94, %f84;\n" +" st.shared.f32 [%rd51+0], %f84;\n" +" ld.shared.f32 %f95, [%rd57+512];\n" +" add.ftz.f32 %f85, %f95, %f85;\n" +" st.shared.f32 [%rd51+512], %f85;\n" +" ld.shared.f32 %f96, [%rd57+1024];\n" +" add.ftz.f32 %f86, %f96, %f86;\n" +" st.shared.f32 [%rd51+1024], %f86;\n" +" ld.shared.f32 %f97, [%rd57+1536];\n" +" add.ftz.f32 %f87, %f97, %f87;\n" +" st.shared.f32 [%rd51+1536], %f87;\n" +" ld.shared.f32 %f98, [%rd57+2048];\n" +" add.ftz.f32 %f92, %f98, %f92;\n" +" st.shared.f32 [%rd51+2048], %f92;\n" +" ld.shared.f32 %f99, [%rd57+2560];\n" +" add.ftz.f32 %f93, %f99, %f93;\n" +" st.shared.f32 [%rd51+2560], %f93;\n" +"$Lt_1_29698:\n" +" .loc 16 355 0\n" +" shr.u32 %r60, %r60, 1;\n" +" mov.u32 %r62, 0;\n" +" setp.ne.u32 %p17, %r60, %r62;\n" +" @%p17 bra $Lt_1_29442;\n" +"$Lt_1_28930:\n" +" .loc 16 363 0\n" +" mov.f32 %f11, %f84;\n" +" mov.f32 %f13, %f85;\n" +" mov.f32 %f15, %f86;\n" +" mov.f32 %f17, %f87;\n" +" mov.f32 %f19, %f92;\n" +" mov.f32 %f21, %f93;\n" +"$Lt_1_28418:\n" +"$Lt_1_26370:\n" +" selp.s32 %r63, 1, 0, %p4;\n" +" mov.s32 %r64, 0;\n" +" set.eq.u32.s32 %r65, %r10, %r64;\n" +" neg.s32 %r66, %r65;\n" +" and.b32 %r67, %r63, %r66;\n" +" mov.u32 %r68, 0;\n" +" setp.eq.s32 %p18, %r67, %r68;\n" +" @%p18 bra $Lt_1_30466;\n" +" .loc 16 369 0\n" +" cvt.s64.s32 %rd58, %r13;\n" +" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd60, %r13, 4;\n" +" add.u64 %rd61, %rd59, %rd60;\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p19, %r69, %r70;\n" +" @%p19 bra $Lt_1_30978;\n" +" .loc 16 371 0\n" +" st.global.f32 [%rd61+0], %f34;\n" +" .loc 16 372 0\n" +" cvt.s64.s32 %rd62, %r14;\n" +" mul.wide.s32 %rd63, %r14, 4;\n" +" add.u64 %rd61, %rd61, %rd63;\n" +"$Lt_1_30978:\n" +" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r72, 0;\n" +" setp.le.s32 %p20, %r71, %r72;\n" +" @%p20 bra $Lt_1_31490;\n" +" .loc 16 376 0\n" +" mov.f32 %f100, %f11;\n" +" st.global.f32 [%rd61+0], %f100;\n" +" .loc 16 377 0\n" +" cvt.s64.s32 %rd64, %r14;\n" +" mul.wide.s32 %rd65, %r14, 4;\n" +" add.u64 %rd66, %rd65, %rd61;\n" +" .loc 16 376 0\n" +" mov.f32 %f101, %f13;\n" +" st.global.f32 [%rd66+0], %f101;\n" +" .loc 16 377 0\n" +" add.u64 %rd67, %rd65, %rd66;\n" +" .loc 16 376 0\n" +" mov.f32 %f102, %f15;\n" +" st.global.f32 [%rd67+0], %f102;\n" +" .loc 16 377 0\n" +" add.u64 %rd68, %rd65, %rd67;\n" +" .loc 16 376 0\n" +" mov.f32 %f103, %f17;\n" +" st.global.f32 [%rd68+0], %f103;\n" +" .loc 16 377 0\n" +" add.u64 %rd61, %rd65, %rd68;\n" +" .loc 16 376 0\n" +" mov.f32 %f104, %f19;\n" +" st.global.f32 [%rd61+0], %f104;\n" +" mov.f32 %f105, %f21;\n" +" add.u64 %rd69, %rd65, %rd61;\n" +" st.global.f32 [%rd69+0], %f105;\n" 
+"$Lt_1_31490:\n" +" .loc 16 380 0\n" +" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd71, %rd58, 16;\n" +" add.u64 %rd72, %rd70, %rd71;\n" +" mov.f32 %f106, %f107;\n" +" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};\n" +"$Lt_1_30466:\n" +" .loc 16 382 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/lj_expand_gpu_kernel.ptx b/lib/gpu/lj_expand_gpu_kernel.ptx new file mode 100644 index 000000000..dbe96f266 --- /dev/null +++ b/lib/gpu/lj_expand_gpu_kernel.ptx @@ -0,0 +1,993 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000be22_00000000-9_lj_expand_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.LdVC9u) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000be22_00000000-8_lj_expand_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "lj_expand_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<72>; + .reg .u64 %rd<62>; + .reg .f32 %f<107>; + .reg 
.pred %p<19>; + .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16]; + .shared .align 4 .b8 __cuda___cuda_local_var_32584_35_non_const_red_acc108[3072]; + // __cuda_local_var_32504_10_non_const_f = 48 + // __cuda_local_var_32508_9_non_const_virial = 16 + .loc 16 88 0 +$LDWbegin_kernel_pair: + .loc 16 95 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 96 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 97 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 98 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; + .loc 16 107 0 + mov.f32 %f5, 0f00000000; // 0 + mov.f32 %f6, %f5; + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, %f7; + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_19202; + .loc 16 113 0 + ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd2, %r11; + mul.wide.s32 %rd3, %r11, 4; + cvt.s64.s32 %rd4, %r9; + mul.wide.s32 %rd5, %r9, 4; + ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd7, %rd5, %rd6; + add.u64 %rd8, %rd3, %rd7; + ld.global.s32 %r12, [%rd8+0]; + add.u64 %rd9, %rd3, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd6; + @%p2 bra $Lt_0_19714; + .loc 16 119 0 + cvt.s32.s64 %r13, %rd2; + mul.lo.s32 %r14, %r13, %r12; + cvt.s64.s32 %rd11, %r14; + mul.wide.s32 %rd12, %r14, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 120 0 + mul.lo.s32 %r15, %r6, %r13; + cvt.s64.s32 %rd14, %r15; + mul.wide.s32 %rd15, %r15, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 121 0 + mul.lo.s32 %r16, %r13, %r1; + bra.uni $Lt_0_19458; +$Lt_0_19714: + .loc 16 123 0 + ld.global.s32 %r17, [%rd9+0]; + cvt.s64.s32 %rd17, %r17; + mul.wide.s32 %rd18, %r17, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 124 0 + cvt.s64.s32 %rd20, %r12; + mul.wide.s32 %rd21, %r12, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 125 0 + mov.s32 %r16, %r1; + .loc 16 126 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_19458: + .loc 16 129 0 + ld.global.s32 %r18, [%rd7+0]; + mov.u32 %r19, %r18; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f21, %f17; + mov.f32 %f22, %f18; + mov.f32 %f23, %f19; + mov.f32 %f24, %f20; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_28162; + cvt.rzi.ftz.s32.f32 %r26, %f24; + cvt.s64.s32 %rd24, %r16; + ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r28, %r27, %r26; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1]; + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92; +$Lt_0_20482: + //<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown + .loc 16 135 0 + ld.global.s32 
%r29, [%rd16+0]; + .loc 16 136 0 + shr.s32 %r30, %r29, 30; + and.b32 %r31, %r30, 3; + cvt.s64.s32 %rd27, %r31; + mul.wide.s32 %rd28, %r31, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f29, [%rd29+0]; + .loc 16 139 0 + and.b32 %r32, %r29, 1073741823; + mov.u32 %r33, %r32; + mov.s32 %r34, 0; + mov.u32 %r35, %r34; + mov.s32 %r36, 0; + mov.u32 %r37, %r36; + mov.s32 %r38, 0; + mov.u32 %r39, %r38; + tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}]; + mov.f32 %f34, %f30; + mov.f32 %f35, %f31; + mov.f32 %f36, %f32; + mov.f32 %f37, %f33; + cvt.rzi.ftz.s32.f32 %r40, %f37; + sub.ftz.f32 %f38, %f22, %f35; + sub.ftz.f32 %f39, %f21, %f34; + sub.ftz.f32 %f40, %f23, %f36; + mul.ftz.f32 %f41, %f38, %f38; + fma.rn.ftz.f32 %f42, %f39, %f39, %f41; + fma.rn.ftz.f32 %f43, %f40, %f40, %f42; + add.s32 %r41, %r40, %r28; + cvt.s64.s32 %rd30, %r41; + mul.wide.s32 %rd31, %r41, 16; + add.u64 %rd32, %rd31, %rd25; + ld.global.f32 %f44, [%rd32+8]; + setp.gt.ftz.f32 %p4, %f44, %f43; + @!%p4 bra $Lt_0_21762; + .loc 16 151 0 + sqrt.approx.ftz.f32 %f45, %f43; + ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd32+0]; + sub.ftz.f32 %f49, %f45, %f48; + .loc 16 156 0 + mul.ftz.f32 %f50, %f49, %f49; + rcp.approx.ftz.f32 %f51, %f50; + mul.ftz.f32 %f52, %f51, %f51; + mul.ftz.f32 %f53, %f51, %f52; + div.approx.ftz.f32 %f54, %f29, %f49; + div.approx.ftz.f32 %f55, %f54, %f45; + mul.ftz.f32 %f56, %f46, %f53; + sub.ftz.f32 %f57, %f56, %f47; + mul.ftz.f32 %f58, %f53, %f57; + mul.ftz.f32 %f59, %f55, %f58; + .loc 16 158 0 + fma.rn.ftz.f32 %f27, %f39, %f59, %f27; + .loc 16 159 0 + fma.rn.ftz.f32 %f26, %f38, %f59, %f26; + .loc 16 160 0 + fma.rn.ftz.f32 %f25, %f40, %f59, %f25; + ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r43, 0; + setp.le.s32 %p5, %r42, %r43; + @%p5 bra $Lt_0_21250; + .loc 16 164 0 + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd34, %rd33, %rd31; + ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd34+0]; + mul.ftz.f32 %f63, %f60, %f53; + sub.ftz.f32 %f64, %f63, %f61; + mul.ftz.f32 %f65, %f53, %f64; + sub.ftz.f32 %f66, %f65, %f62; + fma.rn.ftz.f32 %f28, %f29, %f66, %f28; +$Lt_0_21250: + ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r45, 0; + setp.le.s32 %p6, %r44, %r45; + @%p6 bra $Lt_0_21762; + .loc 16 167 0 + mov.f32 %f67, %f6; + mul.ftz.f32 %f68, %f39, %f39; + fma.rn.ftz.f32 %f69, %f59, %f68, %f67; + mov.f32 %f6, %f69; + .loc 16 168 0 + mov.f32 %f70, %f8; + fma.rn.ftz.f32 %f71, %f59, %f41, %f70; + mov.f32 %f8, %f71; + .loc 16 169 0 + mov.f32 %f72, %f10; + mul.ftz.f32 %f73, %f40, %f40; + fma.rn.ftz.f32 %f74, %f59, %f73, %f72; + mov.f32 %f10, %f74; + .loc 16 170 0 + mov.f32 %f75, %f12; + mul.ftz.f32 %f76, %f38, %f39; + fma.rn.ftz.f32 %f77, %f59, %f76, %f75; + mov.f32 %f12, %f77; + .loc 16 171 0 + mov.f32 %f78, %f14; + mul.ftz.f32 %f79, %f39, %f40; + fma.rn.ftz.f32 %f80, %f59, %f79, %f78; + mov.f32 %f14, %f80; + .loc 16 172 0 + mul.ftz.f32 %f81, %f38, %f40; + fma.rn.ftz.f32 %f15, %f59, %f81, %f15; + mov.f32 %f16, %f15; +$Lt_0_21762: +$Lt_0_20738: + .loc 16 133 0 + mul.lo.u64 %rd35, %rd24, 4; + add.u64 %rd16, %rd16, %rd35; + setp.lt.u64 %p7, %rd16, %rd13; + @%p7 bra $Lt_0_20482; + bra.uni $Lt_0_18946; +$Lt_0_28162: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + bra.uni $Lt_0_18946; +$Lt_0_19202: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 +$Lt_0_18946: + mov.u32 %r46, 1; + setp.le.s32 %p8, 
%r1, %r46; + @%p8 bra $Lt_0_24578; + .loc 16 183 0 + mov.u64 %rd36, __cuda___cuda_local_var_32584_35_non_const_red_acc108; + cvt.s64.s32 %rd37, %r2; + mul.wide.s32 %rd38, %r2, 4; + add.u64 %rd39, %rd36, %rd38; + mov.f32 %f82, %f27; + st.shared.f32 [%rd39+0], %f82; + .loc 16 184 0 + mov.f32 %f83, %f26; + st.shared.f32 [%rd39+512], %f83; + .loc 16 185 0 + mov.f32 %f84, %f25; + st.shared.f32 [%rd39+1024], %f84; + .loc 16 186 0 + mov.f32 %f85, %f28; + st.shared.f32 [%rd39+1536], %f85; + .loc 16 188 0 + shr.s32 %r47, %r1, 31; + mov.s32 %r48, 1; + and.b32 %r49, %r47, %r48; + add.s32 %r50, %r49, %r1; + shr.s32 %r51, %r50, 1; + mov.s32 %r52, %r51; + mov.u32 %r53, 0; + setp.ne.u32 %p9, %r51, %r53; + @!%p9 bra $Lt_0_23042; +$Lt_0_23554: + setp.ge.u32 %p10, %r6, %r52; + @%p10 bra $Lt_0_23810; + .loc 16 191 0 + add.u32 %r54, %r2, %r52; + cvt.u64.u32 %rd40, %r54; + mul.wide.u32 %rd41, %r54, 4; + add.u64 %rd42, %rd36, %rd41; + ld.shared.f32 %f86, [%rd42+0]; + add.ftz.f32 %f82, %f86, %f82; + st.shared.f32 [%rd39+0], %f82; + ld.shared.f32 %f87, [%rd42+512]; + add.ftz.f32 %f83, %f87, %f83; + st.shared.f32 [%rd39+512], %f83; + ld.shared.f32 %f88, [%rd42+1024]; + add.ftz.f32 %f84, %f88, %f84; + st.shared.f32 [%rd39+1024], %f84; + ld.shared.f32 %f89, [%rd42+1536]; + add.ftz.f32 %f85, %f89, %f85; + st.shared.f32 [%rd39+1536], %f85; +$Lt_0_23810: + .loc 16 188 0 + shr.u32 %r52, %r52, 1; + mov.u32 %r55, 0; + setp.ne.u32 %p11, %r52, %r55; + @%p11 bra $Lt_0_23554; +$Lt_0_23042: + .loc 16 195 0 + mov.f32 %f27, %f82; + .loc 16 196 0 + mov.f32 %f26, %f83; + .loc 16 197 0 + mov.f32 %f25, %f84; + .loc 16 198 0 + mov.f32 %f28, %f85; + ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r57, 0; + setp.le.s32 %p12, %r56, %r57; + @%p12 bra $Lt_0_24578; + .loc 16 202 0 + mov.f32 %f82, %f6; + st.shared.f32 [%rd39+0], %f82; + mov.f32 %f83, %f8; + st.shared.f32 [%rd39+512], %f83; + mov.f32 %f84, %f10; + st.shared.f32 [%rd39+1024], %f84; + mov.f32 %f85, %f12; + st.shared.f32 [%rd39+1536], %f85; + mov.f32 %f90, %f14; + st.shared.f32 [%rd39+2048], %f90; + mov.f32 %f91, %f16; + st.shared.f32 [%rd39+2560], %f91; + .loc 16 204 0 + mov.s32 %r58, %r51; + @!%p9 bra $Lt_0_25090; +$Lt_0_25602: + setp.ge.u32 %p13, %r6, %r58; + @%p13 bra $Lt_0_25858; + .loc 16 207 0 + add.u32 %r59, %r2, %r58; + cvt.u64.u32 %rd43, %r59; + mul.wide.u32 %rd44, %r59, 4; + add.u64 %rd45, %rd36, %rd44; + ld.shared.f32 %f92, [%rd45+0]; + add.ftz.f32 %f82, %f92, %f82; + st.shared.f32 [%rd39+0], %f82; + ld.shared.f32 %f93, [%rd45+512]; + add.ftz.f32 %f83, %f93, %f83; + st.shared.f32 [%rd39+512], %f83; + ld.shared.f32 %f94, [%rd45+1024]; + add.ftz.f32 %f84, %f94, %f84; + st.shared.f32 [%rd39+1024], %f84; + ld.shared.f32 %f95, [%rd45+1536]; + add.ftz.f32 %f85, %f95, %f85; + st.shared.f32 [%rd39+1536], %f85; + ld.shared.f32 %f96, [%rd45+2048]; + add.ftz.f32 %f90, %f96, %f90; + st.shared.f32 [%rd39+2048], %f90; + ld.shared.f32 %f97, [%rd45+2560]; + add.ftz.f32 %f91, %f97, %f91; + st.shared.f32 [%rd39+2560], %f91; +$Lt_0_25858: + .loc 16 204 0 + shr.u32 %r58, %r58, 1; + mov.u32 %r60, 0; + setp.ne.u32 %p14, %r58, %r60; + @%p14 bra $Lt_0_25602; +$Lt_0_25090: + .loc 16 212 0 + mov.f32 %f6, %f82; + mov.f32 %f8, %f83; + mov.f32 %f10, %f84; + mov.f32 %f12, %f85; + mov.f32 %f14, %f90; + mov.f32 %f16, %f91; +$Lt_0_24578: +$Lt_0_22530: + selp.s32 %r61, 1, 0, %p1; + mov.s32 %r62, 0; + set.eq.u32.s32 %r63, %r6, %r62; + neg.s32 %r64, %r63; + and.b32 %r65, %r61, %r64; + mov.u32 %r66, 0; + setp.eq.s32 %p15, %r65, %r66; + @%p15 bra $Lt_0_26626; + .loc 16 218 0 + cvt.s64.s32 
%rd46, %r9; + ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd48, %r9, 4; + add.u64 %rd49, %rd47, %rd48; + ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r68, 0; + setp.le.s32 %p16, %r67, %r68; + @%p16 bra $Lt_0_27138; + .loc 16 220 0 + st.global.f32 [%rd49+0], %f28; + .loc 16 221 0 + cvt.s64.s32 %rd50, %r10; + mul.wide.s32 %rd51, %r10, 4; + add.u64 %rd49, %rd49, %rd51; +$Lt_0_27138: + ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r70, 0; + setp.le.s32 %p17, %r69, %r70; + @%p17 bra $Lt_0_27650; + .loc 16 225 0 + mov.f32 %f98, %f6; + st.global.f32 [%rd49+0], %f98; + .loc 16 226 0 + cvt.s64.s32 %rd52, %r10; + mul.wide.s32 %rd53, %r10, 4; + add.u64 %rd54, %rd53, %rd49; + .loc 16 225 0 + mov.f32 %f99, %f8; + st.global.f32 [%rd54+0], %f99; + .loc 16 226 0 + add.u64 %rd55, %rd53, %rd54; + .loc 16 225 0 + mov.f32 %f100, %f10; + st.global.f32 [%rd55+0], %f100; + .loc 16 226 0 + add.u64 %rd56, %rd53, %rd55; + .loc 16 225 0 + mov.f32 %f101, %f12; + st.global.f32 [%rd56+0], %f101; + .loc 16 226 0 + add.u64 %rd49, %rd53, %rd56; + .loc 16 225 0 + mov.f32 %f102, %f14; + st.global.f32 [%rd49+0], %f102; + mov.f32 %f103, %f16; + add.u64 %rd57, %rd53, %rd49; + st.global.f32 [%rd57+0], %f103; +$Lt_0_27650: + .loc 16 229 0 + ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd59, %rd46, 16; + add.u64 %rd60, %rd58, %rd59; + mov.f32 %f104, %f105; + st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f104}; +$Lt_0_26626: + .loc 16 231 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<74>; + .reg .u64 %rd<74>; + .reg .f32 %f<114>; + .reg .f64 %fd<4>; + .reg .pred %p<22>; + .shared .align 4 .b8 __cuda___cuda_local_var_32650_33_non_const_sp_lj3268[16]; + .shared .align 16 .b8 __cuda___cuda_local_var_32648_34_non_const_lj13296[1936]; + .shared .align 16 .b8 __cuda___cuda_local_var_32649_34_non_const_lj35232[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32742_35_non_const_red_acc7168[3072]; + // __cuda_local_var_32660_10_non_const_f = 48 + // __cuda_local_var_32664_9_non_const_virial = 16 + .loc 16 239 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 3; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_21250; + .loc 16 249 0 + mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_21250: + mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_21762; + .loc 16 251 0 + mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296; + cvt.s64.s32 %rd8, %r1; + mul.wide.s32 %rd9, %r1, 16; + ld.param.u64 %rd10, 
[__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd11, %rd10, %rd9; + add.u64 %rd12, %rd9, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; + st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r5, 0; + setp.le.s32 %p3, %r4, %r5; + @%p3 bra $Lt_1_22274; + .loc 16 253 0 + mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232; + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd15, %rd14, %rd9; + add.u64 %rd16, %rd9, %rd13; + ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; + st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; +$Lt_1_22274: + mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232; +$Lt_1_21762: + mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296; + mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232; + .loc 16 263 0 + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + mov.f32 %f20, 0f00000000; // 0 + mov.f32 %f21, %f20; + .loc 16 265 0 + bar.sync 0; + ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r7, %r1, %r6; + cvt.s32.u32 %r8, %ntid.x; + div.s32 %r9, %r8, %r6; + rem.s32 %r10, %r1, %r6; + cvt.s32.u32 %r11, %ctaid.x; + mul.lo.s32 %r12, %r11, %r9; + add.s32 %r13, %r7, %r12; + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p4, %r13, %r14; + @!%p4 bra $Lt_1_23042; + .loc 16 271 0 + ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd17, %r15; + mul.wide.s32 %rd18, %r15, 4; + cvt.s64.s32 %rd19, %r13; + mul.wide.s32 %rd20, %r13, 4; + ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd22, %rd20, %rd21; + add.u64 %rd23, %rd18, %rd22; + ld.global.s32 %r16, [%rd23+0]; + add.u64 %rd24, %rd18, %rd23; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p5, %rd25, %rd21; + @%p5 bra $Lt_1_23554; + .loc 16 277 0 + cvt.s32.s64 %r17, %rd17; + mul.lo.s32 %r18, %r17, %r16; + cvt.s64.s32 %rd26, %r18; + mul.wide.s32 %rd27, %r18, 4; + add.u64 %rd28, %rd24, %rd27; + .loc 16 278 0 + mul.lo.s32 %r19, %r10, %r17; + cvt.s64.s32 %rd29, %r19; + mul.wide.s32 %rd30, %r19, 4; + add.u64 %rd31, %rd24, %rd30; + .loc 16 279 0 + mul.lo.s32 %r20, %r17, %r6; + bra.uni $Lt_1_23298; +$Lt_1_23554: + .loc 16 281 0 + ld.global.s32 %r21, [%rd24+0]; + cvt.s64.s32 %rd32, %r21; + mul.wide.s32 %rd33, %r21, 4; + add.u64 %rd34, %rd25, %rd33; + .loc 16 282 0 + cvt.s64.s32 %rd35, %r16; + mul.wide.s32 %rd36, %r16, 4; + add.u64 %rd28, %rd34, %rd36; + .loc 16 283 0 + mov.s32 %r20, %r6; + .loc 16 284 0 + cvt.s64.s32 %rd37, %r10; + mul.wide.s32 %rd38, %r10, 4; + add.u64 %rd31, %rd34, %rd38; +$Lt_1_23298: + .loc 16 287 0 + ld.global.s32 %r22, [%rd22+0]; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + mov.s32 %r28, 0; + mov.u32 %r29, %r28; + tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}]; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + mov.f32 %f29, %f25; + setp.ge.u64 %p6, %rd31, %rd28; + @%p6 bra $Lt_1_32002; + cvt.rzi.ftz.s32.f32 %r30, %f29; + cvt.s64.s32 %rd39, %r20; + mul.lo.s32 %r31, %r30, 11; + cvt.rn.f32.s32 %f30, %r31; + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 
+$Lt_1_24322: + //<loop> Loop body line 287, nesting depth: 1, estimated iterations: unknown + .loc 16 294 0 + ld.global.s32 %r32, [%rd31+0]; + .loc 16 295 0 + shr.s32 %r33, %r32, 30; + and.b32 %r34, %r33, 3; + cvt.s64.s32 %rd40, %r34; + mul.wide.s32 %rd41, %r34, 4; + add.u64 %rd42, %rd1, %rd41; + ld.shared.f32 %f35, [%rd42+0]; + .loc 16 298 0 + and.b32 %r35, %r32, 1073741823; + mov.u32 %r36, %r35; + mov.s32 %r37, 0; + mov.u32 %r38, %r37; + mov.s32 %r39, 0; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}]; + mov.f32 %f40, %f36; + mov.f32 %f41, %f37; + mov.f32 %f42, %f38; + mov.f32 %f43, %f39; + sub.ftz.f32 %f44, %f27, %f41; + sub.ftz.f32 %f45, %f26, %f40; + sub.ftz.f32 %f46, %f28, %f42; + mul.ftz.f32 %f47, %f44, %f44; + fma.rn.ftz.f32 %f48, %f45, %f45, %f47; + fma.rn.ftz.f32 %f49, %f46, %f46, %f48; + add.ftz.f32 %f50, %f30, %f43; + cvt.rzi.ftz.s32.f32 %r43, %f50; + cvt.s64.s32 %rd43, %r43; + mul.wide.s32 %rd44, %r43, 16; + add.u64 %rd45, %rd44, %rd7; + ld.shared.f32 %f51, [%rd45+8]; + setp.gt.ftz.f32 %p7, %f51, %f49; + @!%p7 bra $Lt_1_25602; + .loc 16 309 0 + sqrt.approx.ftz.f32 %f52, %f49; + ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd45+0]; + sub.ftz.f32 %f56, %f52, %f55; + .loc 16 313 0 + mul.ftz.f32 %f57, %f56, %f56; + cvt.ftz.f64.f32 %fd1, %f57; + rcp.rn.f64 %fd2, %fd1; + cvt.rn.ftz.f32.f64 %f58, %fd2; + mul.ftz.f32 %f59, %f58, %f58; + mul.ftz.f32 %f60, %f58, %f59; + mul.ftz.f32 %f61, %f53, %f60; + sub.ftz.f32 %f62, %f61, %f54; + mul.ftz.f32 %f63, %f60, %f62; + .loc 16 314 0 + div.approx.ftz.f32 %f64, %f35, %f56; + div.approx.ftz.f32 %f65, %f64, %f52; + mul.ftz.f32 %f66, %f63, %f65; + .loc 16 316 0 + fma.rn.ftz.f32 %f33, %f45, %f66, %f33; + .loc 16 317 0 + fma.rn.ftz.f32 %f32, %f44, %f66, %f32; + .loc 16 318 0 + fma.rn.ftz.f32 %f31, %f46, %f66, %f31; + ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r45, 0; + setp.le.s32 %p8, %r44, %r45; + @%p8 bra $Lt_1_25090; + .loc 16 321 0 + add.u64 %rd46, %rd44, %rd13; + ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd46+0]; + mul.ftz.f32 %f70, %f67, %f60; + sub.ftz.f32 %f71, %f70, %f68; + mul.ftz.f32 %f72, %f60, %f71; + .loc 16 322 0 + sub.ftz.f32 %f73, %f72, %f69; + fma.rn.ftz.f32 %f34, %f35, %f73, %f34; +$Lt_1_25090: + ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r47, 0; + setp.le.s32 %p9, %r46, %r47; + @%p9 bra $Lt_1_25602; + .loc 16 325 0 + mov.f32 %f74, %f11; + mul.ftz.f32 %f75, %f45, %f45; + fma.rn.ftz.f32 %f76, %f66, %f75, %f74; + mov.f32 %f11, %f76; + .loc 16 326 0 + mov.f32 %f77, %f13; + fma.rn.ftz.f32 %f78, %f66, %f47, %f77; + mov.f32 %f13, %f78; + .loc 16 327 0 + mov.f32 %f79, %f15; + mul.ftz.f32 %f80, %f46, %f46; + fma.rn.ftz.f32 %f81, %f66, %f80, %f79; + mov.f32 %f15, %f81; + .loc 16 328 0 + mov.f32 %f82, %f17; + mul.ftz.f32 %f83, %f44, %f45; + fma.rn.ftz.f32 %f84, %f66, %f83, %f82; + mov.f32 %f17, %f84; + .loc 16 329 0 + mov.f32 %f85, %f19; + mul.ftz.f32 %f86, %f45, %f46; + fma.rn.ftz.f32 %f87, %f66, %f86, %f85; + mov.f32 %f19, %f87; + .loc 16 330 0 + mul.ftz.f32 %f88, %f44, %f46; + fma.rn.ftz.f32 %f20, %f66, %f88, %f20; + mov.f32 %f21, %f20; +$Lt_1_25602: +$Lt_1_24578: + .loc 16 292 0 + mul.lo.u64 %rd47, %rd39, 4; + add.u64 %rd31, %rd31, %rd47; + setp.lt.u64 %p10, %rd31, %rd28; + @%p10 bra $Lt_1_24322; + bra.uni $Lt_1_22786; +$Lt_1_32002: + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 + bra.uni $Lt_1_22786; +$Lt_1_23042: + mov.f32 
%f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 +$Lt_1_22786: + mov.u32 %r48, 1; + setp.le.s32 %p11, %r6, %r48; + @%p11 bra $Lt_1_28418; + .loc 16 341 0 + mov.u64 %rd48, __cuda___cuda_local_var_32742_35_non_const_red_acc7168; + cvt.s64.s32 %rd49, %r1; + mul.wide.s32 %rd50, %r1, 4; + add.u64 %rd51, %rd48, %rd50; + mov.f32 %f89, %f33; + st.shared.f32 [%rd51+0], %f89; + .loc 16 342 0 + mov.f32 %f90, %f32; + st.shared.f32 [%rd51+512], %f90; + .loc 16 343 0 + mov.f32 %f91, %f31; + st.shared.f32 [%rd51+1024], %f91; + .loc 16 344 0 + mov.f32 %f92, %f34; + st.shared.f32 [%rd51+1536], %f92; + .loc 16 346 0 + shr.s32 %r49, %r6, 31; + mov.s32 %r50, 1; + and.b32 %r51, %r49, %r50; + add.s32 %r52, %r51, %r6; + shr.s32 %r53, %r52, 1; + mov.s32 %r54, %r53; + mov.u32 %r55, 0; + setp.ne.u32 %p12, %r53, %r55; + @!%p12 bra $Lt_1_26882; +$Lt_1_27394: + setp.ge.u32 %p13, %r10, %r54; + @%p13 bra $Lt_1_27650; + .loc 16 349 0 + add.u32 %r56, %r1, %r54; + cvt.u64.u32 %rd52, %r56; + mul.wide.u32 %rd53, %r56, 4; + add.u64 %rd54, %rd48, %rd53; + ld.shared.f32 %f93, [%rd54+0]; + add.ftz.f32 %f89, %f93, %f89; + st.shared.f32 [%rd51+0], %f89; + ld.shared.f32 %f94, [%rd54+512]; + add.ftz.f32 %f90, %f94, %f90; + st.shared.f32 [%rd51+512], %f90; + ld.shared.f32 %f95, [%rd54+1024]; + add.ftz.f32 %f91, %f95, %f91; + st.shared.f32 [%rd51+1024], %f91; + ld.shared.f32 %f96, [%rd54+1536]; + add.ftz.f32 %f92, %f96, %f92; + st.shared.f32 [%rd51+1536], %f92; +$Lt_1_27650: + .loc 16 346 0 + shr.u32 %r54, %r54, 1; + mov.u32 %r57, 0; + setp.ne.u32 %p14, %r54, %r57; + @%p14 bra $Lt_1_27394; +$Lt_1_26882: + .loc 16 353 0 + mov.f32 %f33, %f89; + .loc 16 354 0 + mov.f32 %f32, %f90; + .loc 16 355 0 + mov.f32 %f31, %f91; + .loc 16 356 0 + mov.f32 %f34, %f92; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p15, %r58, %r59; + @%p15 bra $Lt_1_28418; + .loc 16 360 0 + mov.f32 %f89, %f11; + st.shared.f32 [%rd51+0], %f89; + mov.f32 %f90, %f13; + st.shared.f32 [%rd51+512], %f90; + mov.f32 %f91, %f15; + st.shared.f32 [%rd51+1024], %f91; + mov.f32 %f92, %f17; + st.shared.f32 [%rd51+1536], %f92; + mov.f32 %f97, %f19; + st.shared.f32 [%rd51+2048], %f97; + mov.f32 %f98, %f21; + st.shared.f32 [%rd51+2560], %f98; + .loc 16 362 0 + mov.s32 %r60, %r53; + @!%p12 bra $Lt_1_28930; +$Lt_1_29442: + setp.ge.u32 %p16, %r10, %r60; + @%p16 bra $Lt_1_29698; + .loc 16 365 0 + add.u32 %r61, %r1, %r60; + cvt.u64.u32 %rd55, %r61; + mul.wide.u32 %rd56, %r61, 4; + add.u64 %rd57, %rd48, %rd56; + ld.shared.f32 %f99, [%rd57+0]; + add.ftz.f32 %f89, %f99, %f89; + st.shared.f32 [%rd51+0], %f89; + ld.shared.f32 %f100, [%rd57+512]; + add.ftz.f32 %f90, %f100, %f90; + st.shared.f32 [%rd51+512], %f90; + ld.shared.f32 %f101, [%rd57+1024]; + add.ftz.f32 %f91, %f101, %f91; + st.shared.f32 [%rd51+1024], %f91; + ld.shared.f32 %f102, [%rd57+1536]; + add.ftz.f32 %f92, %f102, %f92; + st.shared.f32 [%rd51+1536], %f92; + ld.shared.f32 %f103, [%rd57+2048]; + add.ftz.f32 %f97, %f103, %f97; + st.shared.f32 [%rd51+2048], %f97; + ld.shared.f32 %f104, [%rd57+2560]; + add.ftz.f32 %f98, %f104, %f98; + st.shared.f32 [%rd51+2560], %f98; +$Lt_1_29698: + .loc 16 362 0 + shr.u32 %r60, %r60, 1; + mov.u32 %r62, 0; + setp.ne.u32 %p17, %r60, %r62; + @%p17 bra $Lt_1_29442; +$Lt_1_28930: + .loc 16 370 0 + mov.f32 %f11, %f89; + mov.f32 %f13, %f90; + mov.f32 %f15, %f91; + mov.f32 %f17, %f92; + mov.f32 %f19, %f97; + mov.f32 %f21, %f98; +$Lt_1_28418: +$Lt_1_26370: + selp.s32 %r63, 1, 0, %p4; + 
mov.s32 %r64, 0; + set.eq.u32.s32 %r65, %r10, %r64; + neg.s32 %r66, %r65; + and.b32 %r67, %r63, %r66; + mov.u32 %r68, 0; + setp.eq.s32 %p18, %r67, %r68; + @%p18 bra $Lt_1_30466; + .loc 16 376 0 + cvt.s64.s32 %rd58, %r13; + ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd60, %r13, 4; + add.u64 %rd61, %rd59, %rd60; + ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r70, 0; + setp.le.s32 %p19, %r69, %r70; + @%p19 bra $Lt_1_30978; + .loc 16 378 0 + st.global.f32 [%rd61+0], %f34; + .loc 16 379 0 + cvt.s64.s32 %rd62, %r14; + mul.wide.s32 %rd63, %r14, 4; + add.u64 %rd61, %rd61, %rd63; +$Lt_1_30978: + ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r72, 0; + setp.le.s32 %p20, %r71, %r72; + @%p20 bra $Lt_1_31490; + .loc 16 383 0 + mov.f32 %f105, %f11; + st.global.f32 [%rd61+0], %f105; + .loc 16 384 0 + cvt.s64.s32 %rd64, %r14; + mul.wide.s32 %rd65, %r14, 4; + add.u64 %rd66, %rd65, %rd61; + .loc 16 383 0 + mov.f32 %f106, %f13; + st.global.f32 [%rd66+0], %f106; + .loc 16 384 0 + add.u64 %rd67, %rd65, %rd66; + .loc 16 383 0 + mov.f32 %f107, %f15; + st.global.f32 [%rd67+0], %f107; + .loc 16 384 0 + add.u64 %rd68, %rd65, %rd67; + .loc 16 383 0 + mov.f32 %f108, %f17; + st.global.f32 [%rd68+0], %f108; + .loc 16 384 0 + add.u64 %rd61, %rd65, %rd68; + .loc 16 383 0 + mov.f32 %f109, %f19; + st.global.f32 [%rd61+0], %f109; + mov.f32 %f110, %f21; + add.u64 %rd69, %rd65, %rd61; + st.global.f32 [%rd69+0], %f110; +$Lt_1_31490: + .loc 16 387 0 + ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd71, %rd58, 16; + add.u64 %rd72, %rd70, %rd71; + mov.f32 %f111, %f112; + st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f111}; +$Lt_1_30466: + .loc 16 389 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/lj_expand_gpu_ptx.h b/lib/gpu/lj_expand_gpu_ptx.h new file mode 100644 index 000000000..b8d650f2b --- /dev/null +++ b/lib/gpu/lj_expand_gpu_ptx.h @@ -0,0 +1,941 @@ +const char * lj_expand_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<72>;\n" +" .reg .u64 %rd<62>;\n" +" .reg .f32 %f<107>;\n" +" .reg .pred %p<19>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32584_35_non_const_red_acc108[3072];\n" +" .loc 16 88 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 95 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 96 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 97 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 98 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 107 0\n" +" mov.f32 
%f5, 0f00000000; \n" +" mov.f32 %f6, %f5;\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, %f7;\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_19202;\n" +" .loc 16 113 0\n" +" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd2, %r11;\n" +" mul.wide.s32 %rd3, %r11, 4;\n" +" cvt.s64.s32 %rd4, %r9;\n" +" mul.wide.s32 %rd5, %r9, 4;\n" +" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd7, %rd5, %rd6;\n" +" add.u64 %rd8, %rd3, %rd7;\n" +" ld.global.s32 %r12, [%rd8+0];\n" +" add.u64 %rd9, %rd3, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd6;\n" +" @%p2 bra $Lt_0_19714;\n" +" .loc 16 119 0\n" +" cvt.s32.s64 %r13, %rd2;\n" +" mul.lo.s32 %r14, %r13, %r12;\n" +" cvt.s64.s32 %rd11, %r14;\n" +" mul.wide.s32 %rd12, %r14, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 120 0\n" +" mul.lo.s32 %r15, %r6, %r13;\n" +" cvt.s64.s32 %rd14, %r15;\n" +" mul.wide.s32 %rd15, %r15, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 121 0\n" +" mul.lo.s32 %r16, %r13, %r1;\n" +" bra.uni $Lt_0_19458;\n" +"$Lt_0_19714:\n" +" .loc 16 123 0\n" +" ld.global.s32 %r17, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r17;\n" +" mul.wide.s32 %rd18, %r17, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 124 0\n" +" cvt.s64.s32 %rd20, %r12;\n" +" mul.wide.s32 %rd21, %r12, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 125 0\n" +" mov.s32 %r16, %r1;\n" +" .loc 16 126 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_19458:\n" +" .loc 16 129 0\n" +" ld.global.s32 %r18, [%rd7+0];\n" +" mov.u32 %r19, %r18;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f21, %f17;\n" +" mov.f32 %f22, %f18;\n" +" mov.f32 %f23, %f19;\n" +" mov.f32 %f24, %f20;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_28162;\n" +" cvt.rzi.ftz.s32.f32 %r26, %f24;\n" +" cvt.s64.s32 %rd24, %r16;\n" +" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r28, %r27, %r26;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n" +"$Lt_0_20482:\n" +" .loc 16 135 0\n" +" ld.global.s32 %r29, [%rd16+0];\n" +" .loc 16 136 0\n" +" shr.s32 %r30, %r29, 30;\n" +" and.b32 %r31, %r30, 3;\n" +" cvt.s64.s32 %rd27, %r31;\n" +" mul.wide.s32 %rd28, %r31, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f29, [%rd29+0];\n" +" .loc 16 139 0\n" +" and.b32 %r32, %r29, 1073741823;\n" +" mov.u32 %r33, %r32;\n" +" mov.s32 %r34, 0;\n" +" mov.u32 %r35, %r34;\n" +" mov.s32 %r36, 0;\n" +" mov.u32 %r37, 
%r36;\n" +" mov.s32 %r38, 0;\n" +" mov.u32 %r39, %r38;\n" +" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n" +" mov.f32 %f34, %f30;\n" +" mov.f32 %f35, %f31;\n" +" mov.f32 %f36, %f32;\n" +" mov.f32 %f37, %f33;\n" +" cvt.rzi.ftz.s32.f32 %r40, %f37;\n" +" sub.ftz.f32 %f38, %f22, %f35;\n" +" sub.ftz.f32 %f39, %f21, %f34;\n" +" sub.ftz.f32 %f40, %f23, %f36;\n" +" mul.ftz.f32 %f41, %f38, %f38;\n" +" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" +" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" +" add.s32 %r41, %r40, %r28;\n" +" cvt.s64.s32 %rd30, %r41;\n" +" mul.wide.s32 %rd31, %r41, 16;\n" +" add.u64 %rd32, %rd31, %rd25;\n" +" ld.global.f32 %f44, [%rd32+8];\n" +" setp.gt.ftz.f32 %p4, %f44, %f43;\n" +" @!%p4 bra $Lt_0_21762;\n" +" .loc 16 151 0\n" +" sqrt.approx.ftz.f32 %f45, %f43;\n" +" ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd32+0];\n" +" sub.ftz.f32 %f49, %f45, %f48;\n" +" .loc 16 156 0\n" +" mul.ftz.f32 %f50, %f49, %f49;\n" +" rcp.approx.ftz.f32 %f51, %f50;\n" +" mul.ftz.f32 %f52, %f51, %f51;\n" +" mul.ftz.f32 %f53, %f51, %f52;\n" +" div.approx.ftz.f32 %f54, %f29, %f49;\n" +" div.approx.ftz.f32 %f55, %f54, %f45;\n" +" mul.ftz.f32 %f56, %f46, %f53;\n" +" sub.ftz.f32 %f57, %f56, %f47;\n" +" mul.ftz.f32 %f58, %f53, %f57;\n" +" mul.ftz.f32 %f59, %f55, %f58;\n" +" .loc 16 158 0\n" +" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n" +" .loc 16 159 0\n" +" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n" +" .loc 16 160 0\n" +" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n" +" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r43, 0;\n" +" setp.le.s32 %p5, %r42, %r43;\n" +" @%p5 bra $Lt_0_21250;\n" +" .loc 16 164 0\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd34, %rd33, %rd31;\n" +" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd34+0];\n" +" mul.ftz.f32 %f63, %f60, %f53;\n" +" sub.ftz.f32 %f64, %f63, %f61;\n" +" mul.ftz.f32 %f65, %f53, %f64;\n" +" sub.ftz.f32 %f66, %f65, %f62;\n" +" fma.rn.ftz.f32 %f28, %f29, %f66, %f28;\n" +"$Lt_0_21250:\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p6, %r44, %r45;\n" +" @%p6 bra $Lt_0_21762;\n" +" .loc 16 167 0\n" +" mov.f32 %f67, %f6;\n" +" mul.ftz.f32 %f68, %f39, %f39;\n" +" fma.rn.ftz.f32 %f69, %f59, %f68, %f67;\n" +" mov.f32 %f6, %f69;\n" +" .loc 16 168 0\n" +" mov.f32 %f70, %f8;\n" +" fma.rn.ftz.f32 %f71, %f59, %f41, %f70;\n" +" mov.f32 %f8, %f71;\n" +" .loc 16 169 0\n" +" mov.f32 %f72, %f10;\n" +" mul.ftz.f32 %f73, %f40, %f40;\n" +" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n" +" mov.f32 %f10, %f74;\n" +" .loc 16 170 0\n" +" mov.f32 %f75, %f12;\n" +" mul.ftz.f32 %f76, %f38, %f39;\n" +" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n" +" mov.f32 %f12, %f77;\n" +" .loc 16 171 0\n" +" mov.f32 %f78, %f14;\n" +" mul.ftz.f32 %f79, %f39, %f40;\n" +" fma.rn.ftz.f32 %f80, %f59, %f79, %f78;\n" +" mov.f32 %f14, %f80;\n" +" .loc 16 172 0\n" +" mul.ftz.f32 %f81, %f38, %f40;\n" +" fma.rn.ftz.f32 %f15, %f59, %f81, %f15;\n" +" mov.f32 %f16, %f15;\n" +"$Lt_0_21762:\n" +"$Lt_0_20738:\n" +" .loc 16 133 0\n" +" mul.lo.u64 %rd35, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd35;\n" +" setp.lt.u64 %p7, %rd16, %rd13;\n" +" @%p7 bra $Lt_0_20482;\n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_28162:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_19202:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" 
+"$Lt_0_18946:\n" +" mov.u32 %r46, 1;\n" +" setp.le.s32 %p8, %r1, %r46;\n" +" @%p8 bra $Lt_0_24578;\n" +" .loc 16 183 0\n" +" mov.u64 %rd36, __cuda___cuda_local_var_32584_35_non_const_red_acc108;\n" +" cvt.s64.s32 %rd37, %r2;\n" +" mul.wide.s32 %rd38, %r2, 4;\n" +" add.u64 %rd39, %rd36, %rd38;\n" +" mov.f32 %f82, %f27;\n" +" st.shared.f32 [%rd39+0], %f82;\n" +" .loc 16 184 0\n" +" mov.f32 %f83, %f26;\n" +" st.shared.f32 [%rd39+512], %f83;\n" +" .loc 16 185 0\n" +" mov.f32 %f84, %f25;\n" +" st.shared.f32 [%rd39+1024], %f84;\n" +" .loc 16 186 0\n" +" mov.f32 %f85, %f28;\n" +" st.shared.f32 [%rd39+1536], %f85;\n" +" .loc 16 188 0\n" +" shr.s32 %r47, %r1, 31;\n" +" mov.s32 %r48, 1;\n" +" and.b32 %r49, %r47, %r48;\n" +" add.s32 %r50, %r49, %r1;\n" +" shr.s32 %r51, %r50, 1;\n" +" mov.s32 %r52, %r51;\n" +" mov.u32 %r53, 0;\n" +" setp.ne.u32 %p9, %r51, %r53;\n" +" @!%p9 bra $Lt_0_23042;\n" +"$Lt_0_23554:\n" +" setp.ge.u32 %p10, %r6, %r52;\n" +" @%p10 bra $Lt_0_23810;\n" +" .loc 16 191 0\n" +" add.u32 %r54, %r2, %r52;\n" +" cvt.u64.u32 %rd40, %r54;\n" +" mul.wide.u32 %rd41, %r54, 4;\n" +" add.u64 %rd42, %rd36, %rd41;\n" +" ld.shared.f32 %f86, [%rd42+0];\n" +" add.ftz.f32 %f82, %f86, %f82;\n" +" st.shared.f32 [%rd39+0], %f82;\n" +" ld.shared.f32 %f87, [%rd42+512];\n" +" add.ftz.f32 %f83, %f87, %f83;\n" +" st.shared.f32 [%rd39+512], %f83;\n" +" ld.shared.f32 %f88, [%rd42+1024];\n" +" add.ftz.f32 %f84, %f88, %f84;\n" +" st.shared.f32 [%rd39+1024], %f84;\n" +" ld.shared.f32 %f89, [%rd42+1536];\n" +" add.ftz.f32 %f85, %f89, %f85;\n" +" st.shared.f32 [%rd39+1536], %f85;\n" +"$Lt_0_23810:\n" +" .loc 16 188 0\n" +" shr.u32 %r52, %r52, 1;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p11, %r52, %r55;\n" +" @%p11 bra $Lt_0_23554;\n" +"$Lt_0_23042:\n" +" .loc 16 195 0\n" +" mov.f32 %f27, %f82;\n" +" .loc 16 196 0\n" +" mov.f32 %f26, %f83;\n" +" .loc 16 197 0\n" +" mov.f32 %f25, %f84;\n" +" .loc 16 198 0\n" +" mov.f32 %f28, %f85;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p12, %r56, %r57;\n" +" @%p12 bra $Lt_0_24578;\n" +" .loc 16 202 0\n" +" mov.f32 %f82, %f6;\n" +" st.shared.f32 [%rd39+0], %f82;\n" +" mov.f32 %f83, %f8;\n" +" st.shared.f32 [%rd39+512], %f83;\n" +" mov.f32 %f84, %f10;\n" +" st.shared.f32 [%rd39+1024], %f84;\n" +" mov.f32 %f85, %f12;\n" +" st.shared.f32 [%rd39+1536], %f85;\n" +" mov.f32 %f90, %f14;\n" +" st.shared.f32 [%rd39+2048], %f90;\n" +" mov.f32 %f91, %f16;\n" +" st.shared.f32 [%rd39+2560], %f91;\n" +" .loc 16 204 0\n" +" mov.s32 %r58, %r51;\n" +" @!%p9 bra $Lt_0_25090;\n" +"$Lt_0_25602:\n" +" setp.ge.u32 %p13, %r6, %r58;\n" +" @%p13 bra $Lt_0_25858;\n" +" .loc 16 207 0\n" +" add.u32 %r59, %r2, %r58;\n" +" cvt.u64.u32 %rd43, %r59;\n" +" mul.wide.u32 %rd44, %r59, 4;\n" +" add.u64 %rd45, %rd36, %rd44;\n" +" ld.shared.f32 %f92, [%rd45+0];\n" +" add.ftz.f32 %f82, %f92, %f82;\n" +" st.shared.f32 [%rd39+0], %f82;\n" +" ld.shared.f32 %f93, [%rd45+512];\n" +" add.ftz.f32 %f83, %f93, %f83;\n" +" st.shared.f32 [%rd39+512], %f83;\n" +" ld.shared.f32 %f94, [%rd45+1024];\n" +" add.ftz.f32 %f84, %f94, %f84;\n" +" st.shared.f32 [%rd39+1024], %f84;\n" +" ld.shared.f32 %f95, [%rd45+1536];\n" +" add.ftz.f32 %f85, %f95, %f85;\n" +" st.shared.f32 [%rd39+1536], %f85;\n" +" ld.shared.f32 %f96, [%rd45+2048];\n" +" add.ftz.f32 %f90, %f96, %f90;\n" +" st.shared.f32 [%rd39+2048], %f90;\n" +" ld.shared.f32 %f97, [%rd45+2560];\n" +" add.ftz.f32 %f91, %f97, %f91;\n" +" st.shared.f32 [%rd39+2560], %f91;\n" +"$Lt_0_25858:\n" +" .loc 16 204 0\n" +" shr.u32 %r58, %r58, 
1;\n" +" mov.u32 %r60, 0;\n" +" setp.ne.u32 %p14, %r58, %r60;\n" +" @%p14 bra $Lt_0_25602;\n" +"$Lt_0_25090:\n" +" .loc 16 212 0\n" +" mov.f32 %f6, %f82;\n" +" mov.f32 %f8, %f83;\n" +" mov.f32 %f10, %f84;\n" +" mov.f32 %f12, %f85;\n" +" mov.f32 %f14, %f90;\n" +" mov.f32 %f16, %f91;\n" +"$Lt_0_24578:\n" +"$Lt_0_22530:\n" +" selp.s32 %r61, 1, 0, %p1;\n" +" mov.s32 %r62, 0;\n" +" set.eq.u32.s32 %r63, %r6, %r62;\n" +" neg.s32 %r64, %r63;\n" +" and.b32 %r65, %r61, %r64;\n" +" mov.u32 %r66, 0;\n" +" setp.eq.s32 %p15, %r65, %r66;\n" +" @%p15 bra $Lt_0_26626;\n" +" .loc 16 218 0\n" +" cvt.s64.s32 %rd46, %r9;\n" +" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd48, %r9, 4;\n" +" add.u64 %rd49, %rd47, %rd48;\n" +" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r68, 0;\n" +" setp.le.s32 %p16, %r67, %r68;\n" +" @%p16 bra $Lt_0_27138;\n" +" .loc 16 220 0\n" +" st.global.f32 [%rd49+0], %f28;\n" +" .loc 16 221 0\n" +" cvt.s64.s32 %rd50, %r10;\n" +" mul.wide.s32 %rd51, %r10, 4;\n" +" add.u64 %rd49, %rd49, %rd51;\n" +"$Lt_0_27138:\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p17, %r69, %r70;\n" +" @%p17 bra $Lt_0_27650;\n" +" .loc 16 225 0\n" +" mov.f32 %f98, %f6;\n" +" st.global.f32 [%rd49+0], %f98;\n" +" .loc 16 226 0\n" +" cvt.s64.s32 %rd52, %r10;\n" +" mul.wide.s32 %rd53, %r10, 4;\n" +" add.u64 %rd54, %rd53, %rd49;\n" +" .loc 16 225 0\n" +" mov.f32 %f99, %f8;\n" +" st.global.f32 [%rd54+0], %f99;\n" +" .loc 16 226 0\n" +" add.u64 %rd55, %rd53, %rd54;\n" +" .loc 16 225 0\n" +" mov.f32 %f100, %f10;\n" +" st.global.f32 [%rd55+0], %f100;\n" +" .loc 16 226 0\n" +" add.u64 %rd56, %rd53, %rd55;\n" +" .loc 16 225 0\n" +" mov.f32 %f101, %f12;\n" +" st.global.f32 [%rd56+0], %f101;\n" +" .loc 16 226 0\n" +" add.u64 %rd49, %rd53, %rd56;\n" +" .loc 16 225 0\n" +" mov.f32 %f102, %f14;\n" +" st.global.f32 [%rd49+0], %f102;\n" +" mov.f32 %f103, %f16;\n" +" add.u64 %rd57, %rd53, %rd49;\n" +" st.global.f32 [%rd57+0], %f103;\n" +"$Lt_0_27650:\n" +" .loc 16 229 0\n" +" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd59, %rd46, 16;\n" +" add.u64 %rd60, %rd58, %rd59;\n" +" mov.f32 %f104, %f105;\n" +" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f104};\n" +"$Lt_0_26626:\n" +" .loc 16 231 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<74>;\n" +" .reg .u64 %rd<74>;\n" +" .reg .f32 %f<114>;\n" +" .reg .f64 %fd<4>;\n" +" .reg .pred %p<22>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32650_33_non_const_sp_lj3268[16];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32648_34_non_const_lj13296[1936];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32649_34_non_const_lj35232[1936];\n" +" .shared .align 4 .b8 
__cuda___cuda_local_var_32742_35_non_const_red_acc7168[3072];\n" +" .loc 16 239 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 3;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_21250;\n" +" .loc 16 249 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_21250:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_21762;\n" +" .loc 16 251 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;\n" +" cvt.s64.s32 %rd8, %r1;\n" +" mul.wide.s32 %rd9, %r1, 16;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" add.u64 %rd11, %rd10, %rd9;\n" +" add.u64 %rd12, %rd9, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" +" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r5, 0;\n" +" setp.le.s32 %p3, %r4, %r5;\n" +" @%p3 bra $Lt_1_22274;\n" +" .loc 16 253 0\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd15, %rd14, %rd9;\n" +" add.u64 %rd16, %rd9, %rd13;\n" +" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" +" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" +"$Lt_1_22274:\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;\n" +"$Lt_1_21762:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;\n" +" .loc 16 263 0\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" mov.f32 %f20, 0f00000000; \n" +" mov.f32 %f21, %f20;\n" +" .loc 16 265 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r7, %r1, %r6;\n" +" cvt.s32.u32 %r8, %ntid.x;\n" +" div.s32 %r9, %r8, %r6;\n" +" rem.s32 %r10, %r1, %r6;\n" +" cvt.s32.u32 %r11, %ctaid.x;\n" +" mul.lo.s32 %r12, %r11, %r9;\n" +" add.s32 %r13, %r7, %r12;\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p4, %r13, %r14;\n" +" @!%p4 bra $Lt_1_23042;\n" +" .loc 16 271 0\n" +" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd17, %r15;\n" +" mul.wide.s32 %rd18, %r15, 4;\n" +" cvt.s64.s32 %rd19, %r13;\n" +" mul.wide.s32 %rd20, %r13, 4;\n" +" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd22, %rd20, %rd21;\n" +" add.u64 %rd23, %rd18, %rd22;\n" +" ld.global.s32 %r16, [%rd23+0];\n" +" add.u64 %rd24, %rd18, %rd23;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p5, %rd25, %rd21;\n" +" @%p5 bra $Lt_1_23554;\n" +" .loc 16 277 0\n" +" cvt.s32.s64 %r17, %rd17;\n" +" mul.lo.s32 %r18, %r17, %r16;\n" +" cvt.s64.s32 %rd26, %r18;\n" +" mul.wide.s32 %rd27, %r18, 4;\n" +" add.u64 %rd28, %rd24, %rd27;\n" +" .loc 16 278 0\n" +" mul.lo.s32 %r19, %r10, %r17;\n" +" cvt.s64.s32 %rd29, %r19;\n" +" 
mul.wide.s32 %rd30, %r19, 4;\n" +" add.u64 %rd31, %rd24, %rd30;\n" +" .loc 16 279 0\n" +" mul.lo.s32 %r20, %r17, %r6;\n" +" bra.uni $Lt_1_23298;\n" +"$Lt_1_23554:\n" +" .loc 16 281 0\n" +" ld.global.s32 %r21, [%rd24+0];\n" +" cvt.s64.s32 %rd32, %r21;\n" +" mul.wide.s32 %rd33, %r21, 4;\n" +" add.u64 %rd34, %rd25, %rd33;\n" +" .loc 16 282 0\n" +" cvt.s64.s32 %rd35, %r16;\n" +" mul.wide.s32 %rd36, %r16, 4;\n" +" add.u64 %rd28, %rd34, %rd36;\n" +" .loc 16 283 0\n" +" mov.s32 %r20, %r6;\n" +" .loc 16 284 0\n" +" cvt.s64.s32 %rd37, %r10;\n" +" mul.wide.s32 %rd38, %r10, 4;\n" +" add.u64 %rd31, %rd34, %rd38;\n" +"$Lt_1_23298:\n" +" .loc 16 287 0\n" +" ld.global.s32 %r22, [%rd22+0];\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" mov.f32 %f29, %f25;\n" +" setp.ge.u64 %p6, %rd31, %rd28;\n" +" @%p6 bra $Lt_1_32002;\n" +" cvt.rzi.ftz.s32.f32 %r30, %f29;\n" +" cvt.s64.s32 %rd39, %r20;\n" +" mul.lo.s32 %r31, %r30, 11;\n" +" cvt.rn.f32.s32 %f30, %r31;\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +"$Lt_1_24322:\n" +" .loc 16 294 0\n" +" ld.global.s32 %r32, [%rd31+0];\n" +" .loc 16 295 0\n" +" shr.s32 %r33, %r32, 30;\n" +" and.b32 %r34, %r33, 3;\n" +" cvt.s64.s32 %rd40, %r34;\n" +" mul.wide.s32 %rd41, %r34, 4;\n" +" add.u64 %rd42, %rd1, %rd41;\n" +" ld.shared.f32 %f35, [%rd42+0];\n" +" .loc 16 298 0\n" +" and.b32 %r35, %r32, 1073741823;\n" +" mov.u32 %r36, %r35;\n" +" mov.s32 %r37, 0;\n" +" mov.u32 %r38, %r37;\n" +" mov.s32 %r39, 0;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];\n" +" mov.f32 %f40, %f36;\n" +" mov.f32 %f41, %f37;\n" +" mov.f32 %f42, %f38;\n" +" mov.f32 %f43, %f39;\n" +" sub.ftz.f32 %f44, %f27, %f41;\n" +" sub.ftz.f32 %f45, %f26, %f40;\n" +" sub.ftz.f32 %f46, %f28, %f42;\n" +" mul.ftz.f32 %f47, %f44, %f44;\n" +" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" +" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" +" add.ftz.f32 %f50, %f30, %f43;\n" +" cvt.rzi.ftz.s32.f32 %r43, %f50;\n" +" cvt.s64.s32 %rd43, %r43;\n" +" mul.wide.s32 %rd44, %r43, 16;\n" +" add.u64 %rd45, %rd44, %rd7;\n" +" ld.shared.f32 %f51, [%rd45+8];\n" +" setp.gt.ftz.f32 %p7, %f51, %f49;\n" +" @!%p7 bra $Lt_1_25602;\n" +" .loc 16 309 0\n" +" sqrt.approx.ftz.f32 %f52, %f49;\n" +" ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd45+0];\n" +" sub.ftz.f32 %f56, %f52, %f55;\n" +" .loc 16 313 0\n" +" mul.ftz.f32 %f57, %f56, %f56;\n" +" cvt.ftz.f64.f32 %fd1, %f57;\n" +" rcp.rn.f64 %fd2, %fd1;\n" +" cvt.rn.ftz.f32.f64 %f58, %fd2;\n" +" mul.ftz.f32 %f59, %f58, %f58;\n" +" mul.ftz.f32 %f60, %f58, %f59;\n" +" mul.ftz.f32 %f61, %f53, %f60;\n" +" sub.ftz.f32 %f62, %f61, %f54;\n" +" mul.ftz.f32 %f63, %f60, %f62;\n" +" .loc 16 314 0\n" +" div.approx.ftz.f32 %f64, %f35, %f56;\n" +" div.approx.ftz.f32 %f65, %f64, %f52;\n" +" mul.ftz.f32 %f66, %f63, %f65;\n" +" .loc 16 316 0\n" +" fma.rn.ftz.f32 %f33, %f45, %f66, %f33;\n" +" .loc 16 317 0\n" +" fma.rn.ftz.f32 %f32, %f44, %f66, %f32;\n" +" .loc 16 318 0\n" +" fma.rn.ftz.f32 %f31, %f46, %f66, %f31;\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p8, %r44, %r45;\n" +" @%p8 bra 
$Lt_1_25090;\n" +" .loc 16 321 0\n" +" add.u64 %rd46, %rd44, %rd13;\n" +" ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd46+0];\n" +" mul.ftz.f32 %f70, %f67, %f60;\n" +" sub.ftz.f32 %f71, %f70, %f68;\n" +" mul.ftz.f32 %f72, %f60, %f71;\n" +" .loc 16 322 0\n" +" sub.ftz.f32 %f73, %f72, %f69;\n" +" fma.rn.ftz.f32 %f34, %f35, %f73, %f34;\n" +"$Lt_1_25090:\n" +" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r47, 0;\n" +" setp.le.s32 %p9, %r46, %r47;\n" +" @%p9 bra $Lt_1_25602;\n" +" .loc 16 325 0\n" +" mov.f32 %f74, %f11;\n" +" mul.ftz.f32 %f75, %f45, %f45;\n" +" fma.rn.ftz.f32 %f76, %f66, %f75, %f74;\n" +" mov.f32 %f11, %f76;\n" +" .loc 16 326 0\n" +" mov.f32 %f77, %f13;\n" +" fma.rn.ftz.f32 %f78, %f66, %f47, %f77;\n" +" mov.f32 %f13, %f78;\n" +" .loc 16 327 0\n" +" mov.f32 %f79, %f15;\n" +" mul.ftz.f32 %f80, %f46, %f46;\n" +" fma.rn.ftz.f32 %f81, %f66, %f80, %f79;\n" +" mov.f32 %f15, %f81;\n" +" .loc 16 328 0\n" +" mov.f32 %f82, %f17;\n" +" mul.ftz.f32 %f83, %f44, %f45;\n" +" fma.rn.ftz.f32 %f84, %f66, %f83, %f82;\n" +" mov.f32 %f17, %f84;\n" +" .loc 16 329 0\n" +" mov.f32 %f85, %f19;\n" +" mul.ftz.f32 %f86, %f45, %f46;\n" +" fma.rn.ftz.f32 %f87, %f66, %f86, %f85;\n" +" mov.f32 %f19, %f87;\n" +" .loc 16 330 0\n" +" mul.ftz.f32 %f88, %f44, %f46;\n" +" fma.rn.ftz.f32 %f20, %f66, %f88, %f20;\n" +" mov.f32 %f21, %f20;\n" +"$Lt_1_25602:\n" +"$Lt_1_24578:\n" +" .loc 16 292 0\n" +" mul.lo.u64 %rd47, %rd39, 4;\n" +" add.u64 %rd31, %rd31, %rd47;\n" +" setp.lt.u64 %p10, %rd31, %rd28;\n" +" @%p10 bra $Lt_1_24322;\n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_32002:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_23042:\n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +"$Lt_1_22786:\n" +" mov.u32 %r48, 1;\n" +" setp.le.s32 %p11, %r6, %r48;\n" +" @%p11 bra $Lt_1_28418;\n" +" .loc 16 341 0\n" +" mov.u64 %rd48, __cuda___cuda_local_var_32742_35_non_const_red_acc7168;\n" +" cvt.s64.s32 %rd49, %r1;\n" +" mul.wide.s32 %rd50, %r1, 4;\n" +" add.u64 %rd51, %rd48, %rd50;\n" +" mov.f32 %f89, %f33;\n" +" st.shared.f32 [%rd51+0], %f89;\n" +" .loc 16 342 0\n" +" mov.f32 %f90, %f32;\n" +" st.shared.f32 [%rd51+512], %f90;\n" +" .loc 16 343 0\n" +" mov.f32 %f91, %f31;\n" +" st.shared.f32 [%rd51+1024], %f91;\n" +" .loc 16 344 0\n" +" mov.f32 %f92, %f34;\n" +" st.shared.f32 [%rd51+1536], %f92;\n" +" .loc 16 346 0\n" +" shr.s32 %r49, %r6, 31;\n" +" mov.s32 %r50, 1;\n" +" and.b32 %r51, %r49, %r50;\n" +" add.s32 %r52, %r51, %r6;\n" +" shr.s32 %r53, %r52, 1;\n" +" mov.s32 %r54, %r53;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p12, %r53, %r55;\n" +" @!%p12 bra $Lt_1_26882;\n" +"$Lt_1_27394:\n" +" setp.ge.u32 %p13, %r10, %r54;\n" +" @%p13 bra $Lt_1_27650;\n" +" .loc 16 349 0\n" +" add.u32 %r56, %r1, %r54;\n" +" cvt.u64.u32 %rd52, %r56;\n" +" mul.wide.u32 %rd53, %r56, 4;\n" +" add.u64 %rd54, %rd48, %rd53;\n" +" ld.shared.f32 %f93, [%rd54+0];\n" +" add.ftz.f32 %f89, %f93, %f89;\n" +" st.shared.f32 [%rd51+0], %f89;\n" +" ld.shared.f32 %f94, [%rd54+512];\n" +" add.ftz.f32 %f90, %f94, %f90;\n" +" st.shared.f32 [%rd51+512], %f90;\n" +" ld.shared.f32 %f95, [%rd54+1024];\n" +" add.ftz.f32 %f91, %f95, %f91;\n" +" st.shared.f32 [%rd51+1024], %f91;\n" +" ld.shared.f32 %f96, [%rd54+1536];\n" +" add.ftz.f32 %f92, %f96, %f92;\n" +" st.shared.f32 [%rd51+1536], %f92;\n" +"$Lt_1_27650:\n" +" .loc 16 346 0\n" +" shr.u32 %r54, %r54, 
1;\n" +" mov.u32 %r57, 0;\n" +" setp.ne.u32 %p14, %r54, %r57;\n" +" @%p14 bra $Lt_1_27394;\n" +"$Lt_1_26882:\n" +" .loc 16 353 0\n" +" mov.f32 %f33, %f89;\n" +" .loc 16 354 0\n" +" mov.f32 %f32, %f90;\n" +" .loc 16 355 0\n" +" mov.f32 %f31, %f91;\n" +" .loc 16 356 0\n" +" mov.f32 %f34, %f92;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p15, %r58, %r59;\n" +" @%p15 bra $Lt_1_28418;\n" +" .loc 16 360 0\n" +" mov.f32 %f89, %f11;\n" +" st.shared.f32 [%rd51+0], %f89;\n" +" mov.f32 %f90, %f13;\n" +" st.shared.f32 [%rd51+512], %f90;\n" +" mov.f32 %f91, %f15;\n" +" st.shared.f32 [%rd51+1024], %f91;\n" +" mov.f32 %f92, %f17;\n" +" st.shared.f32 [%rd51+1536], %f92;\n" +" mov.f32 %f97, %f19;\n" +" st.shared.f32 [%rd51+2048], %f97;\n" +" mov.f32 %f98, %f21;\n" +" st.shared.f32 [%rd51+2560], %f98;\n" +" .loc 16 362 0\n" +" mov.s32 %r60, %r53;\n" +" @!%p12 bra $Lt_1_28930;\n" +"$Lt_1_29442:\n" +" setp.ge.u32 %p16, %r10, %r60;\n" +" @%p16 bra $Lt_1_29698;\n" +" .loc 16 365 0\n" +" add.u32 %r61, %r1, %r60;\n" +" cvt.u64.u32 %rd55, %r61;\n" +" mul.wide.u32 %rd56, %r61, 4;\n" +" add.u64 %rd57, %rd48, %rd56;\n" +" ld.shared.f32 %f99, [%rd57+0];\n" +" add.ftz.f32 %f89, %f99, %f89;\n" +" st.shared.f32 [%rd51+0], %f89;\n" +" ld.shared.f32 %f100, [%rd57+512];\n" +" add.ftz.f32 %f90, %f100, %f90;\n" +" st.shared.f32 [%rd51+512], %f90;\n" +" ld.shared.f32 %f101, [%rd57+1024];\n" +" add.ftz.f32 %f91, %f101, %f91;\n" +" st.shared.f32 [%rd51+1024], %f91;\n" +" ld.shared.f32 %f102, [%rd57+1536];\n" +" add.ftz.f32 %f92, %f102, %f92;\n" +" st.shared.f32 [%rd51+1536], %f92;\n" +" ld.shared.f32 %f103, [%rd57+2048];\n" +" add.ftz.f32 %f97, %f103, %f97;\n" +" st.shared.f32 [%rd51+2048], %f97;\n" +" ld.shared.f32 %f104, [%rd57+2560];\n" +" add.ftz.f32 %f98, %f104, %f98;\n" +" st.shared.f32 [%rd51+2560], %f98;\n" +"$Lt_1_29698:\n" +" .loc 16 362 0\n" +" shr.u32 %r60, %r60, 1;\n" +" mov.u32 %r62, 0;\n" +" setp.ne.u32 %p17, %r60, %r62;\n" +" @%p17 bra $Lt_1_29442;\n" +"$Lt_1_28930:\n" +" .loc 16 370 0\n" +" mov.f32 %f11, %f89;\n" +" mov.f32 %f13, %f90;\n" +" mov.f32 %f15, %f91;\n" +" mov.f32 %f17, %f92;\n" +" mov.f32 %f19, %f97;\n" +" mov.f32 %f21, %f98;\n" +"$Lt_1_28418:\n" +"$Lt_1_26370:\n" +" selp.s32 %r63, 1, 0, %p4;\n" +" mov.s32 %r64, 0;\n" +" set.eq.u32.s32 %r65, %r10, %r64;\n" +" neg.s32 %r66, %r65;\n" +" and.b32 %r67, %r63, %r66;\n" +" mov.u32 %r68, 0;\n" +" setp.eq.s32 %p18, %r67, %r68;\n" +" @%p18 bra $Lt_1_30466;\n" +" .loc 16 376 0\n" +" cvt.s64.s32 %rd58, %r13;\n" +" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd60, %r13, 4;\n" +" add.u64 %rd61, %rd59, %rd60;\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p19, %r69, %r70;\n" +" @%p19 bra $Lt_1_30978;\n" +" .loc 16 378 0\n" +" st.global.f32 [%rd61+0], %f34;\n" +" .loc 16 379 0\n" +" cvt.s64.s32 %rd62, %r14;\n" +" mul.wide.s32 %rd63, %r14, 4;\n" +" add.u64 %rd61, %rd61, %rd63;\n" +"$Lt_1_30978:\n" +" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r72, 0;\n" +" setp.le.s32 %p20, %r71, %r72;\n" +" @%p20 bra $Lt_1_31490;\n" +" .loc 16 383 0\n" +" mov.f32 %f105, %f11;\n" +" st.global.f32 [%rd61+0], %f105;\n" +" .loc 16 384 0\n" +" cvt.s64.s32 %rd64, %r14;\n" +" mul.wide.s32 %rd65, %r14, 4;\n" +" add.u64 %rd66, %rd65, %rd61;\n" +" .loc 16 383 0\n" +" mov.f32 %f106, %f13;\n" +" st.global.f32 [%rd66+0], %f106;\n" +" .loc 16 384 0\n" +" add.u64 %rd67, %rd65, %rd66;\n" +" .loc 16 383 0\n" +" mov.f32 %f107, 
%f15;\n" +" st.global.f32 [%rd67+0], %f107;\n" +" .loc 16 384 0\n" +" add.u64 %rd68, %rd65, %rd67;\n" +" .loc 16 383 0\n" +" mov.f32 %f108, %f17;\n" +" st.global.f32 [%rd68+0], %f108;\n" +" .loc 16 384 0\n" +" add.u64 %rd61, %rd65, %rd68;\n" +" .loc 16 383 0\n" +" mov.f32 %f109, %f19;\n" +" st.global.f32 [%rd61+0], %f109;\n" +" mov.f32 %f110, %f21;\n" +" add.u64 %rd69, %rd65, %rd61;\n" +" st.global.f32 [%rd69+0], %f110;\n" +"$Lt_1_31490:\n" +" .loc 16 387 0\n" +" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd71, %rd58, 16;\n" +" add.u64 %rd72, %rd70, %rd71;\n" +" mov.f32 %f111, %f112;\n" +" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f111};\n" +"$Lt_1_30466:\n" +" .loc 16 389 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/ljc_cut_gpu_kernel.ptx b/lib/gpu/ljc_cut_gpu_kernel.ptx new file mode 100644 index 000000000..da24f7325 --- /dev/null +++ b/lib/gpu/ljc_cut_gpu_kernel.ptx @@ -0,0 +1,1146 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000be65_00000000-9_ljc_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.y5QnHe) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000be65_00000000-8_ljc_cut_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "ljc_cut_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 
__cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_q_, + .param .u64 __cudaparm_kernel_pair_cutsq, + .param .f32 __cudaparm_kernel_pair_qqrd2e, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<86>; + .reg .u64 %rd<66>; + .reg .f32 %f<130>; + .reg .pred %p<21>; + .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32]; + .shared .align 4 .b8 __cuda___cuda_local_var_32603_35_non_const_red_acc144[3072]; + // __cuda_local_var_32510_10_non_const_f = 48 + // __cuda_local_var_32514_9_non_const_virial = 16 + // __cuda_local_var_32562_43_non_const_r6inv = 40 + .loc 16 100 0 +$LDWbegin_kernel_pair: + .loc 16 107 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 108 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 109 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 110 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; + .loc 16 111 0 + ld.global.f32 %f5, [%rd1+16]; + .loc 16 112 0 + ld.global.f32 %f6, [%rd1+20]; + .loc 16 113 0 + ld.global.f32 %f7, [%rd1+24]; + .loc 16 114 0 + ld.global.f32 %f8, [%rd1+28]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; + .loc 16 124 0 + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + mov.f32 %f17, 0f00000000; // 0 + mov.f32 %f18, %f17; + mov.f32 %f19, 0f00000000; // 0 + mov.f32 %f20, %f19; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_21506; + .loc 16 128 0 + cvt.s64.s32 %rd2, %r9; + mul.wide.s32 %rd3, %r9, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd5, %rd3, %rd4; + ld.global.s32 %r11, [%rd5+0]; + .loc 16 130 0 + ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd6, %r12; + mul.wide.s32 %rd7, %r12, 4; + add.u64 %rd8, %rd7, %rd5; + ld.global.s32 %r13, [%rd8+0]; + add.u64 %rd9, %rd7, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd4; + @%p2 bra $Lt_0_22018; + .loc 16 136 0 + cvt.s32.s64 %r14, %rd6; + mul.lo.s32 %r15, %r14, %r13; + cvt.s64.s32 %rd11, %r15; + mul.wide.s32 %rd12, %r15, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 137 0 + mul.lo.s32 %r16, %r6, %r14; + cvt.s64.s32 %rd14, %r16; + mul.wide.s32 %rd15, %r16, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 138 0 + mul.lo.s32 %r17, %r14, %r1; + bra.uni $Lt_0_21762; +$Lt_0_22018: + .loc 16 140 0 + ld.global.s32 %r18, [%rd9+0]; + cvt.s64.s32 %rd17, %r18; + mul.wide.s32 %rd18, %r18, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 141 0 + cvt.s64.s32 %rd20, %r13; + mul.wide.s32 %rd21, %r13, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 142 0 + mov.s32 %r17, %r1; + .loc 16 143 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 
%rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_21762: + .loc 16 146 0 + mov.u32 %r19, %r11; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f25, %f21; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + .loc 16 147 0 + mov.u32 %r26, %r11; + mov.s32 %r27, 0; + mov.u32 %r28, %r27; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}]; + mov.f32 %f33, %f29; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_32002; + cvt.rzi.ftz.s32.f32 %r33, %f28; + cvt.s64.s32 %rd24, %r17; + ld.param.s32 %r34, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r35, %r34, %r33; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_cutsq]; + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112; +$Lt_0_22786: + //<loop> Loop body line 147, nesting depth: 1, estimated iterations: unknown + .loc 16 151 0 + ld.global.s32 %r36, [%rd16+0]; + .loc 16 154 0 + shr.s32 %r37, %r36, 30; + and.b32 %r38, %r37, 3; + cvt.s64.s32 %rd27, %r38; + mul.wide.s32 %rd28, %r38, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f39, [%rd29+0]; + .loc 16 158 0 + and.b32 %r39, %r36, 1073741823; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + mov.s32 %r43, 0; + mov.u32 %r44, %r43; + mov.s32 %r45, 0; + mov.u32 %r46, %r45; + tex.1d.v4.f32.s32 {%f40,%f41,%f42,%f43},[pos_tex,{%r40,%r42,%r44,%r46}]; + mov.f32 %f44, %f40; + mov.f32 %f45, %f41; + mov.f32 %f46, %f42; + mov.f32 %f47, %f43; + cvt.rzi.ftz.s32.f32 %r47, %f47; + sub.ftz.f32 %f48, %f26, %f45; + sub.ftz.f32 %f49, %f25, %f44; + sub.ftz.f32 %f50, %f27, %f46; + mul.ftz.f32 %f51, %f48, %f48; + fma.rn.ftz.f32 %f52, %f49, %f49, %f51; + add.s32 %r48, %r47, %r35; + cvt.s64.s32 %rd30, %r48; + fma.rn.ftz.f32 %f53, %f50, %f50, %f52; + mul.wide.s32 %rd31, %r48, 4; + add.u64 %rd32, %rd25, %rd31; + ld.global.f32 %f54, [%rd32+0]; + setp.gt.ftz.f32 %p4, %f54, %f53; + @!%p4 bra $Lt_0_25602; + mul.lo.u64 %rd33, %rd30, 16; + rcp.approx.ftz.f32 %f55, %f53; + ld.param.u64 %rd34, [__cudaparm_kernel_pair_lj1]; + add.u64 %rd35, %rd34, %rd33; + ld.global.f32 %f56, [%rd35+8]; + setp.lt.ftz.f32 %p5, %f53, %f56; + @!%p5 bra $Lt_0_23810; + .loc 16 173 0 + mul.ftz.f32 %f57, %f55, %f55; + mul.ftz.f32 %f58, %f55, %f57; + mov.f32 %f59, %f58; + .loc 16 174 0 + mul.ftz.f32 %f60, %f58, %f39; + ld.global.v2.f32 {%f61,%f62}, [%rd35+0]; + mul.ftz.f32 %f63, %f61, %f58; + sub.ftz.f32 %f64, %f63, %f62; + mul.ftz.f32 %f65, %f60, %f64; + bra.uni $Lt_0_23554; +$Lt_0_23810: + .loc 16 176 0 + mov.f32 %f65, 0f00000000; // 0 +$Lt_0_23554: + ld.global.f32 %f66, [%rd35+12]; + setp.gt.ftz.f32 %p6, %f66, %f53; + @!%p6 bra $Lt_0_24322; + .loc 16 179 0 + mov.u32 %r49, %r39; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f67,%f68,%f69,%f70},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f71, %f67; + ld.shared.f32 %f72, [%rd29+16]; + ld.param.f32 %f73, [__cudaparm_kernel_pair_qqrd2e]; + mul.ftz.f32 %f74, %f73, %f33; + mul.ftz.f32 %f75, %f71, %f74; + sqrt.approx.ftz.f32 %f76, %f55; + mul.ftz.f32 %f77, %f75, %f76; + mul.ftz.f32 %f78, %f72, %f77; + bra.uni $Lt_0_24066; +$Lt_0_24322: + .loc 16 
181 0 + mov.f32 %f78, 0f00000000; // 0 +$Lt_0_24066: + .loc 16 185 0 + add.ftz.f32 %f79, %f78, %f65; + mul.ftz.f32 %f80, %f79, %f55; + fma.rn.ftz.f32 %f36, %f49, %f80, %f36; + .loc 16 186 0 + fma.rn.ftz.f32 %f35, %f48, %f80, %f35; + .loc 16 187 0 + fma.rn.ftz.f32 %f34, %f50, %f80, %f34; + ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r57, 0; + setp.le.s32 %p7, %r56, %r57; + @%p7 bra $Lt_0_25090; + .loc 16 190 0 + add.ftz.f32 %f37, %f78, %f37; + @!%p5 bra $Lt_0_25090; + .loc 16 193 0 + ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd37, %rd36, %rd33; + mov.f32 %f81, %f59; + ld.global.v4.f32 {%f82,%f83,%f84,_}, [%rd37+0]; + mul.ftz.f32 %f85, %f82, %f81; + sub.ftz.f32 %f86, %f85, %f83; + mul.ftz.f32 %f87, %f81, %f86; + sub.ftz.f32 %f88, %f87, %f84; + fma.rn.ftz.f32 %f38, %f39, %f88, %f38; +$Lt_0_25090: +$Lt_0_24578: + ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p8, %r58, %r59; + @%p8 bra $Lt_0_25602; + .loc 16 197 0 + mov.f32 %f89, %f10; + mul.ftz.f32 %f90, %f49, %f49; + fma.rn.ftz.f32 %f91, %f80, %f90, %f89; + mov.f32 %f10, %f91; + .loc 16 198 0 + mov.f32 %f92, %f12; + fma.rn.ftz.f32 %f93, %f80, %f51, %f92; + mov.f32 %f12, %f93; + .loc 16 199 0 + mov.f32 %f94, %f14; + mul.ftz.f32 %f95, %f50, %f50; + fma.rn.ftz.f32 %f96, %f80, %f95, %f94; + mov.f32 %f14, %f96; + .loc 16 200 0 + mov.f32 %f97, %f16; + mul.ftz.f32 %f98, %f48, %f49; + fma.rn.ftz.f32 %f99, %f80, %f98, %f97; + mov.f32 %f16, %f99; + .loc 16 201 0 + mov.f32 %f100, %f18; + mul.ftz.f32 %f101, %f49, %f50; + fma.rn.ftz.f32 %f102, %f80, %f101, %f100; + mov.f32 %f18, %f102; + .loc 16 202 0 + mul.ftz.f32 %f103, %f48, %f50; + fma.rn.ftz.f32 %f19, %f80, %f103, %f19; + mov.f32 %f20, %f19; +$Lt_0_25602: +$Lt_0_23042: + .loc 16 150 0 + mul.lo.u64 %rd38, %rd24, 4; + add.u64 %rd16, %rd16, %rd38; + setp.lt.u64 %p9, %rd16, %rd13; + @%p9 bra $Lt_0_22786; + bra.uni $Lt_0_21250; +$Lt_0_32002: + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + bra.uni $Lt_0_21250; +$Lt_0_21506: + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 +$Lt_0_21250: + mov.u32 %r60, 1; + setp.le.s32 %p10, %r1, %r60; + @%p10 bra $Lt_0_28418; + .loc 16 213 0 + mov.u64 %rd39, __cuda___cuda_local_var_32603_35_non_const_red_acc144; + cvt.s64.s32 %rd40, %r2; + mul.wide.s32 %rd41, %r2, 4; + add.u64 %rd42, %rd39, %rd41; + mov.f32 %f104, %f36; + st.shared.f32 [%rd42+0], %f104; + .loc 16 214 0 + mov.f32 %f105, %f35; + st.shared.f32 [%rd42+512], %f105; + .loc 16 215 0 + mov.f32 %f106, %f34; + st.shared.f32 [%rd42+1024], %f106; + .loc 16 216 0 + mov.f32 %f107, %f38; + st.shared.f32 [%rd42+1536], %f107; + .loc 16 217 0 + mov.f32 %f108, %f37; + st.shared.f32 [%rd42+2048], %f108; + .loc 16 219 0 + shr.s32 %r61, %r1, 31; + mov.s32 %r62, 1; + and.b32 %r63, %r61, %r62; + add.s32 %r64, %r63, %r1; + shr.s32 %r65, %r64, 1; + mov.s32 %r66, %r65; + mov.u32 %r67, 0; + setp.ne.u32 %p11, %r65, %r67; + @!%p11 bra $Lt_0_26882; +$Lt_0_27394: + setp.ge.u32 %p12, %r6, %r66; + @%p12 bra $Lt_0_27650; + .loc 16 222 0 + add.u32 %r68, %r2, %r66; + cvt.u64.u32 %rd43, %r68; + mul.wide.u32 %rd44, %r68, 4; + add.u64 %rd45, %rd39, %rd44; + ld.shared.f32 %f109, [%rd45+0]; + add.ftz.f32 %f104, %f109, %f104; + st.shared.f32 [%rd42+0], %f104; + ld.shared.f32 %f110, [%rd45+512]; + add.ftz.f32 %f105, %f110, %f105; + 
st.shared.f32 [%rd42+512], %f105; + ld.shared.f32 %f111, [%rd45+1024]; + add.ftz.f32 %f106, %f111, %f106; + st.shared.f32 [%rd42+1024], %f106; + ld.shared.f32 %f112, [%rd45+1536]; + add.ftz.f32 %f107, %f112, %f107; + st.shared.f32 [%rd42+1536], %f107; + ld.shared.f32 %f113, [%rd45+2048]; + add.ftz.f32 %f108, %f113, %f108; + st.shared.f32 [%rd42+2048], %f108; +$Lt_0_27650: + .loc 16 219 0 + shr.u32 %r66, %r66, 1; + mov.u32 %r69, 0; + setp.ne.u32 %p13, %r66, %r69; + @%p13 bra $Lt_0_27394; +$Lt_0_26882: + .loc 16 226 0 + mov.f32 %f36, %f104; + .loc 16 227 0 + mov.f32 %f35, %f105; + .loc 16 228 0 + mov.f32 %f34, %f106; + .loc 16 229 0 + mov.f32 %f38, %f107; + .loc 16 230 0 + mov.f32 %f37, %f108; + ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r71, 0; + setp.le.s32 %p14, %r70, %r71; + @%p14 bra $Lt_0_28418; + .loc 16 234 0 + mov.f32 %f104, %f10; + st.shared.f32 [%rd42+0], %f104; + mov.f32 %f105, %f12; + st.shared.f32 [%rd42+512], %f105; + mov.f32 %f106, %f14; + st.shared.f32 [%rd42+1024], %f106; + mov.f32 %f107, %f16; + st.shared.f32 [%rd42+1536], %f107; + mov.f32 %f108, %f18; + st.shared.f32 [%rd42+2048], %f108; + mov.f32 %f114, %f20; + st.shared.f32 [%rd42+2560], %f114; + .loc 16 236 0 + mov.s32 %r72, %r65; + @!%p11 bra $Lt_0_28930; +$Lt_0_29442: + setp.ge.u32 %p15, %r6, %r72; + @%p15 bra $Lt_0_29698; + .loc 16 239 0 + add.u32 %r73, %r2, %r72; + cvt.u64.u32 %rd46, %r73; + mul.wide.u32 %rd47, %r73, 4; + add.u64 %rd48, %rd39, %rd47; + ld.shared.f32 %f115, [%rd48+0]; + add.ftz.f32 %f104, %f115, %f104; + st.shared.f32 [%rd42+0], %f104; + ld.shared.f32 %f116, [%rd48+512]; + add.ftz.f32 %f105, %f116, %f105; + st.shared.f32 [%rd42+512], %f105; + ld.shared.f32 %f117, [%rd48+1024]; + add.ftz.f32 %f106, %f117, %f106; + st.shared.f32 [%rd42+1024], %f106; + ld.shared.f32 %f118, [%rd48+1536]; + add.ftz.f32 %f107, %f118, %f107; + st.shared.f32 [%rd42+1536], %f107; + ld.shared.f32 %f119, [%rd48+2048]; + add.ftz.f32 %f108, %f119, %f108; + st.shared.f32 [%rd42+2048], %f108; + ld.shared.f32 %f120, [%rd48+2560]; + add.ftz.f32 %f114, %f120, %f114; + st.shared.f32 [%rd42+2560], %f114; +$Lt_0_29698: + .loc 16 236 0 + shr.u32 %r72, %r72, 1; + mov.u32 %r74, 0; + setp.ne.u32 %p16, %r72, %r74; + @%p16 bra $Lt_0_29442; +$Lt_0_28930: + .loc 16 244 0 + mov.f32 %f10, %f104; + mov.f32 %f12, %f105; + mov.f32 %f14, %f106; + mov.f32 %f16, %f107; + mov.f32 %f18, %f108; + mov.f32 %f20, %f114; +$Lt_0_28418: +$Lt_0_26370: + selp.s32 %r75, 1, 0, %p1; + mov.s32 %r76, 0; + set.eq.u32.s32 %r77, %r6, %r76; + neg.s32 %r78, %r77; + and.b32 %r79, %r75, %r78; + mov.u32 %r80, 0; + setp.eq.s32 %p17, %r79, %r80; + @%p17 bra $Lt_0_30466; + .loc 16 250 0 + cvt.s64.s32 %rd49, %r9; + ld.param.u64 %rd50, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd51, %r9, 4; + add.u64 %rd52, %rd50, %rd51; + ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r82, 0; + setp.le.s32 %p18, %r81, %r82; + @%p18 bra $Lt_0_30978; + .loc 16 252 0 + st.global.f32 [%rd52+0], %f38; + .loc 16 253 0 + cvt.s64.s32 %rd53, %r10; + mul.wide.s32 %rd54, %r10, 4; + add.u64 %rd55, %rd54, %rd52; + .loc 16 254 0 + st.global.f32 [%rd55+0], %f37; + .loc 16 255 0 + add.u64 %rd52, %rd54, %rd55; +$Lt_0_30978: + ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r84, 0; + setp.le.s32 %p19, %r83, %r84; + @%p19 bra $Lt_0_31490; + .loc 16 259 0 + mov.f32 %f121, %f10; + st.global.f32 [%rd52+0], %f121; + .loc 16 260 0 + cvt.s64.s32 %rd56, %r10; + mul.wide.s32 %rd57, %r10, 4; + add.u64 %rd58, %rd57, %rd52; + .loc 16 259 0 + mov.f32 %f122, %f12; + 
st.global.f32 [%rd58+0], %f122; + .loc 16 260 0 + add.u64 %rd59, %rd57, %rd58; + .loc 16 259 0 + mov.f32 %f123, %f14; + st.global.f32 [%rd59+0], %f123; + .loc 16 260 0 + add.u64 %rd60, %rd57, %rd59; + .loc 16 259 0 + mov.f32 %f124, %f16; + st.global.f32 [%rd60+0], %f124; + .loc 16 260 0 + add.u64 %rd52, %rd57, %rd60; + .loc 16 259 0 + mov.f32 %f125, %f18; + st.global.f32 [%rd52+0], %f125; + mov.f32 %f126, %f20; + add.u64 %rd61, %rd57, %rd52; + st.global.f32 [%rd61+0], %f126; +$Lt_0_31490: + .loc 16 263 0 + ld.param.u64 %rd62, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd63, %rd49, 16; + add.u64 %rd64, %rd62, %rd63; + mov.f32 %f127, %f128; + st.global.v4.f32 [%rd64+0], {%f36,%f35,%f34,%f127}; +$Lt_0_30466: + .loc 16 265 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_fast_q_, + .param .u64 __cudaparm_kernel_pair_fast__cutsq, + .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<88>; + .reg .u64 %rd<82>; + .reg .f32 %f<134>; + .reg .pred %p<24>; + .shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_sp_lj3320[32]; + .shared .align 16 .b8 __cuda___cuda_local_var_32673_34_non_const_lj13360[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32675_33_non_const_cutsq5296[484]; + .shared .align 16 .b8 __cuda___cuda_local_var_32674_34_non_const_lj35792[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32783_35_non_const_red_acc7728[3072]; + // __cuda_local_var_32688_10_non_const_f = 48 + // __cuda_local_var_32692_9_non_const_virial = 16 + // __cuda_local_var_32742_43_non_const_r6inv = 40 + .loc 16 275 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 7; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_23554; + .loc 16 286 0 + mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_sp_lj3320; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_23554: + mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_sp_lj3320; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_24066; + .loc 16 288 0 + mov.u64 %rd7, __cuda___cuda_local_var_32673_34_non_const_lj13360; + mov.u64 %rd8, __cuda___cuda_local_var_32675_33_non_const_cutsq5296; + cvt.s64.s32 %rd9, %r1; + mul.wide.s32 %rd10, %r1, 16; + ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd12, %rd11, %rd10; + add.u64 %rd13, %rd10, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0]; + st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5}; + .loc 16 289 0 + mul.wide.s32 %rd14, %r1, 4; + ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast__cutsq]; + add.u64 %rd16, %rd15, %rd14; + ld.global.f32 %f6, [%rd16+0]; + add.u64 %rd17, %rd14, %rd8; + st.shared.f32 [%rd17+0], %f6; + ld.param.s32 
%r4, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r5, 0; + setp.le.s32 %p3, %r4, %r5; + @%p3 bra $Lt_1_24578; + .loc 16 291 0 + mov.u64 %rd18, __cuda___cuda_local_var_32674_34_non_const_lj35792; + ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd20, %rd19, %rd10; + add.u64 %rd21, %rd10, %rd18; + ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd20+0]; + st.shared.v4.f32 [%rd21+0], {%f7,%f8,%f9,%f10}; +$Lt_1_24578: + mov.u64 %rd18, __cuda___cuda_local_var_32674_34_non_const_lj35792; +$Lt_1_24066: + mov.u64 %rd18, __cuda___cuda_local_var_32674_34_non_const_lj35792; + mov.u64 %rd7, __cuda___cuda_local_var_32673_34_non_const_lj13360; + mov.u64 %rd8, __cuda___cuda_local_var_32675_33_non_const_cutsq5296; + .loc 16 302 0 + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + mov.f32 %f17, 0f00000000; // 0 + mov.f32 %f18, %f17; + mov.f32 %f19, 0f00000000; // 0 + mov.f32 %f20, %f19; + mov.f32 %f21, 0f00000000; // 0 + mov.f32 %f22, %f21; + .loc 16 304 0 + bar.sync 0; + ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r7, %r1, %r6; + cvt.s32.u32 %r8, %ntid.x; + div.s32 %r9, %r8, %r6; + rem.s32 %r10, %r1, %r6; + cvt.s32.u32 %r11, %ctaid.x; + mul.lo.s32 %r12, %r11, %r9; + add.s32 %r13, %r7, %r12; + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p4, %r13, %r14; + @!%p4 bra $Lt_1_25346; + .loc 16 308 0 + cvt.s64.s32 %rd22, %r13; + mul.wide.s32 %rd23, %r13, 4; + ld.param.u64 %rd24, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd25, %rd23, %rd24; + ld.global.s32 %r15, [%rd25+0]; + .loc 16 310 0 + ld.param.s32 %r16, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd26, %r16; + mul.wide.s32 %rd27, %r16, 4; + add.u64 %rd28, %rd27, %rd25; + ld.global.s32 %r17, [%rd28+0]; + add.u64 %rd29, %rd27, %rd28; + ld.param.u64 %rd30, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p5, %rd30, %rd24; + @%p5 bra $Lt_1_25858; + .loc 16 316 0 + cvt.s32.s64 %r18, %rd26; + mul.lo.s32 %r19, %r18, %r17; + cvt.s64.s32 %rd31, %r19; + mul.wide.s32 %rd32, %r19, 4; + add.u64 %rd33, %rd29, %rd32; + .loc 16 317 0 + mul.lo.s32 %r20, %r10, %r18; + cvt.s64.s32 %rd34, %r20; + mul.wide.s32 %rd35, %r20, 4; + add.u64 %rd36, %rd29, %rd35; + .loc 16 318 0 + mul.lo.s32 %r21, %r18, %r6; + bra.uni $Lt_1_25602; +$Lt_1_25858: + .loc 16 320 0 + ld.global.s32 %r22, [%rd29+0]; + cvt.s64.s32 %rd37, %r22; + mul.wide.s32 %rd38, %r22, 4; + add.u64 %rd39, %rd30, %rd38; + .loc 16 321 0 + cvt.s64.s32 %rd40, %r17; + mul.wide.s32 %rd41, %r17, 4; + add.u64 %rd33, %rd39, %rd41; + .loc 16 322 0 + mov.s32 %r21, %r6; + .loc 16 323 0 + cvt.s64.s32 %rd42, %r10; + mul.wide.s32 %rd43, %r10, 4; + add.u64 %rd36, %rd39, %rd43; +$Lt_1_25602: + .loc 16 326 0 + mov.u32 %r23, %r15; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + mov.s32 %r28, 0; + mov.u32 %r29, %r28; + tex.1d.v4.f32.s32 {%f23,%f24,%f25,%f26},[pos_tex,{%r23,%r25,%r27,%r29}]; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + mov.f32 %f29, %f25; + mov.f32 %f30, %f26; + .loc 16 327 0 + mov.u32 %r30, %r15; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + mov.s32 %r33, 0; + mov.u32 %r34, %r33; + mov.s32 %r35, 0; + mov.u32 %r36, %r35; + tex.1d.v4.f32.s32 {%f31,%f32,%f33,%f34},[q_tex,{%r30,%r32,%r34,%r36}]; + mov.f32 %f35, %f31; + setp.ge.u64 %p6, %rd36, %rd33; + @%p6 bra $Lt_1_35842; + cvt.rzi.ftz.s32.f32 %r37, %f30; + cvt.s64.s32 %rd44, %r21; + mul.lo.s32 %r38, %r37, 11; + cvt.rn.f32.s32 %f36, 
%r38; + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 + mov.f32 %f41, 0f00000000; // 0 +$Lt_1_26626: + //<loop> Loop body line 327, nesting depth: 1, estimated iterations: unknown + .loc 16 332 0 + ld.global.s32 %r39, [%rd36+0]; + .loc 16 335 0 + shr.s32 %r40, %r39, 30; + and.b32 %r41, %r40, 3; + cvt.s64.s32 %rd45, %r41; + mul.wide.s32 %rd46, %r41, 4; + add.u64 %rd47, %rd1, %rd46; + ld.shared.f32 %f42, [%rd47+0]; + .loc 16 339 0 + and.b32 %r42, %r39, 1073741823; + mov.u32 %r43, %r42; + mov.s32 %r44, 0; + mov.u32 %r45, %r44; + mov.s32 %r46, 0; + mov.u32 %r47, %r46; + mov.s32 %r48, 0; + mov.u32 %r49, %r48; + tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r43,%r45,%r47,%r49}]; + mov.f32 %f47, %f43; + mov.f32 %f48, %f44; + mov.f32 %f49, %f45; + mov.f32 %f50, %f46; + sub.ftz.f32 %f51, %f28, %f48; + sub.ftz.f32 %f52, %f27, %f47; + sub.ftz.f32 %f53, %f29, %f49; + mul.ftz.f32 %f54, %f51, %f51; + fma.rn.ftz.f32 %f55, %f52, %f52, %f54; + fma.rn.ftz.f32 %f56, %f53, %f53, %f55; + add.ftz.f32 %f57, %f36, %f50; + cvt.rzi.ftz.s32.f32 %r50, %f57; + cvt.s64.s32 %rd48, %r50; + mul.wide.s32 %rd49, %r50, 4; + add.u64 %rd50, %rd8, %rd49; + ld.shared.f32 %f58, [%rd50+0]; + setp.gt.ftz.f32 %p7, %f58, %f56; + @!%p7 bra $Lt_1_29442; + rcp.approx.ftz.f32 %f59, %f56; + mul.lo.u64 %rd51, %rd48, 16; + add.u64 %rd52, %rd51, %rd7; + ld.shared.f32 %f60, [%rd52+8]; + setp.lt.ftz.f32 %p8, %f56, %f60; + @!%p8 bra $Lt_1_27650; + .loc 16 353 0 + mul.ftz.f32 %f61, %f59, %f59; + mul.ftz.f32 %f62, %f59, %f61; + mov.f32 %f63, %f62; + .loc 16 354 0 + mul.ftz.f32 %f64, %f62, %f42; + ld.shared.v2.f32 {%f65,%f66}, [%rd52+0]; + mul.ftz.f32 %f67, %f65, %f62; + sub.ftz.f32 %f68, %f67, %f66; + mul.ftz.f32 %f69, %f64, %f68; + bra.uni $Lt_1_27394; +$Lt_1_27650: + .loc 16 356 0 + mov.f32 %f69, 0f00000000; // 0 +$Lt_1_27394: + ld.shared.f32 %f70, [%rd52+12]; + setp.gt.ftz.f32 %p9, %f70, %f56; + @!%p9 bra $Lt_1_28162; + .loc 16 359 0 + mov.u32 %r51, %r42; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + mov.s32 %r56, 0; + mov.u32 %r57, %r56; + tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r51,%r53,%r55,%r57}]; + mov.f32 %f75, %f71; + ld.shared.f32 %f76, [%rd47+16]; + ld.param.f32 %f77, [__cudaparm_kernel_pair_fast_qqrd2e]; + mul.ftz.f32 %f78, %f77, %f35; + mul.ftz.f32 %f79, %f75, %f78; + sqrt.approx.ftz.f32 %f80, %f59; + mul.ftz.f32 %f81, %f79, %f80; + mul.ftz.f32 %f82, %f76, %f81; + bra.uni $Lt_1_27906; +$Lt_1_28162: + .loc 16 361 0 + mov.f32 %f82, 0f00000000; // 0 +$Lt_1_27906: + .loc 16 365 0 + add.ftz.f32 %f83, %f82, %f69; + mul.ftz.f32 %f84, %f83, %f59; + fma.rn.ftz.f32 %f39, %f52, %f84, %f39; + .loc 16 366 0 + fma.rn.ftz.f32 %f38, %f51, %f84, %f38; + .loc 16 367 0 + fma.rn.ftz.f32 %f37, %f53, %f84, %f37; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r59, 0; + setp.le.s32 %p10, %r58, %r59; + @%p10 bra $Lt_1_28930; + .loc 16 370 0 + add.ftz.f32 %f40, %f82, %f40; + @!%p8 bra $Lt_1_28930; + .loc 16 372 0 + add.u64 %rd53, %rd51, %rd18; + mov.f32 %f85, %f63; + ld.shared.v4.f32 {%f86,%f87,%f88,_}, [%rd53+0]; + mul.ftz.f32 %f89, %f86, %f85; + sub.ftz.f32 %f90, %f89, %f87; + mul.ftz.f32 %f91, %f85, %f90; + .loc 16 373 0 + sub.ftz.f32 %f92, %f91, %f88; + fma.rn.ftz.f32 %f41, %f42, %f92, %f41; +$Lt_1_28930: +$Lt_1_28418: + ld.param.s32 %r60, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r61, 0; + setp.le.s32 %p11, %r60, %r61; + @%p11 bra $Lt_1_29442; + .loc 16 377 0 + mov.f32 %f93, %f12; + 
mul.ftz.f32 %f94, %f52, %f52; + fma.rn.ftz.f32 %f95, %f84, %f94, %f93; + mov.f32 %f12, %f95; + .loc 16 378 0 + mov.f32 %f96, %f14; + fma.rn.ftz.f32 %f97, %f84, %f54, %f96; + mov.f32 %f14, %f97; + .loc 16 379 0 + mov.f32 %f98, %f16; + mul.ftz.f32 %f99, %f53, %f53; + fma.rn.ftz.f32 %f100, %f84, %f99, %f98; + mov.f32 %f16, %f100; + .loc 16 380 0 + mov.f32 %f101, %f18; + mul.ftz.f32 %f102, %f51, %f52; + fma.rn.ftz.f32 %f103, %f84, %f102, %f101; + mov.f32 %f18, %f103; + .loc 16 381 0 + mov.f32 %f104, %f20; + mul.ftz.f32 %f105, %f52, %f53; + fma.rn.ftz.f32 %f106, %f84, %f105, %f104; + mov.f32 %f20, %f106; + .loc 16 382 0 + mul.ftz.f32 %f107, %f51, %f53; + fma.rn.ftz.f32 %f21, %f84, %f107, %f21; + mov.f32 %f22, %f21; +$Lt_1_29442: +$Lt_1_26882: + .loc 16 331 0 + mul.lo.u64 %rd54, %rd44, 4; + add.u64 %rd36, %rd36, %rd54; + setp.lt.u64 %p12, %rd36, %rd33; + @%p12 bra $Lt_1_26626; + bra.uni $Lt_1_25090; +$Lt_1_35842: + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 + mov.f32 %f41, 0f00000000; // 0 + bra.uni $Lt_1_25090; +$Lt_1_25346: + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 + mov.f32 %f41, 0f00000000; // 0 +$Lt_1_25090: + mov.u32 %r62, 1; + setp.le.s32 %p13, %r6, %r62; + @%p13 bra $Lt_1_32258; + .loc 16 393 0 + mov.u64 %rd55, __cuda___cuda_local_var_32783_35_non_const_red_acc7728; + cvt.s64.s32 %rd56, %r1; + mul.wide.s32 %rd57, %r1, 4; + add.u64 %rd58, %rd55, %rd57; + mov.f32 %f108, %f39; + st.shared.f32 [%rd58+0], %f108; + .loc 16 394 0 + mov.f32 %f109, %f38; + st.shared.f32 [%rd58+512], %f109; + .loc 16 395 0 + mov.f32 %f110, %f37; + st.shared.f32 [%rd58+1024], %f110; + .loc 16 396 0 + mov.f32 %f111, %f41; + st.shared.f32 [%rd58+1536], %f111; + .loc 16 397 0 + mov.f32 %f112, %f40; + st.shared.f32 [%rd58+2048], %f112; + .loc 16 399 0 + shr.s32 %r63, %r6, 31; + mov.s32 %r64, 1; + and.b32 %r65, %r63, %r64; + add.s32 %r66, %r65, %r6; + shr.s32 %r67, %r66, 1; + mov.s32 %r68, %r67; + mov.u32 %r69, 0; + setp.ne.u32 %p14, %r67, %r69; + @!%p14 bra $Lt_1_30722; +$Lt_1_31234: + setp.ge.u32 %p15, %r10, %r68; + @%p15 bra $Lt_1_31490; + .loc 16 402 0 + add.u32 %r70, %r1, %r68; + cvt.u64.u32 %rd59, %r70; + mul.wide.u32 %rd60, %r70, 4; + add.u64 %rd61, %rd55, %rd60; + ld.shared.f32 %f113, [%rd61+0]; + add.ftz.f32 %f108, %f113, %f108; + st.shared.f32 [%rd58+0], %f108; + ld.shared.f32 %f114, [%rd61+512]; + add.ftz.f32 %f109, %f114, %f109; + st.shared.f32 [%rd58+512], %f109; + ld.shared.f32 %f115, [%rd61+1024]; + add.ftz.f32 %f110, %f115, %f110; + st.shared.f32 [%rd58+1024], %f110; + ld.shared.f32 %f116, [%rd61+1536]; + add.ftz.f32 %f111, %f116, %f111; + st.shared.f32 [%rd58+1536], %f111; + ld.shared.f32 %f117, [%rd61+2048]; + add.ftz.f32 %f112, %f117, %f112; + st.shared.f32 [%rd58+2048], %f112; +$Lt_1_31490: + .loc 16 399 0 + shr.u32 %r68, %r68, 1; + mov.u32 %r71, 0; + setp.ne.u32 %p16, %r68, %r71; + @%p16 bra $Lt_1_31234; +$Lt_1_30722: + .loc 16 406 0 + mov.f32 %f39, %f108; + .loc 16 407 0 + mov.f32 %f38, %f109; + .loc 16 408 0 + mov.f32 %f37, %f110; + .loc 16 409 0 + mov.f32 %f41, %f111; + .loc 16 410 0 + mov.f32 %f40, %f112; + ld.param.s32 %r72, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r73, 0; + setp.le.s32 %p17, %r72, %r73; + @%p17 bra $Lt_1_32258; + .loc 16 414 0 + mov.f32 %f108, %f12; + st.shared.f32 [%rd58+0], %f108; + mov.f32 %f109, %f14; + st.shared.f32 [%rd58+512], %f109; + mov.f32 %f110, %f16; + st.shared.f32 [%rd58+1024], 
%f110; + mov.f32 %f111, %f18; + st.shared.f32 [%rd58+1536], %f111; + mov.f32 %f112, %f20; + st.shared.f32 [%rd58+2048], %f112; + mov.f32 %f118, %f22; + st.shared.f32 [%rd58+2560], %f118; + .loc 16 416 0 + mov.s32 %r74, %r67; + @!%p14 bra $Lt_1_32770; +$Lt_1_33282: + setp.ge.u32 %p18, %r10, %r74; + @%p18 bra $Lt_1_33538; + .loc 16 419 0 + add.u32 %r75, %r1, %r74; + cvt.u64.u32 %rd62, %r75; + mul.wide.u32 %rd63, %r75, 4; + add.u64 %rd64, %rd55, %rd63; + ld.shared.f32 %f119, [%rd64+0]; + add.ftz.f32 %f108, %f119, %f108; + st.shared.f32 [%rd58+0], %f108; + ld.shared.f32 %f120, [%rd64+512]; + add.ftz.f32 %f109, %f120, %f109; + st.shared.f32 [%rd58+512], %f109; + ld.shared.f32 %f121, [%rd64+1024]; + add.ftz.f32 %f110, %f121, %f110; + st.shared.f32 [%rd58+1024], %f110; + ld.shared.f32 %f122, [%rd64+1536]; + add.ftz.f32 %f111, %f122, %f111; + st.shared.f32 [%rd58+1536], %f111; + ld.shared.f32 %f123, [%rd64+2048]; + add.ftz.f32 %f112, %f123, %f112; + st.shared.f32 [%rd58+2048], %f112; + ld.shared.f32 %f124, [%rd64+2560]; + add.ftz.f32 %f118, %f124, %f118; + st.shared.f32 [%rd58+2560], %f118; +$Lt_1_33538: + .loc 16 416 0 + shr.u32 %r74, %r74, 1; + mov.u32 %r76, 0; + setp.ne.u32 %p19, %r74, %r76; + @%p19 bra $Lt_1_33282; +$Lt_1_32770: + .loc 16 424 0 + mov.f32 %f12, %f108; + mov.f32 %f14, %f109; + mov.f32 %f16, %f110; + mov.f32 %f18, %f111; + mov.f32 %f20, %f112; + mov.f32 %f22, %f118; +$Lt_1_32258: +$Lt_1_30210: + selp.s32 %r77, 1, 0, %p4; + mov.s32 %r78, 0; + set.eq.u32.s32 %r79, %r10, %r78; + neg.s32 %r80, %r79; + and.b32 %r81, %r77, %r80; + mov.u32 %r82, 0; + setp.eq.s32 %p20, %r81, %r82; + @%p20 bra $Lt_1_34306; + .loc 16 430 0 + cvt.s64.s32 %rd65, %r13; + ld.param.u64 %rd66, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd67, %r13, 4; + add.u64 %rd68, %rd66, %rd67; + ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r84, 0; + setp.le.s32 %p21, %r83, %r84; + @%p21 bra $Lt_1_34818; + .loc 16 432 0 + st.global.f32 [%rd68+0], %f41; + .loc 16 433 0 + cvt.s64.s32 %rd69, %r14; + mul.wide.s32 %rd70, %r14, 4; + add.u64 %rd71, %rd70, %rd68; + .loc 16 434 0 + st.global.f32 [%rd71+0], %f40; + .loc 16 435 0 + add.u64 %rd68, %rd70, %rd71; +$Lt_1_34818: + ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r86, 0; + setp.le.s32 %p22, %r85, %r86; + @%p22 bra $Lt_1_35330; + .loc 16 439 0 + mov.f32 %f125, %f12; + st.global.f32 [%rd68+0], %f125; + .loc 16 440 0 + cvt.s64.s32 %rd72, %r14; + mul.wide.s32 %rd73, %r14, 4; + add.u64 %rd74, %rd73, %rd68; + .loc 16 439 0 + mov.f32 %f126, %f14; + st.global.f32 [%rd74+0], %f126; + .loc 16 440 0 + add.u64 %rd75, %rd73, %rd74; + .loc 16 439 0 + mov.f32 %f127, %f16; + st.global.f32 [%rd75+0], %f127; + .loc 16 440 0 + add.u64 %rd76, %rd73, %rd75; + .loc 16 439 0 + mov.f32 %f128, %f18; + st.global.f32 [%rd76+0], %f128; + .loc 16 440 0 + add.u64 %rd68, %rd73, %rd76; + .loc 16 439 0 + mov.f32 %f129, %f20; + st.global.f32 [%rd68+0], %f129; + mov.f32 %f130, %f22; + add.u64 %rd77, %rd73, %rd68; + st.global.f32 [%rd77+0], %f130; +$Lt_1_35330: + .loc 16 443 0 + ld.param.u64 %rd78, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd79, %rd65, 16; + add.u64 %rd80, %rd78, %rd79; + mov.f32 %f131, %f132; + st.global.v4.f32 [%rd80+0], {%f39,%f38,%f37,%f131}; +$Lt_1_34306: + .loc 16 445 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/ljc_cut_gpu_ptx.h b/lib/gpu/ljc_cut_gpu_ptx.h new file mode 100644 index 000000000..d962ac328 --- /dev/null +++ b/lib/gpu/ljc_cut_gpu_ptx.h @@ -0,0 +1,1092 @@ +const char * 
ljc_cut_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_q_,\n" +" .param .u64 __cudaparm_kernel_pair_cutsq,\n" +" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<86>;\n" +" .reg .u64 %rd<66>;\n" +" .reg .f32 %f<130>;\n" +" .reg .pred %p<21>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32603_35_non_const_red_acc144[3072];\n" +" .loc 16 100 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 107 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 108 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 109 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 110 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 111 0\n" +" ld.global.f32 %f5, [%rd1+16];\n" +" .loc 16 112 0\n" +" ld.global.f32 %f6, [%rd1+20];\n" +" .loc 16 113 0\n" +" ld.global.f32 %f7, [%rd1+24];\n" +" .loc 16 114 0\n" +" ld.global.f32 %f8, [%rd1+28];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" +" .loc 16 124 0\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" mov.f32 %f17, 0f00000000; \n" +" mov.f32 %f18, %f17;\n" +" mov.f32 %f19, 0f00000000; \n" +" mov.f32 %f20, %f19;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_21506;\n" +" .loc 16 128 0\n" +" cvt.s64.s32 %rd2, %r9;\n" +" mul.wide.s32 %rd3, %r9, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd5, %rd3, %rd4;\n" +" ld.global.s32 %r11, [%rd5+0];\n" +" .loc 16 130 0\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd6, %r12;\n" +" mul.wide.s32 %rd7, %r12, 4;\n" +" add.u64 %rd8, %rd7, %rd5;\n" +" ld.global.s32 %r13, [%rd8+0];\n" +" add.u64 %rd9, %rd7, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd4;\n" +" @%p2 bra $Lt_0_22018;\n" +" .loc 16 136 0\n" +" cvt.s32.s64 %r14, %rd6;\n" +" mul.lo.s32 %r15, %r14, %r13;\n" +" cvt.s64.s32 %rd11, %r15;\n" +" mul.wide.s32 %rd12, %r15, 
4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 137 0\n" +" mul.lo.s32 %r16, %r6, %r14;\n" +" cvt.s64.s32 %rd14, %r16;\n" +" mul.wide.s32 %rd15, %r16, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 138 0\n" +" mul.lo.s32 %r17, %r14, %r1;\n" +" bra.uni $Lt_0_21762;\n" +"$Lt_0_22018:\n" +" .loc 16 140 0\n" +" ld.global.s32 %r18, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r18;\n" +" mul.wide.s32 %rd18, %r18, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 141 0\n" +" cvt.s64.s32 %rd20, %r13;\n" +" mul.wide.s32 %rd21, %r13, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 142 0\n" +" mov.s32 %r17, %r1;\n" +" .loc 16 143 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_21762:\n" +" .loc 16 146 0\n" +" mov.u32 %r19, %r11;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f25, %f21;\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" .loc 16 147 0\n" +" mov.u32 %r26, %r11;\n" +" mov.s32 %r27, 0;\n" +" mov.u32 %r28, %r27;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}];\n" +" mov.f32 %f33, %f29;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_32002;\n" +" cvt.rzi.ftz.s32.f32 %r33, %f28;\n" +" cvt.s64.s32 %rd24, %r17;\n" +" ld.param.s32 %r34, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r35, %r34, %r33;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_cutsq];\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112;\n" +"$Lt_0_22786:\n" +" .loc 16 151 0\n" +" ld.global.s32 %r36, [%rd16+0];\n" +" .loc 16 154 0\n" +" shr.s32 %r37, %r36, 30;\n" +" and.b32 %r38, %r37, 3;\n" +" cvt.s64.s32 %rd27, %r38;\n" +" mul.wide.s32 %rd28, %r38, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f39, [%rd29+0];\n" +" .loc 16 158 0\n" +" and.b32 %r39, %r36, 1073741823;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" mov.s32 %r43, 0;\n" +" mov.u32 %r44, %r43;\n" +" mov.s32 %r45, 0;\n" +" mov.u32 %r46, %r45;\n" +" tex.1d.v4.f32.s32 {%f40,%f41,%f42,%f43},[pos_tex,{%r40,%r42,%r44,%r46}];\n" +" mov.f32 %f44, %f40;\n" +" mov.f32 %f45, %f41;\n" +" mov.f32 %f46, %f42;\n" +" mov.f32 %f47, %f43;\n" +" cvt.rzi.ftz.s32.f32 %r47, %f47;\n" +" sub.ftz.f32 %f48, %f26, %f45;\n" +" sub.ftz.f32 %f49, %f25, %f44;\n" +" sub.ftz.f32 %f50, %f27, %f46;\n" +" mul.ftz.f32 %f51, %f48, %f48;\n" +" fma.rn.ftz.f32 %f52, %f49, %f49, %f51;\n" +" add.s32 %r48, %r47, %r35;\n" +" cvt.s64.s32 %rd30, %r48;\n" +" fma.rn.ftz.f32 %f53, %f50, %f50, %f52;\n" +" mul.wide.s32 %rd31, %r48, 4;\n" +" add.u64 %rd32, %rd25, %rd31;\n" +" ld.global.f32 %f54, [%rd32+0];\n" +" setp.gt.ftz.f32 %p4, %f54, %f53;\n" +" @!%p4 bra $Lt_0_25602;\n" +" mul.lo.u64 %rd33, %rd30, 16;\n" +" rcp.approx.ftz.f32 %f55, %f53;\n" +" ld.param.u64 %rd34, [__cudaparm_kernel_pair_lj1];\n" +" add.u64 %rd35, %rd34, %rd33;\n" +" ld.global.f32 %f56, [%rd35+8];\n" +" setp.lt.ftz.f32 %p5, %f53, %f56;\n" +" @!%p5 bra $Lt_0_23810;\n" +" .loc 16 173 0\n" +" mul.ftz.f32 %f57, %f55, %f55;\n" +" mul.ftz.f32 %f58, %f55, %f57;\n" +" mov.f32 %f59, %f58;\n" 
+" .loc 16 174 0\n" +" mul.ftz.f32 %f60, %f58, %f39;\n" +" ld.global.v2.f32 {%f61,%f62}, [%rd35+0];\n" +" mul.ftz.f32 %f63, %f61, %f58;\n" +" sub.ftz.f32 %f64, %f63, %f62;\n" +" mul.ftz.f32 %f65, %f60, %f64;\n" +" bra.uni $Lt_0_23554;\n" +"$Lt_0_23810:\n" +" .loc 16 176 0\n" +" mov.f32 %f65, 0f00000000; \n" +"$Lt_0_23554:\n" +" ld.global.f32 %f66, [%rd35+12];\n" +" setp.gt.ftz.f32 %p6, %f66, %f53;\n" +" @!%p6 bra $Lt_0_24322;\n" +" .loc 16 179 0\n" +" mov.u32 %r49, %r39;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" tex.1d.v4.f32.s32 {%f67,%f68,%f69,%f70},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f71, %f67;\n" +" ld.shared.f32 %f72, [%rd29+16];\n" +" ld.param.f32 %f73, [__cudaparm_kernel_pair_qqrd2e];\n" +" mul.ftz.f32 %f74, %f73, %f33;\n" +" mul.ftz.f32 %f75, %f71, %f74;\n" +" sqrt.approx.ftz.f32 %f76, %f55;\n" +" mul.ftz.f32 %f77, %f75, %f76;\n" +" mul.ftz.f32 %f78, %f72, %f77;\n" +" bra.uni $Lt_0_24066;\n" +"$Lt_0_24322:\n" +" .loc 16 181 0\n" +" mov.f32 %f78, 0f00000000; \n" +"$Lt_0_24066:\n" +" .loc 16 185 0\n" +" add.ftz.f32 %f79, %f78, %f65;\n" +" mul.ftz.f32 %f80, %f79, %f55;\n" +" fma.rn.ftz.f32 %f36, %f49, %f80, %f36;\n" +" .loc 16 186 0\n" +" fma.rn.ftz.f32 %f35, %f48, %f80, %f35;\n" +" .loc 16 187 0\n" +" fma.rn.ftz.f32 %f34, %f50, %f80, %f34;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p7, %r56, %r57;\n" +" @%p7 bra $Lt_0_25090;\n" +" .loc 16 190 0\n" +" add.ftz.f32 %f37, %f78, %f37;\n" +" @!%p5 bra $Lt_0_25090;\n" +" .loc 16 193 0\n" +" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd37, %rd36, %rd33;\n" +" mov.f32 %f81, %f59;\n" +" ld.global.v4.f32 {%f82,%f83,%f84,_}, [%rd37+0];\n" +" mul.ftz.f32 %f85, %f82, %f81;\n" +" sub.ftz.f32 %f86, %f85, %f83;\n" +" mul.ftz.f32 %f87, %f81, %f86;\n" +" sub.ftz.f32 %f88, %f87, %f84;\n" +" fma.rn.ftz.f32 %f38, %f39, %f88, %f38;\n" +"$Lt_0_25090:\n" +"$Lt_0_24578:\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p8, %r58, %r59;\n" +" @%p8 bra $Lt_0_25602;\n" +" .loc 16 197 0\n" +" mov.f32 %f89, %f10;\n" +" mul.ftz.f32 %f90, %f49, %f49;\n" +" fma.rn.ftz.f32 %f91, %f80, %f90, %f89;\n" +" mov.f32 %f10, %f91;\n" +" .loc 16 198 0\n" +" mov.f32 %f92, %f12;\n" +" fma.rn.ftz.f32 %f93, %f80, %f51, %f92;\n" +" mov.f32 %f12, %f93;\n" +" .loc 16 199 0\n" +" mov.f32 %f94, %f14;\n" +" mul.ftz.f32 %f95, %f50, %f50;\n" +" fma.rn.ftz.f32 %f96, %f80, %f95, %f94;\n" +" mov.f32 %f14, %f96;\n" +" .loc 16 200 0\n" +" mov.f32 %f97, %f16;\n" +" mul.ftz.f32 %f98, %f48, %f49;\n" +" fma.rn.ftz.f32 %f99, %f80, %f98, %f97;\n" +" mov.f32 %f16, %f99;\n" +" .loc 16 201 0\n" +" mov.f32 %f100, %f18;\n" +" mul.ftz.f32 %f101, %f49, %f50;\n" +" fma.rn.ftz.f32 %f102, %f80, %f101, %f100;\n" +" mov.f32 %f18, %f102;\n" +" .loc 16 202 0\n" +" mul.ftz.f32 %f103, %f48, %f50;\n" +" fma.rn.ftz.f32 %f19, %f80, %f103, %f19;\n" +" mov.f32 %f20, %f19;\n" +"$Lt_0_25602:\n" +"$Lt_0_23042:\n" +" .loc 16 150 0\n" +" mul.lo.u64 %rd38, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd38;\n" +" setp.lt.u64 %p9, %rd16, %rd13;\n" +" @%p9 bra $Lt_0_22786;\n" +" bra.uni $Lt_0_21250;\n" +"$Lt_0_32002:\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" bra.uni $Lt_0_21250;\n" +"$Lt_0_21506:\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" 
mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +"$Lt_0_21250:\n" +" mov.u32 %r60, 1;\n" +" setp.le.s32 %p10, %r1, %r60;\n" +" @%p10 bra $Lt_0_28418;\n" +" .loc 16 213 0\n" +" mov.u64 %rd39, __cuda___cuda_local_var_32603_35_non_const_red_acc144;\n" +" cvt.s64.s32 %rd40, %r2;\n" +" mul.wide.s32 %rd41, %r2, 4;\n" +" add.u64 %rd42, %rd39, %rd41;\n" +" mov.f32 %f104, %f36;\n" +" st.shared.f32 [%rd42+0], %f104;\n" +" .loc 16 214 0\n" +" mov.f32 %f105, %f35;\n" +" st.shared.f32 [%rd42+512], %f105;\n" +" .loc 16 215 0\n" +" mov.f32 %f106, %f34;\n" +" st.shared.f32 [%rd42+1024], %f106;\n" +" .loc 16 216 0\n" +" mov.f32 %f107, %f38;\n" +" st.shared.f32 [%rd42+1536], %f107;\n" +" .loc 16 217 0\n" +" mov.f32 %f108, %f37;\n" +" st.shared.f32 [%rd42+2048], %f108;\n" +" .loc 16 219 0\n" +" shr.s32 %r61, %r1, 31;\n" +" mov.s32 %r62, 1;\n" +" and.b32 %r63, %r61, %r62;\n" +" add.s32 %r64, %r63, %r1;\n" +" shr.s32 %r65, %r64, 1;\n" +" mov.s32 %r66, %r65;\n" +" mov.u32 %r67, 0;\n" +" setp.ne.u32 %p11, %r65, %r67;\n" +" @!%p11 bra $Lt_0_26882;\n" +"$Lt_0_27394:\n" +" setp.ge.u32 %p12, %r6, %r66;\n" +" @%p12 bra $Lt_0_27650;\n" +" .loc 16 222 0\n" +" add.u32 %r68, %r2, %r66;\n" +" cvt.u64.u32 %rd43, %r68;\n" +" mul.wide.u32 %rd44, %r68, 4;\n" +" add.u64 %rd45, %rd39, %rd44;\n" +" ld.shared.f32 %f109, [%rd45+0];\n" +" add.ftz.f32 %f104, %f109, %f104;\n" +" st.shared.f32 [%rd42+0], %f104;\n" +" ld.shared.f32 %f110, [%rd45+512];\n" +" add.ftz.f32 %f105, %f110, %f105;\n" +" st.shared.f32 [%rd42+512], %f105;\n" +" ld.shared.f32 %f111, [%rd45+1024];\n" +" add.ftz.f32 %f106, %f111, %f106;\n" +" st.shared.f32 [%rd42+1024], %f106;\n" +" ld.shared.f32 %f112, [%rd45+1536];\n" +" add.ftz.f32 %f107, %f112, %f107;\n" +" st.shared.f32 [%rd42+1536], %f107;\n" +" ld.shared.f32 %f113, [%rd45+2048];\n" +" add.ftz.f32 %f108, %f113, %f108;\n" +" st.shared.f32 [%rd42+2048], %f108;\n" +"$Lt_0_27650:\n" +" .loc 16 219 0\n" +" shr.u32 %r66, %r66, 1;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p13, %r66, %r69;\n" +" @%p13 bra $Lt_0_27394;\n" +"$Lt_0_26882:\n" +" .loc 16 226 0\n" +" mov.f32 %f36, %f104;\n" +" .loc 16 227 0\n" +" mov.f32 %f35, %f105;\n" +" .loc 16 228 0\n" +" mov.f32 %f34, %f106;\n" +" .loc 16 229 0\n" +" mov.f32 %f38, %f107;\n" +" .loc 16 230 0\n" +" mov.f32 %f37, %f108;\n" +" ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r71, 0;\n" +" setp.le.s32 %p14, %r70, %r71;\n" +" @%p14 bra $Lt_0_28418;\n" +" .loc 16 234 0\n" +" mov.f32 %f104, %f10;\n" +" st.shared.f32 [%rd42+0], %f104;\n" +" mov.f32 %f105, %f12;\n" +" st.shared.f32 [%rd42+512], %f105;\n" +" mov.f32 %f106, %f14;\n" +" st.shared.f32 [%rd42+1024], %f106;\n" +" mov.f32 %f107, %f16;\n" +" st.shared.f32 [%rd42+1536], %f107;\n" +" mov.f32 %f108, %f18;\n" +" st.shared.f32 [%rd42+2048], %f108;\n" +" mov.f32 %f114, %f20;\n" +" st.shared.f32 [%rd42+2560], %f114;\n" +" .loc 16 236 0\n" +" mov.s32 %r72, %r65;\n" +" @!%p11 bra $Lt_0_28930;\n" +"$Lt_0_29442:\n" +" setp.ge.u32 %p15, %r6, %r72;\n" +" @%p15 bra $Lt_0_29698;\n" +" .loc 16 239 0\n" +" add.u32 %r73, %r2, %r72;\n" +" cvt.u64.u32 %rd46, %r73;\n" +" mul.wide.u32 %rd47, %r73, 4;\n" +" add.u64 %rd48, %rd39, %rd47;\n" +" ld.shared.f32 %f115, [%rd48+0];\n" +" add.ftz.f32 %f104, %f115, %f104;\n" +" st.shared.f32 [%rd42+0], %f104;\n" +" ld.shared.f32 %f116, [%rd48+512];\n" +" add.ftz.f32 %f105, %f116, %f105;\n" +" st.shared.f32 [%rd42+512], %f105;\n" +" ld.shared.f32 %f117, [%rd48+1024];\n" +" add.ftz.f32 %f106, %f117, %f106;\n" +" st.shared.f32 [%rd42+1024], 
%f106;\n" +" ld.shared.f32 %f118, [%rd48+1536];\n" +" add.ftz.f32 %f107, %f118, %f107;\n" +" st.shared.f32 [%rd42+1536], %f107;\n" +" ld.shared.f32 %f119, [%rd48+2048];\n" +" add.ftz.f32 %f108, %f119, %f108;\n" +" st.shared.f32 [%rd42+2048], %f108;\n" +" ld.shared.f32 %f120, [%rd48+2560];\n" +" add.ftz.f32 %f114, %f120, %f114;\n" +" st.shared.f32 [%rd42+2560], %f114;\n" +"$Lt_0_29698:\n" +" .loc 16 236 0\n" +" shr.u32 %r72, %r72, 1;\n" +" mov.u32 %r74, 0;\n" +" setp.ne.u32 %p16, %r72, %r74;\n" +" @%p16 bra $Lt_0_29442;\n" +"$Lt_0_28930:\n" +" .loc 16 244 0\n" +" mov.f32 %f10, %f104;\n" +" mov.f32 %f12, %f105;\n" +" mov.f32 %f14, %f106;\n" +" mov.f32 %f16, %f107;\n" +" mov.f32 %f18, %f108;\n" +" mov.f32 %f20, %f114;\n" +"$Lt_0_28418:\n" +"$Lt_0_26370:\n" +" selp.s32 %r75, 1, 0, %p1;\n" +" mov.s32 %r76, 0;\n" +" set.eq.u32.s32 %r77, %r6, %r76;\n" +" neg.s32 %r78, %r77;\n" +" and.b32 %r79, %r75, %r78;\n" +" mov.u32 %r80, 0;\n" +" setp.eq.s32 %p17, %r79, %r80;\n" +" @%p17 bra $Lt_0_30466;\n" +" .loc 16 250 0\n" +" cvt.s64.s32 %rd49, %r9;\n" +" ld.param.u64 %rd50, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd51, %r9, 4;\n" +" add.u64 %rd52, %rd50, %rd51;\n" +" ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r82, 0;\n" +" setp.le.s32 %p18, %r81, %r82;\n" +" @%p18 bra $Lt_0_30978;\n" +" .loc 16 252 0\n" +" st.global.f32 [%rd52+0], %f38;\n" +" .loc 16 253 0\n" +" cvt.s64.s32 %rd53, %r10;\n" +" mul.wide.s32 %rd54, %r10, 4;\n" +" add.u64 %rd55, %rd54, %rd52;\n" +" .loc 16 254 0\n" +" st.global.f32 [%rd55+0], %f37;\n" +" .loc 16 255 0\n" +" add.u64 %rd52, %rd54, %rd55;\n" +"$Lt_0_30978:\n" +" ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r84, 0;\n" +" setp.le.s32 %p19, %r83, %r84;\n" +" @%p19 bra $Lt_0_31490;\n" +" .loc 16 259 0\n" +" mov.f32 %f121, %f10;\n" +" st.global.f32 [%rd52+0], %f121;\n" +" .loc 16 260 0\n" +" cvt.s64.s32 %rd56, %r10;\n" +" mul.wide.s32 %rd57, %r10, 4;\n" +" add.u64 %rd58, %rd57, %rd52;\n" +" .loc 16 259 0\n" +" mov.f32 %f122, %f12;\n" +" st.global.f32 [%rd58+0], %f122;\n" +" .loc 16 260 0\n" +" add.u64 %rd59, %rd57, %rd58;\n" +" .loc 16 259 0\n" +" mov.f32 %f123, %f14;\n" +" st.global.f32 [%rd59+0], %f123;\n" +" .loc 16 260 0\n" +" add.u64 %rd60, %rd57, %rd59;\n" +" .loc 16 259 0\n" +" mov.f32 %f124, %f16;\n" +" st.global.f32 [%rd60+0], %f124;\n" +" .loc 16 260 0\n" +" add.u64 %rd52, %rd57, %rd60;\n" +" .loc 16 259 0\n" +" mov.f32 %f125, %f18;\n" +" st.global.f32 [%rd52+0], %f125;\n" +" mov.f32 %f126, %f20;\n" +" add.u64 %rd61, %rd57, %rd52;\n" +" st.global.f32 [%rd61+0], %f126;\n" +"$Lt_0_31490:\n" +" .loc 16 263 0\n" +" ld.param.u64 %rd62, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd63, %rd49, 16;\n" +" add.u64 %rd64, %rd62, %rd63;\n" +" mov.f32 %f127, %f128;\n" +" st.global.v4.f32 [%rd64+0], {%f36,%f35,%f34,%f127};\n" +"$Lt_0_30466:\n" +" .loc 16 265 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param 
.s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" +" .param .u64 __cudaparm_kernel_pair_fast__cutsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<88>;\n" +" .reg .u64 %rd<82>;\n" +" .reg .f32 %f<134>;\n" +" .reg .pred %p<24>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_sp_lj3320[32];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32673_34_non_const_lj13360[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32675_33_non_const_cutsq5296[484];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32674_34_non_const_lj35792[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32783_35_non_const_red_acc7728[3072];\n" +" .loc 16 275 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 7;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_23554;\n" +" .loc 16 286 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_sp_lj3320;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_23554:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_sp_lj3320;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_24066;\n" +" .loc 16 288 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32673_34_non_const_lj13360;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32675_33_non_const_cutsq5296;\n" +" cvt.s64.s32 %rd9, %r1;\n" +" mul.wide.s32 %rd10, %r1, 16;\n" +" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" add.u64 %rd12, %rd11, %rd10;\n" +" add.u64 %rd13, %rd10, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n" +" st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5};\n" +" .loc 16 289 0\n" +" mul.wide.s32 %rd14, %r1, 4;\n" +" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast__cutsq];\n" +" add.u64 %rd16, %rd15, %rd14;\n" +" ld.global.f32 %f6, [%rd16+0];\n" +" add.u64 %rd17, %rd14, %rd8;\n" +" st.shared.f32 [%rd17+0], %f6;\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r5, 0;\n" +" setp.le.s32 %p3, %r4, %r5;\n" +" @%p3 bra $Lt_1_24578;\n" +" .loc 16 291 0\n" +" mov.u64 %rd18, __cuda___cuda_local_var_32674_34_non_const_lj35792;\n" +" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd20, %rd19, %rd10;\n" +" add.u64 %rd21, %rd10, %rd18;\n" +" ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd20+0];\n" +" st.shared.v4.f32 [%rd21+0], {%f7,%f8,%f9,%f10};\n" +"$Lt_1_24578:\n" +" mov.u64 %rd18, __cuda___cuda_local_var_32674_34_non_const_lj35792;\n" +"$Lt_1_24066:\n" +" mov.u64 %rd18, __cuda___cuda_local_var_32674_34_non_const_lj35792;\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32673_34_non_const_lj13360;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32675_33_non_const_cutsq5296;\n" +" .loc 16 302 0\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" mov.f32 %f17, 0f00000000; \n" +" mov.f32 %f18, %f17;\n" +" mov.f32 %f19, 0f00000000; \n" +" mov.f32 %f20, %f19;\n" +" mov.f32 %f21, 0f00000000; \n" +" mov.f32 %f22, %f21;\n" +" .loc 16 304 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r7, %r1, %r6;\n" +" 
cvt.s32.u32 %r8, %ntid.x;\n" +" div.s32 %r9, %r8, %r6;\n" +" rem.s32 %r10, %r1, %r6;\n" +" cvt.s32.u32 %r11, %ctaid.x;\n" +" mul.lo.s32 %r12, %r11, %r9;\n" +" add.s32 %r13, %r7, %r12;\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p4, %r13, %r14;\n" +" @!%p4 bra $Lt_1_25346;\n" +" .loc 16 308 0\n" +" cvt.s64.s32 %rd22, %r13;\n" +" mul.wide.s32 %rd23, %r13, 4;\n" +" ld.param.u64 %rd24, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd25, %rd23, %rd24;\n" +" ld.global.s32 %r15, [%rd25+0];\n" +" .loc 16 310 0\n" +" ld.param.s32 %r16, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd26, %r16;\n" +" mul.wide.s32 %rd27, %r16, 4;\n" +" add.u64 %rd28, %rd27, %rd25;\n" +" ld.global.s32 %r17, [%rd28+0];\n" +" add.u64 %rd29, %rd27, %rd28;\n" +" ld.param.u64 %rd30, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p5, %rd30, %rd24;\n" +" @%p5 bra $Lt_1_25858;\n" +" .loc 16 316 0\n" +" cvt.s32.s64 %r18, %rd26;\n" +" mul.lo.s32 %r19, %r18, %r17;\n" +" cvt.s64.s32 %rd31, %r19;\n" +" mul.wide.s32 %rd32, %r19, 4;\n" +" add.u64 %rd33, %rd29, %rd32;\n" +" .loc 16 317 0\n" +" mul.lo.s32 %r20, %r10, %r18;\n" +" cvt.s64.s32 %rd34, %r20;\n" +" mul.wide.s32 %rd35, %r20, 4;\n" +" add.u64 %rd36, %rd29, %rd35;\n" +" .loc 16 318 0\n" +" mul.lo.s32 %r21, %r18, %r6;\n" +" bra.uni $Lt_1_25602;\n" +"$Lt_1_25858:\n" +" .loc 16 320 0\n" +" ld.global.s32 %r22, [%rd29+0];\n" +" cvt.s64.s32 %rd37, %r22;\n" +" mul.wide.s32 %rd38, %r22, 4;\n" +" add.u64 %rd39, %rd30, %rd38;\n" +" .loc 16 321 0\n" +" cvt.s64.s32 %rd40, %r17;\n" +" mul.wide.s32 %rd41, %r17, 4;\n" +" add.u64 %rd33, %rd39, %rd41;\n" +" .loc 16 322 0\n" +" mov.s32 %r21, %r6;\n" +" .loc 16 323 0\n" +" cvt.s64.s32 %rd42, %r10;\n" +" mul.wide.s32 %rd43, %r10, 4;\n" +" add.u64 %rd36, %rd39, %rd43;\n" +"$Lt_1_25602:\n" +" .loc 16 326 0\n" +" mov.u32 %r23, %r15;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" tex.1d.v4.f32.s32 {%f23,%f24,%f25,%f26},[pos_tex,{%r23,%r25,%r27,%r29}];\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" mov.f32 %f29, %f25;\n" +" mov.f32 %f30, %f26;\n" +" .loc 16 327 0\n" +" mov.u32 %r30, %r15;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" mov.s32 %r33, 0;\n" +" mov.u32 %r34, %r33;\n" +" mov.s32 %r35, 0;\n" +" mov.u32 %r36, %r35;\n" +" tex.1d.v4.f32.s32 {%f31,%f32,%f33,%f34},[q_tex,{%r30,%r32,%r34,%r36}];\n" +" mov.f32 %f35, %f31;\n" +" setp.ge.u64 %p6, %rd36, %rd33;\n" +" @%p6 bra $Lt_1_35842;\n" +" cvt.rzi.ftz.s32.f32 %r37, %f30;\n" +" cvt.s64.s32 %rd44, %r21;\n" +" mul.lo.s32 %r38, %r37, 11;\n" +" cvt.rn.f32.s32 %f36, %r38;\n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +" mov.f32 %f41, 0f00000000; \n" +"$Lt_1_26626:\n" +" .loc 16 332 0\n" +" ld.global.s32 %r39, [%rd36+0];\n" +" .loc 16 335 0\n" +" shr.s32 %r40, %r39, 30;\n" +" and.b32 %r41, %r40, 3;\n" +" cvt.s64.s32 %rd45, %r41;\n" +" mul.wide.s32 %rd46, %r41, 4;\n" +" add.u64 %rd47, %rd1, %rd46;\n" +" ld.shared.f32 %f42, [%rd47+0];\n" +" .loc 16 339 0\n" +" and.b32 %r42, %r39, 1073741823;\n" +" mov.u32 %r43, %r42;\n" +" mov.s32 %r44, 0;\n" +" mov.u32 %r45, %r44;\n" +" mov.s32 %r46, 0;\n" +" mov.u32 %r47, %r46;\n" +" mov.s32 %r48, 0;\n" +" mov.u32 %r49, %r48;\n" +" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r43,%r45,%r47,%r49}];\n" +" mov.f32 %f47, %f43;\n" +" mov.f32 %f48, %f44;\n" +" mov.f32 %f49, %f45;\n" +" 
mov.f32 %f50, %f46;\n" +" sub.ftz.f32 %f51, %f28, %f48;\n" +" sub.ftz.f32 %f52, %f27, %f47;\n" +" sub.ftz.f32 %f53, %f29, %f49;\n" +" mul.ftz.f32 %f54, %f51, %f51;\n" +" fma.rn.ftz.f32 %f55, %f52, %f52, %f54;\n" +" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" +" add.ftz.f32 %f57, %f36, %f50;\n" +" cvt.rzi.ftz.s32.f32 %r50, %f57;\n" +" cvt.s64.s32 %rd48, %r50;\n" +" mul.wide.s32 %rd49, %r50, 4;\n" +" add.u64 %rd50, %rd8, %rd49;\n" +" ld.shared.f32 %f58, [%rd50+0];\n" +" setp.gt.ftz.f32 %p7, %f58, %f56;\n" +" @!%p7 bra $Lt_1_29442;\n" +" rcp.approx.ftz.f32 %f59, %f56;\n" +" mul.lo.u64 %rd51, %rd48, 16;\n" +" add.u64 %rd52, %rd51, %rd7;\n" +" ld.shared.f32 %f60, [%rd52+8];\n" +" setp.lt.ftz.f32 %p8, %f56, %f60;\n" +" @!%p8 bra $Lt_1_27650;\n" +" .loc 16 353 0\n" +" mul.ftz.f32 %f61, %f59, %f59;\n" +" mul.ftz.f32 %f62, %f59, %f61;\n" +" mov.f32 %f63, %f62;\n" +" .loc 16 354 0\n" +" mul.ftz.f32 %f64, %f62, %f42;\n" +" ld.shared.v2.f32 {%f65,%f66}, [%rd52+0];\n" +" mul.ftz.f32 %f67, %f65, %f62;\n" +" sub.ftz.f32 %f68, %f67, %f66;\n" +" mul.ftz.f32 %f69, %f64, %f68;\n" +" bra.uni $Lt_1_27394;\n" +"$Lt_1_27650:\n" +" .loc 16 356 0\n" +" mov.f32 %f69, 0f00000000; \n" +"$Lt_1_27394:\n" +" ld.shared.f32 %f70, [%rd52+12];\n" +" setp.gt.ftz.f32 %p9, %f70, %f56;\n" +" @!%p9 bra $Lt_1_28162;\n" +" .loc 16 359 0\n" +" mov.u32 %r51, %r42;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" mov.s32 %r56, 0;\n" +" mov.u32 %r57, %r56;\n" +" tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r51,%r53,%r55,%r57}];\n" +" mov.f32 %f75, %f71;\n" +" ld.shared.f32 %f76, [%rd47+16];\n" +" ld.param.f32 %f77, [__cudaparm_kernel_pair_fast_qqrd2e];\n" +" mul.ftz.f32 %f78, %f77, %f35;\n" +" mul.ftz.f32 %f79, %f75, %f78;\n" +" sqrt.approx.ftz.f32 %f80, %f59;\n" +" mul.ftz.f32 %f81, %f79, %f80;\n" +" mul.ftz.f32 %f82, %f76, %f81;\n" +" bra.uni $Lt_1_27906;\n" +"$Lt_1_28162:\n" +" .loc 16 361 0\n" +" mov.f32 %f82, 0f00000000; \n" +"$Lt_1_27906:\n" +" .loc 16 365 0\n" +" add.ftz.f32 %f83, %f82, %f69;\n" +" mul.ftz.f32 %f84, %f83, %f59;\n" +" fma.rn.ftz.f32 %f39, %f52, %f84, %f39;\n" +" .loc 16 366 0\n" +" fma.rn.ftz.f32 %f38, %f51, %f84, %f38;\n" +" .loc 16 367 0\n" +" fma.rn.ftz.f32 %f37, %f53, %f84, %f37;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p10, %r58, %r59;\n" +" @%p10 bra $Lt_1_28930;\n" +" .loc 16 370 0\n" +" add.ftz.f32 %f40, %f82, %f40;\n" +" @!%p8 bra $Lt_1_28930;\n" +" .loc 16 372 0\n" +" add.u64 %rd53, %rd51, %rd18;\n" +" mov.f32 %f85, %f63;\n" +" ld.shared.v4.f32 {%f86,%f87,%f88,_}, [%rd53+0];\n" +" mul.ftz.f32 %f89, %f86, %f85;\n" +" sub.ftz.f32 %f90, %f89, %f87;\n" +" mul.ftz.f32 %f91, %f85, %f90;\n" +" .loc 16 373 0\n" +" sub.ftz.f32 %f92, %f91, %f88;\n" +" fma.rn.ftz.f32 %f41, %f42, %f92, %f41;\n" +"$Lt_1_28930:\n" +"$Lt_1_28418:\n" +" ld.param.s32 %r60, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r61, 0;\n" +" setp.le.s32 %p11, %r60, %r61;\n" +" @%p11 bra $Lt_1_29442;\n" +" .loc 16 377 0\n" +" mov.f32 %f93, %f12;\n" +" mul.ftz.f32 %f94, %f52, %f52;\n" +" fma.rn.ftz.f32 %f95, %f84, %f94, %f93;\n" +" mov.f32 %f12, %f95;\n" +" .loc 16 378 0\n" +" mov.f32 %f96, %f14;\n" +" fma.rn.ftz.f32 %f97, %f84, %f54, %f96;\n" +" mov.f32 %f14, %f97;\n" +" .loc 16 379 0\n" +" mov.f32 %f98, %f16;\n" +" mul.ftz.f32 %f99, %f53, %f53;\n" +" fma.rn.ftz.f32 %f100, %f84, %f99, %f98;\n" +" mov.f32 %f16, %f100;\n" +" .loc 16 380 0\n" +" mov.f32 %f101, %f18;\n" +" mul.ftz.f32 %f102, %f51, %f52;\n" +" fma.rn.ftz.f32 
%f103, %f84, %f102, %f101;\n" +" mov.f32 %f18, %f103;\n" +" .loc 16 381 0\n" +" mov.f32 %f104, %f20;\n" +" mul.ftz.f32 %f105, %f52, %f53;\n" +" fma.rn.ftz.f32 %f106, %f84, %f105, %f104;\n" +" mov.f32 %f20, %f106;\n" +" .loc 16 382 0\n" +" mul.ftz.f32 %f107, %f51, %f53;\n" +" fma.rn.ftz.f32 %f21, %f84, %f107, %f21;\n" +" mov.f32 %f22, %f21;\n" +"$Lt_1_29442:\n" +"$Lt_1_26882:\n" +" .loc 16 331 0\n" +" mul.lo.u64 %rd54, %rd44, 4;\n" +" add.u64 %rd36, %rd36, %rd54;\n" +" setp.lt.u64 %p12, %rd36, %rd33;\n" +" @%p12 bra $Lt_1_26626;\n" +" bra.uni $Lt_1_25090;\n" +"$Lt_1_35842:\n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +" mov.f32 %f41, 0f00000000; \n" +" bra.uni $Lt_1_25090;\n" +"$Lt_1_25346:\n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +" mov.f32 %f41, 0f00000000; \n" +"$Lt_1_25090:\n" +" mov.u32 %r62, 1;\n" +" setp.le.s32 %p13, %r6, %r62;\n" +" @%p13 bra $Lt_1_32258;\n" +" .loc 16 393 0\n" +" mov.u64 %rd55, __cuda___cuda_local_var_32783_35_non_const_red_acc7728;\n" +" cvt.s64.s32 %rd56, %r1;\n" +" mul.wide.s32 %rd57, %r1, 4;\n" +" add.u64 %rd58, %rd55, %rd57;\n" +" mov.f32 %f108, %f39;\n" +" st.shared.f32 [%rd58+0], %f108;\n" +" .loc 16 394 0\n" +" mov.f32 %f109, %f38;\n" +" st.shared.f32 [%rd58+512], %f109;\n" +" .loc 16 395 0\n" +" mov.f32 %f110, %f37;\n" +" st.shared.f32 [%rd58+1024], %f110;\n" +" .loc 16 396 0\n" +" mov.f32 %f111, %f41;\n" +" st.shared.f32 [%rd58+1536], %f111;\n" +" .loc 16 397 0\n" +" mov.f32 %f112, %f40;\n" +" st.shared.f32 [%rd58+2048], %f112;\n" +" .loc 16 399 0\n" +" shr.s32 %r63, %r6, 31;\n" +" mov.s32 %r64, 1;\n" +" and.b32 %r65, %r63, %r64;\n" +" add.s32 %r66, %r65, %r6;\n" +" shr.s32 %r67, %r66, 1;\n" +" mov.s32 %r68, %r67;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p14, %r67, %r69;\n" +" @!%p14 bra $Lt_1_30722;\n" +"$Lt_1_31234:\n" +" setp.ge.u32 %p15, %r10, %r68;\n" +" @%p15 bra $Lt_1_31490;\n" +" .loc 16 402 0\n" +" add.u32 %r70, %r1, %r68;\n" +" cvt.u64.u32 %rd59, %r70;\n" +" mul.wide.u32 %rd60, %r70, 4;\n" +" add.u64 %rd61, %rd55, %rd60;\n" +" ld.shared.f32 %f113, [%rd61+0];\n" +" add.ftz.f32 %f108, %f113, %f108;\n" +" st.shared.f32 [%rd58+0], %f108;\n" +" ld.shared.f32 %f114, [%rd61+512];\n" +" add.ftz.f32 %f109, %f114, %f109;\n" +" st.shared.f32 [%rd58+512], %f109;\n" +" ld.shared.f32 %f115, [%rd61+1024];\n" +" add.ftz.f32 %f110, %f115, %f110;\n" +" st.shared.f32 [%rd58+1024], %f110;\n" +" ld.shared.f32 %f116, [%rd61+1536];\n" +" add.ftz.f32 %f111, %f116, %f111;\n" +" st.shared.f32 [%rd58+1536], %f111;\n" +" ld.shared.f32 %f117, [%rd61+2048];\n" +" add.ftz.f32 %f112, %f117, %f112;\n" +" st.shared.f32 [%rd58+2048], %f112;\n" +"$Lt_1_31490:\n" +" .loc 16 399 0\n" +" shr.u32 %r68, %r68, 1;\n" +" mov.u32 %r71, 0;\n" +" setp.ne.u32 %p16, %r68, %r71;\n" +" @%p16 bra $Lt_1_31234;\n" +"$Lt_1_30722:\n" +" .loc 16 406 0\n" +" mov.f32 %f39, %f108;\n" +" .loc 16 407 0\n" +" mov.f32 %f38, %f109;\n" +" .loc 16 408 0\n" +" mov.f32 %f37, %f110;\n" +" .loc 16 409 0\n" +" mov.f32 %f41, %f111;\n" +" .loc 16 410 0\n" +" mov.f32 %f40, %f112;\n" +" ld.param.s32 %r72, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r73, 0;\n" +" setp.le.s32 %p17, %r72, %r73;\n" +" @%p17 bra $Lt_1_32258;\n" +" .loc 16 414 0\n" +" mov.f32 %f108, %f12;\n" +" st.shared.f32 [%rd58+0], %f108;\n" +" mov.f32 %f109, %f14;\n" +" st.shared.f32 [%rd58+512], %f109;\n" +" mov.f32 %f110, %f16;\n" +" st.shared.f32 [%rd58+1024], 
%f110;\n" +" mov.f32 %f111, %f18;\n" +" st.shared.f32 [%rd58+1536], %f111;\n" +" mov.f32 %f112, %f20;\n" +" st.shared.f32 [%rd58+2048], %f112;\n" +" mov.f32 %f118, %f22;\n" +" st.shared.f32 [%rd58+2560], %f118;\n" +" .loc 16 416 0\n" +" mov.s32 %r74, %r67;\n" +" @!%p14 bra $Lt_1_32770;\n" +"$Lt_1_33282:\n" +" setp.ge.u32 %p18, %r10, %r74;\n" +" @%p18 bra $Lt_1_33538;\n" +" .loc 16 419 0\n" +" add.u32 %r75, %r1, %r74;\n" +" cvt.u64.u32 %rd62, %r75;\n" +" mul.wide.u32 %rd63, %r75, 4;\n" +" add.u64 %rd64, %rd55, %rd63;\n" +" ld.shared.f32 %f119, [%rd64+0];\n" +" add.ftz.f32 %f108, %f119, %f108;\n" +" st.shared.f32 [%rd58+0], %f108;\n" +" ld.shared.f32 %f120, [%rd64+512];\n" +" add.ftz.f32 %f109, %f120, %f109;\n" +" st.shared.f32 [%rd58+512], %f109;\n" +" ld.shared.f32 %f121, [%rd64+1024];\n" +" add.ftz.f32 %f110, %f121, %f110;\n" +" st.shared.f32 [%rd58+1024], %f110;\n" +" ld.shared.f32 %f122, [%rd64+1536];\n" +" add.ftz.f32 %f111, %f122, %f111;\n" +" st.shared.f32 [%rd58+1536], %f111;\n" +" ld.shared.f32 %f123, [%rd64+2048];\n" +" add.ftz.f32 %f112, %f123, %f112;\n" +" st.shared.f32 [%rd58+2048], %f112;\n" +" ld.shared.f32 %f124, [%rd64+2560];\n" +" add.ftz.f32 %f118, %f124, %f118;\n" +" st.shared.f32 [%rd58+2560], %f118;\n" +"$Lt_1_33538:\n" +" .loc 16 416 0\n" +" shr.u32 %r74, %r74, 1;\n" +" mov.u32 %r76, 0;\n" +" setp.ne.u32 %p19, %r74, %r76;\n" +" @%p19 bra $Lt_1_33282;\n" +"$Lt_1_32770:\n" +" .loc 16 424 0\n" +" mov.f32 %f12, %f108;\n" +" mov.f32 %f14, %f109;\n" +" mov.f32 %f16, %f110;\n" +" mov.f32 %f18, %f111;\n" +" mov.f32 %f20, %f112;\n" +" mov.f32 %f22, %f118;\n" +"$Lt_1_32258:\n" +"$Lt_1_30210:\n" +" selp.s32 %r77, 1, 0, %p4;\n" +" mov.s32 %r78, 0;\n" +" set.eq.u32.s32 %r79, %r10, %r78;\n" +" neg.s32 %r80, %r79;\n" +" and.b32 %r81, %r77, %r80;\n" +" mov.u32 %r82, 0;\n" +" setp.eq.s32 %p20, %r81, %r82;\n" +" @%p20 bra $Lt_1_34306;\n" +" .loc 16 430 0\n" +" cvt.s64.s32 %rd65, %r13;\n" +" ld.param.u64 %rd66, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd67, %r13, 4;\n" +" add.u64 %rd68, %rd66, %rd67;\n" +" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r84, 0;\n" +" setp.le.s32 %p21, %r83, %r84;\n" +" @%p21 bra $Lt_1_34818;\n" +" .loc 16 432 0\n" +" st.global.f32 [%rd68+0], %f41;\n" +" .loc 16 433 0\n" +" cvt.s64.s32 %rd69, %r14;\n" +" mul.wide.s32 %rd70, %r14, 4;\n" +" add.u64 %rd71, %rd70, %rd68;\n" +" .loc 16 434 0\n" +" st.global.f32 [%rd71+0], %f40;\n" +" .loc 16 435 0\n" +" add.u64 %rd68, %rd70, %rd71;\n" +"$Lt_1_34818:\n" +" ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r86, 0;\n" +" setp.le.s32 %p22, %r85, %r86;\n" +" @%p22 bra $Lt_1_35330;\n" +" .loc 16 439 0\n" +" mov.f32 %f125, %f12;\n" +" st.global.f32 [%rd68+0], %f125;\n" +" .loc 16 440 0\n" +" cvt.s64.s32 %rd72, %r14;\n" +" mul.wide.s32 %rd73, %r14, 4;\n" +" add.u64 %rd74, %rd73, %rd68;\n" +" .loc 16 439 0\n" +" mov.f32 %f126, %f14;\n" +" st.global.f32 [%rd74+0], %f126;\n" +" .loc 16 440 0\n" +" add.u64 %rd75, %rd73, %rd74;\n" +" .loc 16 439 0\n" +" mov.f32 %f127, %f16;\n" +" st.global.f32 [%rd75+0], %f127;\n" +" .loc 16 440 0\n" +" add.u64 %rd76, %rd73, %rd75;\n" +" .loc 16 439 0\n" +" mov.f32 %f128, %f18;\n" +" st.global.f32 [%rd76+0], %f128;\n" +" .loc 16 440 0\n" +" add.u64 %rd68, %rd73, %rd76;\n" +" .loc 16 439 0\n" +" mov.f32 %f129, %f20;\n" +" st.global.f32 [%rd68+0], %f129;\n" +" mov.f32 %f130, %f22;\n" +" add.u64 %rd77, %rd73, %rd68;\n" +" st.global.f32 [%rd77+0], %f130;\n" +"$Lt_1_35330:\n" +" .loc 16 443 0\n" +" ld.param.u64 %rd78, 
[__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd79, %rd65, 16;\n" +" add.u64 %rd80, %rd78, %rd79;\n" +" mov.f32 %f131, %f132;\n" +" st.global.v4.f32 [%rd80+0], {%f39,%f38,%f37,%f131};\n" +"$Lt_1_34306:\n" +" .loc 16 445 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/ljcl_cut_gpu_kernel.ptx b/lib/gpu/ljcl_cut_gpu_kernel.ptx new file mode 100644 index 000000000..46ee340ad --- /dev/null +++ b/lib/gpu/ljcl_cut_gpu_kernel.ptx @@ -0,0 +1,1211 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000beb9_00000000-9_ljcl_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.aJtiP6) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000beb9_00000000-8_ljcl_cut_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "ljcl_cut_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_lj1, + .param .u64 __cudaparm_kernel_pair_lj3, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_q_, + .param .f32 __cudaparm_kernel_pair_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_qqrd2e, + .param .f32 
__cudaparm_kernel_pair_g_ewald, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<86>; + .reg .u64 %rd<63>; + .reg .f32 %f<164>; + .reg .pred %p<21>; + .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32]; + .shared .align 4 .b8 __cuda___cuda_local_var_32610_35_non_const_red_acc144[3072]; + // __cuda_local_var_32510_10_non_const_f = 64 + // __cuda_local_var_32514_9_non_const_virial = 16 + // __cuda_local_var_32562_43_non_const_r6inv = 40 + // __cuda_local_var_32562_50_non_const_prefactor = 48 + // __cuda_local_var_32562_61_non_const__erfc = 44 + .loc 16 108 0 +$LDWbegin_kernel_pair: + .loc 16 115 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 116 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 117 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 118 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; + .loc 16 119 0 + ld.global.f32 %f5, [%rd1+16]; + .loc 16 120 0 + ld.global.f32 %f6, [%rd1+20]; + .loc 16 121 0 + ld.global.f32 %f7, [%rd1+24]; + .loc 16 122 0 + ld.global.f32 %f8, [%rd1+28]; + st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; + .loc 16 132 0 + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + mov.f32 %f17, 0f00000000; // 0 + mov.f32 %f18, %f17; + mov.f32 %f19, 0f00000000; // 0 + mov.f32 %f20, %f19; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_22274; + .loc 16 136 0 + cvt.s64.s32 %rd2, %r9; + mul.wide.s32 %rd3, %r9, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd5, %rd3, %rd4; + ld.global.s32 %r11, [%rd5+0]; + .loc 16 138 0 + ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd6, %r12; + mul.wide.s32 %rd7, %r12, 4; + add.u64 %rd8, %rd7, %rd5; + ld.global.s32 %r13, [%rd8+0]; + add.u64 %rd9, %rd7, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd4; + @%p2 bra $Lt_0_22786; + .loc 16 144 0 + cvt.s32.s64 %r14, %rd6; + mul.lo.s32 %r15, %r14, %r13; + cvt.s64.s32 %rd11, %r15; + mul.wide.s32 %rd12, %r15, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 145 0 + mul.lo.s32 %r16, %r6, %r14; + cvt.s64.s32 %rd14, %r16; + mul.wide.s32 %rd15, %r16, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 146 0 + mul.lo.s32 %r17, %r14, %r1; + bra.uni $Lt_0_22530; +$Lt_0_22786: + .loc 16 148 0 + ld.global.s32 %r18, [%rd9+0]; + cvt.s64.s32 %rd17, %r18; + mul.wide.s32 %rd18, %r18, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 149 0 + cvt.s64.s32 %rd20, %r13; + mul.wide.s32 %rd21, %r13, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 150 0 + mov.s32 %r17, %r1; + .loc 16 151 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_22530: + .loc 16 154 0 + mov.u32 %r19, %r11; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f25, %f21; + mov.f32 %f26, %f22; 
+ mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + .loc 16 155 0 + mov.u32 %r26, %r11; + mov.s32 %r27, 0; + mov.u32 %r28, %r27; + mov.s32 %r29, 0; + mov.u32 %r30, %r29; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}]; + mov.f32 %f33, %f29; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_32770; + cvt.rzi.ftz.s32.f32 %r33, %f28; + cvt.s64.s32 %rd24, %r17; + ld.param.s32 %r34, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r35, %r34, %r33; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1]; + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112; +$Lt_0_23554: + //<loop> Loop body line 155, nesting depth: 1, estimated iterations: unknown + .loc 16 159 0 + ld.global.s32 %r36, [%rd16+0]; + .loc 16 162 0 + shr.s32 %r37, %r36, 30; + and.b32 %r38, %r37, 3; + cvt.s64.s32 %rd27, %r38; + mul.wide.s32 %rd28, %r38, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f39, [%rd29+0]; + .loc 16 163 0 + mov.f32 %f40, 0f3f800000; // 1 + ld.shared.f32 %f41, [%rd29+16]; + sub.ftz.f32 %f42, %f40, %f41; + .loc 16 166 0 + and.b32 %r39, %r36, 1073741823; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + mov.s32 %r43, 0; + mov.u32 %r44, %r43; + mov.s32 %r45, 0; + mov.u32 %r46, %r45; + tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r40,%r42,%r44,%r46}]; + mov.f32 %f47, %f43; + mov.f32 %f48, %f44; + mov.f32 %f49, %f45; + mov.f32 %f50, %f46; + cvt.rzi.ftz.s32.f32 %r47, %f50; + sub.ftz.f32 %f51, %f26, %f48; + sub.ftz.f32 %f52, %f25, %f47; + sub.ftz.f32 %f53, %f27, %f49; + mul.ftz.f32 %f54, %f51, %f51; + fma.rn.ftz.f32 %f55, %f52, %f52, %f54; + fma.rn.ftz.f32 %f56, %f53, %f53, %f55; + add.s32 %r48, %r47, %r35; + cvt.s64.s32 %rd30, %r48; + mul.wide.s32 %rd31, %r48, 16; + add.u64 %rd32, %rd31, %rd25; + ld.global.f32 %f57, [%rd32+8]; + setp.gt.ftz.f32 %p4, %f57, %f56; + @!%p4 bra $Lt_0_26370; + rcp.approx.ftz.f32 %f58, %f56; + ld.global.f32 %f59, [%rd32+12]; + setp.lt.ftz.f32 %p5, %f56, %f59; + @!%p5 bra $Lt_0_24578; + .loc 16 181 0 + mul.ftz.f32 %f60, %f58, %f58; + mul.ftz.f32 %f61, %f58, %f60; + mov.f32 %f62, %f61; + .loc 16 182 0 + mul.ftz.f32 %f63, %f61, %f39; + ld.global.v2.f32 {%f64,%f65}, [%rd32+0]; + mul.ftz.f32 %f66, %f64, %f61; + sub.ftz.f32 %f67, %f66, %f65; + mul.ftz.f32 %f68, %f63, %f67; + bra.uni $Lt_0_24322; +$Lt_0_24578: + .loc 16 184 0 + mov.f32 %f68, 0f00000000; // 0 +$Lt_0_24322: + ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_coulsq]; + setp.gt.ftz.f32 %p6, %f69, %f56; + @!%p6 bra $Lt_0_25090; + .loc 16 191 0 + sqrt.approx.ftz.f32 %f70, %f56; + ld.param.f32 %f71, [__cudaparm_kernel_pair_g_ewald]; + mul.ftz.f32 %f72, %f71, %f70; + mul.ftz.f32 %f73, %f72, %f72; + mov.f32 %f74, 0f3f800000; // 1 + mov.f32 %f75, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f76, %f75, %f72, %f74; + neg.ftz.f32 %f77, %f73; + rcp.approx.ftz.f32 %f78, %f76; + mov.f32 %f79, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f80, %f77, %f79; + ex2.approx.ftz.f32 %f81, %f80; + mov.f32 %f82, 0f3e827906; // 0.25483 + mov.f32 %f83, 0fbe91a98e; // -0.284497 + mov.f32 %f84, 0f3fb5f0e3; // 1.42141 + mov.f32 %f85, 0fbfba00e3; // -1.45315 + mov.f32 %f86, 0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f87, %f86, %f78, %f85; + fma.rn.ftz.f32 %f88, %f78, %f87, %f84; + fma.rn.ftz.f32 %f89, %f78, %f88, %f83; + fma.rn.ftz.f32 %f90, %f78, %f89, %f82; + mul.ftz.f32 %f91, %f78, %f90; + mul.ftz.f32 %f92, %f81, 
%f91; + mov.f32 %f93, %f92; + .loc 16 192 0 + mov.u32 %r49, %r39; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + tex.1d.v4.f32.s32 {%f94,%f95,%f96,%f97},[q_tex,{%r49,%r51,%r53,%r55}]; + mov.f32 %f98, %f94; + ld.param.f32 %f99, [__cudaparm_kernel_pair_qqrd2e]; + mul.ftz.f32 %f100, %f99, %f33; + mul.ftz.f32 %f101, %f100, %f98; + div.approx.ftz.f32 %f102, %f101, %f70; + mov.f32 %f103, %f102; + .loc 16 193 0 + mov.f32 %f104, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f105, %f72, %f104; + fma.rn.ftz.f32 %f106, %f81, %f105, %f92; + sub.ftz.f32 %f107, %f106, %f42; + mul.ftz.f32 %f108, %f102, %f107; + bra.uni $Lt_0_24834; +$Lt_0_25090: + .loc 16 195 0 + mov.f32 %f108, 0f00000000; // 0 +$Lt_0_24834: + .loc 16 199 0 + add.ftz.f32 %f109, %f108, %f68; + mul.ftz.f32 %f110, %f109, %f58; + fma.rn.ftz.f32 %f36, %f52, %f110, %f36; + .loc 16 200 0 + fma.rn.ftz.f32 %f35, %f51, %f110, %f35; + .loc 16 201 0 + fma.rn.ftz.f32 %f34, %f53, %f110, %f34; + ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r57, 0; + setp.le.s32 %p7, %r56, %r57; + @%p7 bra $Lt_0_25858; + .loc 16 204 0 + mov.f32 %f111, %f103; + mov.f32 %f112, %f93; + sub.ftz.f32 %f113, %f112, %f42; + fma.rn.ftz.f32 %f114, %f111, %f113, %f37; + selp.f32 %f37, %f114, %f37, %p6; + @!%p5 bra $Lt_0_25858; + .loc 16 208 0 + ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3]; + add.u64 %rd34, %rd33, %rd31; + mov.f32 %f115, %f62; + ld.global.v4.f32 {%f116,%f117,%f118,_}, [%rd34+0]; + mul.ftz.f32 %f119, %f116, %f115; + sub.ftz.f32 %f120, %f119, %f117; + mul.ftz.f32 %f121, %f115, %f120; + sub.ftz.f32 %f122, %f121, %f118; + fma.rn.ftz.f32 %f38, %f39, %f122, %f38; +$Lt_0_25858: +$Lt_0_25346: + ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p8, %r58, %r59; + @%p8 bra $Lt_0_26370; + .loc 16 212 0 + mov.f32 %f123, %f10; + mul.ftz.f32 %f124, %f52, %f52; + fma.rn.ftz.f32 %f125, %f110, %f124, %f123; + mov.f32 %f10, %f125; + .loc 16 213 0 + mov.f32 %f126, %f12; + fma.rn.ftz.f32 %f127, %f110, %f54, %f126; + mov.f32 %f12, %f127; + .loc 16 214 0 + mov.f32 %f128, %f14; + mul.ftz.f32 %f129, %f53, %f53; + fma.rn.ftz.f32 %f130, %f110, %f129, %f128; + mov.f32 %f14, %f130; + .loc 16 215 0 + mov.f32 %f131, %f16; + mul.ftz.f32 %f132, %f51, %f52; + fma.rn.ftz.f32 %f133, %f110, %f132, %f131; + mov.f32 %f16, %f133; + .loc 16 216 0 + mov.f32 %f134, %f18; + mul.ftz.f32 %f135, %f52, %f53; + fma.rn.ftz.f32 %f136, %f110, %f135, %f134; + mov.f32 %f18, %f136; + .loc 16 217 0 + mul.ftz.f32 %f137, %f51, %f53; + fma.rn.ftz.f32 %f19, %f110, %f137, %f19; + mov.f32 %f20, %f19; +$Lt_0_26370: +$Lt_0_23810: + .loc 16 158 0 + mul.lo.u64 %rd35, %rd24, 4; + add.u64 %rd16, %rd16, %rd35; + setp.lt.u64 %p9, %rd16, %rd13; + @%p9 bra $Lt_0_23554; + bra.uni $Lt_0_22018; +$Lt_0_32770: + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + bra.uni $Lt_0_22018; +$Lt_0_22274: + mov.f32 %f34, 0f00000000; // 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 +$Lt_0_22018: + mov.u32 %r60, 1; + setp.le.s32 %p10, %r1, %r60; + @%p10 bra $Lt_0_29186; + .loc 16 228 0 + mov.u64 %rd36, __cuda___cuda_local_var_32610_35_non_const_red_acc144; + cvt.s64.s32 %rd37, %r2; + mul.wide.s32 %rd38, %r2, 4; + add.u64 %rd39, %rd36, %rd38; + mov.f32 %f138, %f36; + st.shared.f32 [%rd39+0], %f138; + .loc 16 229 0 + mov.f32 %f139, 
%f35; + st.shared.f32 [%rd39+512], %f139; + .loc 16 230 0 + mov.f32 %f140, %f34; + st.shared.f32 [%rd39+1024], %f140; + .loc 16 231 0 + mov.f32 %f141, %f38; + st.shared.f32 [%rd39+1536], %f141; + .loc 16 232 0 + mov.f32 %f142, %f37; + st.shared.f32 [%rd39+2048], %f142; + .loc 16 234 0 + shr.s32 %r61, %r1, 31; + mov.s32 %r62, 1; + and.b32 %r63, %r61, %r62; + add.s32 %r64, %r63, %r1; + shr.s32 %r65, %r64, 1; + mov.s32 %r66, %r65; + mov.u32 %r67, 0; + setp.ne.u32 %p11, %r65, %r67; + @!%p11 bra $Lt_0_27650; +$Lt_0_28162: + setp.ge.u32 %p12, %r6, %r66; + @%p12 bra $Lt_0_28418; + .loc 16 237 0 + add.u32 %r68, %r2, %r66; + cvt.u64.u32 %rd40, %r68; + mul.wide.u32 %rd41, %r68, 4; + add.u64 %rd42, %rd36, %rd41; + ld.shared.f32 %f143, [%rd42+0]; + add.ftz.f32 %f138, %f143, %f138; + st.shared.f32 [%rd39+0], %f138; + ld.shared.f32 %f144, [%rd42+512]; + add.ftz.f32 %f139, %f144, %f139; + st.shared.f32 [%rd39+512], %f139; + ld.shared.f32 %f145, [%rd42+1024]; + add.ftz.f32 %f140, %f145, %f140; + st.shared.f32 [%rd39+1024], %f140; + ld.shared.f32 %f146, [%rd42+1536]; + add.ftz.f32 %f141, %f146, %f141; + st.shared.f32 [%rd39+1536], %f141; + ld.shared.f32 %f147, [%rd42+2048]; + add.ftz.f32 %f142, %f147, %f142; + st.shared.f32 [%rd39+2048], %f142; +$Lt_0_28418: + .loc 16 234 0 + shr.u32 %r66, %r66, 1; + mov.u32 %r69, 0; + setp.ne.u32 %p13, %r66, %r69; + @%p13 bra $Lt_0_28162; +$Lt_0_27650: + .loc 16 241 0 + mov.f32 %f36, %f138; + .loc 16 242 0 + mov.f32 %f35, %f139; + .loc 16 243 0 + mov.f32 %f34, %f140; + .loc 16 244 0 + mov.f32 %f38, %f141; + .loc 16 245 0 + mov.f32 %f37, %f142; + ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r71, 0; + setp.le.s32 %p14, %r70, %r71; + @%p14 bra $Lt_0_29186; + .loc 16 249 0 + mov.f32 %f138, %f10; + st.shared.f32 [%rd39+0], %f138; + mov.f32 %f139, %f12; + st.shared.f32 [%rd39+512], %f139; + mov.f32 %f140, %f14; + st.shared.f32 [%rd39+1024], %f140; + mov.f32 %f141, %f16; + st.shared.f32 [%rd39+1536], %f141; + mov.f32 %f142, %f18; + st.shared.f32 [%rd39+2048], %f142; + mov.f32 %f148, %f20; + st.shared.f32 [%rd39+2560], %f148; + .loc 16 251 0 + mov.s32 %r72, %r65; + @!%p11 bra $Lt_0_29698; +$Lt_0_30210: + setp.ge.u32 %p15, %r6, %r72; + @%p15 bra $Lt_0_30466; + .loc 16 254 0 + add.u32 %r73, %r2, %r72; + cvt.u64.u32 %rd43, %r73; + mul.wide.u32 %rd44, %r73, 4; + add.u64 %rd45, %rd36, %rd44; + ld.shared.f32 %f149, [%rd45+0]; + add.ftz.f32 %f138, %f149, %f138; + st.shared.f32 [%rd39+0], %f138; + ld.shared.f32 %f150, [%rd45+512]; + add.ftz.f32 %f139, %f150, %f139; + st.shared.f32 [%rd39+512], %f139; + ld.shared.f32 %f151, [%rd45+1024]; + add.ftz.f32 %f140, %f151, %f140; + st.shared.f32 [%rd39+1024], %f140; + ld.shared.f32 %f152, [%rd45+1536]; + add.ftz.f32 %f141, %f152, %f141; + st.shared.f32 [%rd39+1536], %f141; + ld.shared.f32 %f153, [%rd45+2048]; + add.ftz.f32 %f142, %f153, %f142; + st.shared.f32 [%rd39+2048], %f142; + ld.shared.f32 %f154, [%rd45+2560]; + add.ftz.f32 %f148, %f154, %f148; + st.shared.f32 [%rd39+2560], %f148; +$Lt_0_30466: + .loc 16 251 0 + shr.u32 %r72, %r72, 1; + mov.u32 %r74, 0; + setp.ne.u32 %p16, %r72, %r74; + @%p16 bra $Lt_0_30210; +$Lt_0_29698: + .loc 16 259 0 + mov.f32 %f10, %f138; + mov.f32 %f12, %f139; + mov.f32 %f14, %f140; + mov.f32 %f16, %f141; + mov.f32 %f18, %f142; + mov.f32 %f20, %f148; +$Lt_0_29186: +$Lt_0_27138: + selp.s32 %r75, 1, 0, %p1; + mov.s32 %r76, 0; + set.eq.u32.s32 %r77, %r6, %r76; + neg.s32 %r78, %r77; + and.b32 %r79, %r75, %r78; + mov.u32 %r80, 0; + setp.eq.s32 %p17, %r79, %r80; + @%p17 bra $Lt_0_31234; + .loc 16 265 0 + 
cvt.s64.s32 %rd46, %r9; + ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd48, %r9, 4; + add.u64 %rd49, %rd47, %rd48; + ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r82, 0; + setp.le.s32 %p18, %r81, %r82; + @%p18 bra $Lt_0_31746; + .loc 16 267 0 + st.global.f32 [%rd49+0], %f38; + .loc 16 268 0 + cvt.s64.s32 %rd50, %r10; + mul.wide.s32 %rd51, %r10, 4; + add.u64 %rd52, %rd51, %rd49; + .loc 16 269 0 + st.global.f32 [%rd52+0], %f37; + .loc 16 270 0 + add.u64 %rd49, %rd51, %rd52; +$Lt_0_31746: + ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r84, 0; + setp.le.s32 %p19, %r83, %r84; + @%p19 bra $Lt_0_32258; + .loc 16 274 0 + mov.f32 %f155, %f10; + st.global.f32 [%rd49+0], %f155; + .loc 16 275 0 + cvt.s64.s32 %rd53, %r10; + mul.wide.s32 %rd54, %r10, 4; + add.u64 %rd55, %rd54, %rd49; + .loc 16 274 0 + mov.f32 %f156, %f12; + st.global.f32 [%rd55+0], %f156; + .loc 16 275 0 + add.u64 %rd56, %rd54, %rd55; + .loc 16 274 0 + mov.f32 %f157, %f14; + st.global.f32 [%rd56+0], %f157; + .loc 16 275 0 + add.u64 %rd57, %rd54, %rd56; + .loc 16 274 0 + mov.f32 %f158, %f16; + st.global.f32 [%rd57+0], %f158; + .loc 16 275 0 + add.u64 %rd49, %rd54, %rd57; + .loc 16 274 0 + mov.f32 %f159, %f18; + st.global.f32 [%rd49+0], %f159; + mov.f32 %f160, %f20; + add.u64 %rd58, %rd54, %rd49; + st.global.f32 [%rd58+0], %f160; +$Lt_0_32258: + .loc 16 278 0 + ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd60, %rd46, 16; + add.u64 %rd61, %rd59, %rd60; + mov.f32 %f161, %f162; + st.global.v4.f32 [%rd61+0], {%f36,%f35,%f34,%f161}; +$Lt_0_31234: + .loc 16 280 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_lj1_in, + .param .u64 __cudaparm_kernel_pair_fast_lj3_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .u64 __cudaparm_kernel_pair_fast_q_, + .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, + .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, + .param .f32 __cudaparm_kernel_pair_fast_g_ewald, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<88>; + .reg .u64 %rd<75>; + .reg .f32 %f<167>; + .reg .pred %p<24>; + .shared .align 4 .b8 __cuda___cuda_local_var_32682_33_non_const_sp_lj3320[32]; + .shared .align 16 .b8 __cuda___cuda_local_var_32680_34_non_const_lj13360[1936]; + .shared .align 16 .b8 __cuda___cuda_local_var_32681_34_non_const_lj35296[1936]; + .shared .align 4 .b8 __cuda___cuda_local_var_32795_35_non_const_red_acc7232[3072]; + // __cuda_local_var_32693_10_non_const_f = 64 + // __cuda_local_var_32697_9_non_const_virial = 16 + // __cuda_local_var_32747_43_non_const_r6inv = 40 + // __cuda_local_var_32747_50_non_const_prefactor = 48 + // __cuda_local_var_32747_61_non_const__erfc = 44 + .loc 16 290 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 7; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_24322; + .loc 16 300 0 + mov.u64 %rd1, __cuda___cuda_local_var_32682_33_non_const_sp_lj3320; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, 
[__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_24322: + mov.u64 %rd1, __cuda___cuda_local_var_32682_33_non_const_sp_lj3320; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_24834; + .loc 16 302 0 + mov.u64 %rd7, __cuda___cuda_local_var_32680_34_non_const_lj13360; + cvt.s64.s32 %rd8, %r1; + mul.wide.s32 %rd9, %r1, 16; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; + add.u64 %rd11, %rd10, %rd9; + add.u64 %rd12, %rd9, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; + st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r5, 0; + setp.le.s32 %p3, %r4, %r5; + @%p3 bra $Lt_1_25346; + .loc 16 304 0 + mov.u64 %rd13, __cuda___cuda_local_var_32681_34_non_const_lj35296; + ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; + add.u64 %rd15, %rd14, %rd9; + add.u64 %rd16, %rd9, %rd13; + ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; + st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; +$Lt_1_25346: + mov.u64 %rd13, __cuda___cuda_local_var_32681_34_non_const_lj35296; +$Lt_1_24834: + mov.u64 %rd7, __cuda___cuda_local_var_32680_34_non_const_lj13360; + mov.u64 %rd13, __cuda___cuda_local_var_32681_34_non_const_lj35296; + .loc 16 315 0 + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + mov.f32 %f20, 0f00000000; // 0 + mov.f32 %f21, %f20; + .loc 16 317 0 + bar.sync 0; + ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r7, %r1, %r6; + cvt.s32.u32 %r8, %ntid.x; + div.s32 %r9, %r8, %r6; + rem.s32 %r10, %r1, %r6; + cvt.s32.u32 %r11, %ctaid.x; + mul.lo.s32 %r12, %r11, %r9; + add.s32 %r13, %r7, %r12; + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p4, %r13, %r14; + @!%p4 bra $Lt_1_26114; + .loc 16 321 0 + cvt.s64.s32 %rd17, %r13; + mul.wide.s32 %rd18, %r13, 4; + ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd20, %rd18, %rd19; + ld.global.s32 %r15, [%rd20+0]; + .loc 16 323 0 + ld.param.s32 %r16, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd21, %r16; + mul.wide.s32 %rd22, %r16, 4; + add.u64 %rd23, %rd22, %rd20; + ld.global.s32 %r17, [%rd23+0]; + add.u64 %rd24, %rd22, %rd23; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p5, %rd25, %rd19; + @%p5 bra $Lt_1_26626; + .loc 16 329 0 + cvt.s32.s64 %r18, %rd21; + mul.lo.s32 %r19, %r18, %r17; + cvt.s64.s32 %rd26, %r19; + mul.wide.s32 %rd27, %r19, 4; + add.u64 %rd28, %rd24, %rd27; + .loc 16 330 0 + mul.lo.s32 %r20, %r10, %r18; + cvt.s64.s32 %rd29, %r20; + mul.wide.s32 %rd30, %r20, 4; + add.u64 %rd31, %rd24, %rd30; + .loc 16 331 0 + mul.lo.s32 %r21, %r18, %r6; + bra.uni $Lt_1_26370; +$Lt_1_26626: + .loc 16 333 0 + ld.global.s32 %r22, [%rd24+0]; + cvt.s64.s32 %rd32, %r22; + mul.wide.s32 %rd33, %r22, 4; + add.u64 %rd34, %rd25, %rd33; + .loc 16 334 0 + cvt.s64.s32 %rd35, %r17; + mul.wide.s32 %rd36, %r17, 4; + add.u64 %rd28, %rd34, %rd36; + .loc 16 335 0 + mov.s32 %r21, %r6; + .loc 16 336 0 + cvt.s64.s32 %rd37, %r10; + mul.wide.s32 %rd38, %r10, 4; + add.u64 %rd31, %rd34, %rd38; +$Lt_1_26370: + .loc 16 339 0 + mov.u32 %r23, %r15; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + mov.s32 %r28, 0; + 
mov.u32 %r29, %r28; + tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}]; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + mov.f32 %f28, %f24; + mov.f32 %f29, %f25; + .loc 16 340 0 + mov.u32 %r30, %r15; + mov.s32 %r31, 0; + mov.u32 %r32, %r31; + mov.s32 %r33, 0; + mov.u32 %r34, %r33; + mov.s32 %r35, 0; + mov.u32 %r36, %r35; + tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r30,%r32,%r34,%r36}]; + mov.f32 %f34, %f30; + setp.ge.u64 %p6, %rd31, %rd28; + @%p6 bra $Lt_1_36610; + cvt.rzi.ftz.s32.f32 %r37, %f29; + cvt.s64.s32 %rd39, %r21; + mul.lo.s32 %r38, %r37, 11; + cvt.rn.f32.s32 %f35, %r38; + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 +$Lt_1_27394: + //<loop> Loop body line 340, nesting depth: 1, estimated iterations: unknown + .loc 16 345 0 + ld.global.s32 %r39, [%rd31+0]; + .loc 16 348 0 + shr.s32 %r40, %r39, 30; + and.b32 %r41, %r40, 3; + cvt.s64.s32 %rd40, %r41; + mul.wide.s32 %rd41, %r41, 4; + add.u64 %rd42, %rd1, %rd41; + ld.shared.f32 %f41, [%rd42+0]; + .loc 16 349 0 + mov.f32 %f42, 0f3f800000; // 1 + ld.shared.f32 %f43, [%rd42+16]; + sub.ftz.f32 %f44, %f42, %f43; + .loc 16 352 0 + and.b32 %r42, %r39, 1073741823; + mov.u32 %r43, %r42; + mov.s32 %r44, 0; + mov.u32 %r45, %r44; + mov.s32 %r46, 0; + mov.u32 %r47, %r46; + mov.s32 %r48, 0; + mov.u32 %r49, %r48; + tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r43,%r45,%r47,%r49}]; + mov.f32 %f49, %f45; + mov.f32 %f50, %f46; + mov.f32 %f51, %f47; + mov.f32 %f52, %f48; + sub.ftz.f32 %f53, %f27, %f50; + sub.ftz.f32 %f54, %f26, %f49; + sub.ftz.f32 %f55, %f28, %f51; + mul.ftz.f32 %f56, %f53, %f53; + fma.rn.ftz.f32 %f57, %f54, %f54, %f56; + fma.rn.ftz.f32 %f58, %f55, %f55, %f57; + add.ftz.f32 %f59, %f35, %f52; + cvt.rzi.ftz.s32.f32 %r50, %f59; + cvt.s64.s32 %rd43, %r50; + mul.wide.s32 %rd44, %r50, 16; + add.u64 %rd45, %rd44, %rd7; + ld.shared.f32 %f60, [%rd45+8]; + setp.gt.ftz.f32 %p7, %f60, %f58; + @!%p7 bra $Lt_1_30210; + rcp.approx.ftz.f32 %f61, %f58; + ld.shared.f32 %f62, [%rd45+12]; + setp.lt.ftz.f32 %p8, %f58, %f62; + @!%p8 bra $Lt_1_28418; + .loc 16 366 0 + mul.ftz.f32 %f63, %f61, %f61; + mul.ftz.f32 %f64, %f61, %f63; + mov.f32 %f65, %f64; + .loc 16 367 0 + mul.ftz.f32 %f66, %f64, %f41; + ld.shared.v2.f32 {%f67,%f68}, [%rd45+0]; + mul.ftz.f32 %f69, %f67, %f64; + sub.ftz.f32 %f70, %f69, %f68; + mul.ftz.f32 %f71, %f66, %f70; + bra.uni $Lt_1_28162; +$Lt_1_28418: + .loc 16 369 0 + mov.f32 %f71, 0f00000000; // 0 +$Lt_1_28162: + ld.param.f32 %f72, [__cudaparm_kernel_pair_fast_cut_coulsq]; + setp.gt.ftz.f32 %p9, %f72, %f58; + @!%p9 bra $Lt_1_28930; + .loc 16 376 0 + sqrt.approx.ftz.f32 %f73, %f58; + ld.param.f32 %f74, [__cudaparm_kernel_pair_fast_g_ewald]; + mul.ftz.f32 %f75, %f74, %f73; + mul.ftz.f32 %f76, %f75, %f75; + mov.f32 %f77, 0f3f800000; // 1 + mov.f32 %f78, 0f3ea7ba05; // 0.327591 + fma.rn.ftz.f32 %f79, %f78, %f75, %f77; + neg.ftz.f32 %f80, %f76; + rcp.approx.ftz.f32 %f81, %f79; + mov.f32 %f82, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f83, %f80, %f82; + ex2.approx.ftz.f32 %f84, %f83; + mov.f32 %f85, 0f3e827906; // 0.25483 + mov.f32 %f86, 0fbe91a98e; // -0.284497 + mov.f32 %f87, 0f3fb5f0e3; // 1.42141 + mov.f32 %f88, 0fbfba00e3; // -1.45315 + mov.f32 %f89, 0f3f87dc22; // 1.06141 + fma.rn.ftz.f32 %f90, %f89, %f81, %f88; + fma.rn.ftz.f32 %f91, %f81, %f90, %f87; + fma.rn.ftz.f32 %f92, %f81, %f91, %f86; + fma.rn.ftz.f32 %f93, %f81, %f92, %f85; + mul.ftz.f32 %f94, %f81, %f93; + mul.ftz.f32 %f95, %f84, 
%f94; + mov.f32 %f96, %f95; + .loc 16 377 0 + mov.u32 %r51, %r42; + mov.s32 %r52, 0; + mov.u32 %r53, %r52; + mov.s32 %r54, 0; + mov.u32 %r55, %r54; + mov.s32 %r56, 0; + mov.u32 %r57, %r56; + tex.1d.v4.f32.s32 {%f97,%f98,%f99,%f100},[q_tex,{%r51,%r53,%r55,%r57}]; + mov.f32 %f101, %f97; + ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_qqrd2e]; + mul.ftz.f32 %f103, %f102, %f34; + mul.ftz.f32 %f104, %f103, %f101; + div.approx.ftz.f32 %f105, %f104, %f73; + mov.f32 %f106, %f105; + .loc 16 378 0 + mov.f32 %f107, 0f3f906ebb; // 1.12838 + mul.ftz.f32 %f108, %f75, %f107; + fma.rn.ftz.f32 %f109, %f84, %f108, %f95; + sub.ftz.f32 %f110, %f109, %f44; + mul.ftz.f32 %f111, %f105, %f110; + bra.uni $Lt_1_28674; +$Lt_1_28930: + .loc 16 380 0 + mov.f32 %f111, 0f00000000; // 0 +$Lt_1_28674: + .loc 16 384 0 + add.ftz.f32 %f112, %f111, %f71; + mul.ftz.f32 %f113, %f112, %f61; + fma.rn.ftz.f32 %f38, %f54, %f113, %f38; + .loc 16 385 0 + fma.rn.ftz.f32 %f37, %f53, %f113, %f37; + .loc 16 386 0 + fma.rn.ftz.f32 %f36, %f55, %f113, %f36; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r59, 0; + setp.le.s32 %p10, %r58, %r59; + @%p10 bra $Lt_1_29698; + .loc 16 389 0 + mov.f32 %f114, %f106; + mov.f32 %f115, %f96; + sub.ftz.f32 %f116, %f115, %f44; + fma.rn.ftz.f32 %f117, %f114, %f116, %f39; + selp.f32 %f39, %f117, %f39, %p9; + @!%p8 bra $Lt_1_29698; + .loc 16 392 0 + add.u64 %rd46, %rd44, %rd13; + mov.f32 %f118, %f65; + ld.shared.v4.f32 {%f119,%f120,%f121,_}, [%rd46+0]; + mul.ftz.f32 %f122, %f119, %f118; + sub.ftz.f32 %f123, %f122, %f120; + mul.ftz.f32 %f124, %f118, %f123; + .loc 16 393 0 + sub.ftz.f32 %f125, %f124, %f121; + fma.rn.ftz.f32 %f40, %f41, %f125, %f40; +$Lt_1_29698: +$Lt_1_29186: + ld.param.s32 %r60, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r61, 0; + setp.le.s32 %p11, %r60, %r61; + @%p11 bra $Lt_1_30210; + .loc 16 397 0 + mov.f32 %f126, %f11; + mul.ftz.f32 %f127, %f54, %f54; + fma.rn.ftz.f32 %f128, %f113, %f127, %f126; + mov.f32 %f11, %f128; + .loc 16 398 0 + mov.f32 %f129, %f13; + fma.rn.ftz.f32 %f130, %f113, %f56, %f129; + mov.f32 %f13, %f130; + .loc 16 399 0 + mov.f32 %f131, %f15; + mul.ftz.f32 %f132, %f55, %f55; + fma.rn.ftz.f32 %f133, %f113, %f132, %f131; + mov.f32 %f15, %f133; + .loc 16 400 0 + mov.f32 %f134, %f17; + mul.ftz.f32 %f135, %f53, %f54; + fma.rn.ftz.f32 %f136, %f113, %f135, %f134; + mov.f32 %f17, %f136; + .loc 16 401 0 + mov.f32 %f137, %f19; + mul.ftz.f32 %f138, %f54, %f55; + fma.rn.ftz.f32 %f139, %f113, %f138, %f137; + mov.f32 %f19, %f139; + .loc 16 402 0 + mul.ftz.f32 %f140, %f53, %f55; + fma.rn.ftz.f32 %f20, %f113, %f140, %f20; + mov.f32 %f21, %f20; +$Lt_1_30210: +$Lt_1_27650: + .loc 16 344 0 + mul.lo.u64 %rd47, %rd39, 4; + add.u64 %rd31, %rd31, %rd47; + setp.lt.u64 %p12, %rd31, %rd28; + @%p12 bra $Lt_1_27394; + bra.uni $Lt_1_25858; +$Lt_1_36610: + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 + bra.uni $Lt_1_25858; +$Lt_1_26114: + mov.f32 %f36, 0f00000000; // 0 + mov.f32 %f37, 0f00000000; // 0 + mov.f32 %f38, 0f00000000; // 0 + mov.f32 %f39, 0f00000000; // 0 + mov.f32 %f40, 0f00000000; // 0 +$Lt_1_25858: + mov.u32 %r62, 1; + setp.le.s32 %p13, %r6, %r62; + @%p13 bra $Lt_1_33026; + .loc 16 413 0 + mov.u64 %rd48, __cuda___cuda_local_var_32795_35_non_const_red_acc7232; + cvt.s64.s32 %rd49, %r1; + mul.wide.s32 %rd50, %r1, 4; + add.u64 %rd51, %rd48, %rd50; + mov.f32 %f141, %f38; + st.shared.f32 [%rd51+0], %f141; + .loc 16 414 0 + mov.f32 %f142, %f37; + 
st.shared.f32 [%rd51+512], %f142; + .loc 16 415 0 + mov.f32 %f143, %f36; + st.shared.f32 [%rd51+1024], %f143; + .loc 16 416 0 + mov.f32 %f144, %f40; + st.shared.f32 [%rd51+1536], %f144; + .loc 16 417 0 + mov.f32 %f145, %f39; + st.shared.f32 [%rd51+2048], %f145; + .loc 16 419 0 + shr.s32 %r63, %r6, 31; + mov.s32 %r64, 1; + and.b32 %r65, %r63, %r64; + add.s32 %r66, %r65, %r6; + shr.s32 %r67, %r66, 1; + mov.s32 %r68, %r67; + mov.u32 %r69, 0; + setp.ne.u32 %p14, %r67, %r69; + @!%p14 bra $Lt_1_31490; +$Lt_1_32002: + setp.ge.u32 %p15, %r10, %r68; + @%p15 bra $Lt_1_32258; + .loc 16 422 0 + add.u32 %r70, %r1, %r68; + cvt.u64.u32 %rd52, %r70; + mul.wide.u32 %rd53, %r70, 4; + add.u64 %rd54, %rd48, %rd53; + ld.shared.f32 %f146, [%rd54+0]; + add.ftz.f32 %f141, %f146, %f141; + st.shared.f32 [%rd51+0], %f141; + ld.shared.f32 %f147, [%rd54+512]; + add.ftz.f32 %f142, %f147, %f142; + st.shared.f32 [%rd51+512], %f142; + ld.shared.f32 %f148, [%rd54+1024]; + add.ftz.f32 %f143, %f148, %f143; + st.shared.f32 [%rd51+1024], %f143; + ld.shared.f32 %f149, [%rd54+1536]; + add.ftz.f32 %f144, %f149, %f144; + st.shared.f32 [%rd51+1536], %f144; + ld.shared.f32 %f150, [%rd54+2048]; + add.ftz.f32 %f145, %f150, %f145; + st.shared.f32 [%rd51+2048], %f145; +$Lt_1_32258: + .loc 16 419 0 + shr.u32 %r68, %r68, 1; + mov.u32 %r71, 0; + setp.ne.u32 %p16, %r68, %r71; + @%p16 bra $Lt_1_32002; +$Lt_1_31490: + .loc 16 426 0 + mov.f32 %f38, %f141; + .loc 16 427 0 + mov.f32 %f37, %f142; + .loc 16 428 0 + mov.f32 %f36, %f143; + .loc 16 429 0 + mov.f32 %f40, %f144; + .loc 16 430 0 + mov.f32 %f39, %f145; + ld.param.s32 %r72, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r73, 0; + setp.le.s32 %p17, %r72, %r73; + @%p17 bra $Lt_1_33026; + .loc 16 434 0 + mov.f32 %f141, %f11; + st.shared.f32 [%rd51+0], %f141; + mov.f32 %f142, %f13; + st.shared.f32 [%rd51+512], %f142; + mov.f32 %f143, %f15; + st.shared.f32 [%rd51+1024], %f143; + mov.f32 %f144, %f17; + st.shared.f32 [%rd51+1536], %f144; + mov.f32 %f145, %f19; + st.shared.f32 [%rd51+2048], %f145; + mov.f32 %f151, %f21; + st.shared.f32 [%rd51+2560], %f151; + .loc 16 436 0 + mov.s32 %r74, %r67; + @!%p14 bra $Lt_1_33538; +$Lt_1_34050: + setp.ge.u32 %p18, %r10, %r74; + @%p18 bra $Lt_1_34306; + .loc 16 439 0 + add.u32 %r75, %r1, %r74; + cvt.u64.u32 %rd55, %r75; + mul.wide.u32 %rd56, %r75, 4; + add.u64 %rd57, %rd48, %rd56; + ld.shared.f32 %f152, [%rd57+0]; + add.ftz.f32 %f141, %f152, %f141; + st.shared.f32 [%rd51+0], %f141; + ld.shared.f32 %f153, [%rd57+512]; + add.ftz.f32 %f142, %f153, %f142; + st.shared.f32 [%rd51+512], %f142; + ld.shared.f32 %f154, [%rd57+1024]; + add.ftz.f32 %f143, %f154, %f143; + st.shared.f32 [%rd51+1024], %f143; + ld.shared.f32 %f155, [%rd57+1536]; + add.ftz.f32 %f144, %f155, %f144; + st.shared.f32 [%rd51+1536], %f144; + ld.shared.f32 %f156, [%rd57+2048]; + add.ftz.f32 %f145, %f156, %f145; + st.shared.f32 [%rd51+2048], %f145; + ld.shared.f32 %f157, [%rd57+2560]; + add.ftz.f32 %f151, %f157, %f151; + st.shared.f32 [%rd51+2560], %f151; +$Lt_1_34306: + .loc 16 436 0 + shr.u32 %r74, %r74, 1; + mov.u32 %r76, 0; + setp.ne.u32 %p19, %r74, %r76; + @%p19 bra $Lt_1_34050; +$Lt_1_33538: + .loc 16 444 0 + mov.f32 %f11, %f141; + mov.f32 %f13, %f142; + mov.f32 %f15, %f143; + mov.f32 %f17, %f144; + mov.f32 %f19, %f145; + mov.f32 %f21, %f151; +$Lt_1_33026: +$Lt_1_30978: + selp.s32 %r77, 1, 0, %p4; + mov.s32 %r78, 0; + set.eq.u32.s32 %r79, %r10, %r78; + neg.s32 %r80, %r79; + and.b32 %r81, %r77, %r80; + mov.u32 %r82, 0; + setp.eq.s32 %p20, %r81, %r82; + @%p20 bra $Lt_1_35074; + .loc 16 450 0 + 
cvt.s64.s32 %rd58, %r13; + ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd60, %r13, 4; + add.u64 %rd61, %rd59, %rd60; + ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r84, 0; + setp.le.s32 %p21, %r83, %r84; + @%p21 bra $Lt_1_35586; + .loc 16 452 0 + st.global.f32 [%rd61+0], %f40; + .loc 16 453 0 + cvt.s64.s32 %rd62, %r14; + mul.wide.s32 %rd63, %r14, 4; + add.u64 %rd64, %rd63, %rd61; + .loc 16 454 0 + st.global.f32 [%rd64+0], %f39; + .loc 16 455 0 + add.u64 %rd61, %rd63, %rd64; +$Lt_1_35586: + ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r86, 0; + setp.le.s32 %p22, %r85, %r86; + @%p22 bra $Lt_1_36098; + .loc 16 459 0 + mov.f32 %f158, %f11; + st.global.f32 [%rd61+0], %f158; + .loc 16 460 0 + cvt.s64.s32 %rd65, %r14; + mul.wide.s32 %rd66, %r14, 4; + add.u64 %rd67, %rd66, %rd61; + .loc 16 459 0 + mov.f32 %f159, %f13; + st.global.f32 [%rd67+0], %f159; + .loc 16 460 0 + add.u64 %rd68, %rd66, %rd67; + .loc 16 459 0 + mov.f32 %f160, %f15; + st.global.f32 [%rd68+0], %f160; + .loc 16 460 0 + add.u64 %rd69, %rd66, %rd68; + .loc 16 459 0 + mov.f32 %f161, %f17; + st.global.f32 [%rd69+0], %f161; + .loc 16 460 0 + add.u64 %rd61, %rd66, %rd69; + .loc 16 459 0 + mov.f32 %f162, %f19; + st.global.f32 [%rd61+0], %f162; + mov.f32 %f163, %f21; + add.u64 %rd70, %rd66, %rd61; + st.global.f32 [%rd70+0], %f163; +$Lt_1_36098: + .loc 16 463 0 + ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd72, %rd58, 16; + add.u64 %rd73, %rd71, %rd72; + mov.f32 %f164, %f165; + st.global.v4.f32 [%rd73+0], {%f38,%f37,%f36,%f164}; +$Lt_1_35074: + .loc 16 465 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/ljcl_cut_gpu_ptx.h b/lib/gpu/ljcl_cut_gpu_ptx.h new file mode 100644 index 000000000..04b42886b --- /dev/null +++ b/lib/gpu/ljcl_cut_gpu_ptx.h @@ -0,0 +1,1153 @@ +const char * ljcl_cut_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_lj1,\n" +" .param .u64 __cudaparm_kernel_pair_lj3,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_q_,\n" +" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" +" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 %r<86>;\n" +" .reg .u64 %rd<63>;\n" +" .reg .f32 %f<164>;\n" +" .reg .pred %p<21>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_lj112[32];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32610_35_non_const_red_acc144[3072];\n" +" .loc 16 108 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 115 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 116 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 117 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 118 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" 
+" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 119 0\n" +" ld.global.f32 %f5, [%rd1+16];\n" +" .loc 16 120 0\n" +" ld.global.f32 %f6, [%rd1+20];\n" +" .loc 16 121 0\n" +" ld.global.f32 %f7, [%rd1+24];\n" +" .loc 16 122 0\n" +" ld.global.f32 %f8, [%rd1+28];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" +" .loc 16 132 0\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" mov.f32 %f17, 0f00000000; \n" +" mov.f32 %f18, %f17;\n" +" mov.f32 %f19, 0f00000000; \n" +" mov.f32 %f20, %f19;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_22274;\n" +" .loc 16 136 0\n" +" cvt.s64.s32 %rd2, %r9;\n" +" mul.wide.s32 %rd3, %r9, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd5, %rd3, %rd4;\n" +" ld.global.s32 %r11, [%rd5+0];\n" +" .loc 16 138 0\n" +" ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd6, %r12;\n" +" mul.wide.s32 %rd7, %r12, 4;\n" +" add.u64 %rd8, %rd7, %rd5;\n" +" ld.global.s32 %r13, [%rd8+0];\n" +" add.u64 %rd9, %rd7, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd4;\n" +" @%p2 bra $Lt_0_22786;\n" +" .loc 16 144 0\n" +" cvt.s32.s64 %r14, %rd6;\n" +" mul.lo.s32 %r15, %r14, %r13;\n" +" cvt.s64.s32 %rd11, %r15;\n" +" mul.wide.s32 %rd12, %r15, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 145 0\n" +" mul.lo.s32 %r16, %r6, %r14;\n" +" cvt.s64.s32 %rd14, %r16;\n" +" mul.wide.s32 %rd15, %r16, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 146 0\n" +" mul.lo.s32 %r17, %r14, %r1;\n" +" bra.uni $Lt_0_22530;\n" +"$Lt_0_22786:\n" +" .loc 16 148 0\n" +" ld.global.s32 %r18, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r18;\n" +" mul.wide.s32 %rd18, %r18, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 149 0\n" +" cvt.s64.s32 %rd20, %r13;\n" +" mul.wide.s32 %rd21, %r13, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 150 0\n" +" mov.s32 %r17, %r1;\n" +" .loc 16 151 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_22530:\n" +" .loc 16 154 0\n" +" mov.u32 %r19, %r11;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f25, %f21;\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" .loc 16 155 0\n" +" mov.u32 %r26, %r11;\n" +" mov.s32 %r27, 0;\n" +" mov.u32 %r28, %r27;\n" +" mov.s32 %r29, 0;\n" +" mov.u32 %r30, %r29;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r26,%r28,%r30,%r32}];\n" +" mov.f32 %f33, %f29;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_32770;\n" +" cvt.rzi.ftz.s32.f32 %r33, %f28;\n" +" cvt.s64.s32 %rd24, %r17;\n" +" ld.param.s32 %r34, 
[__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r35, %r34, %r33;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32498_33_non_const_sp_lj112;\n" +"$Lt_0_23554:\n" +" .loc 16 159 0\n" +" ld.global.s32 %r36, [%rd16+0];\n" +" .loc 16 162 0\n" +" shr.s32 %r37, %r36, 30;\n" +" and.b32 %r38, %r37, 3;\n" +" cvt.s64.s32 %rd27, %r38;\n" +" mul.wide.s32 %rd28, %r38, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f39, [%rd29+0];\n" +" .loc 16 163 0\n" +" mov.f32 %f40, 0f3f800000; \n" +" ld.shared.f32 %f41, [%rd29+16];\n" +" sub.ftz.f32 %f42, %f40, %f41;\n" +" .loc 16 166 0\n" +" and.b32 %r39, %r36, 1073741823;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" mov.s32 %r43, 0;\n" +" mov.u32 %r44, %r43;\n" +" mov.s32 %r45, 0;\n" +" mov.u32 %r46, %r45;\n" +" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r40,%r42,%r44,%r46}];\n" +" mov.f32 %f47, %f43;\n" +" mov.f32 %f48, %f44;\n" +" mov.f32 %f49, %f45;\n" +" mov.f32 %f50, %f46;\n" +" cvt.rzi.ftz.s32.f32 %r47, %f50;\n" +" sub.ftz.f32 %f51, %f26, %f48;\n" +" sub.ftz.f32 %f52, %f25, %f47;\n" +" sub.ftz.f32 %f53, %f27, %f49;\n" +" mul.ftz.f32 %f54, %f51, %f51;\n" +" fma.rn.ftz.f32 %f55, %f52, %f52, %f54;\n" +" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" +" add.s32 %r48, %r47, %r35;\n" +" cvt.s64.s32 %rd30, %r48;\n" +" mul.wide.s32 %rd31, %r48, 16;\n" +" add.u64 %rd32, %rd31, %rd25;\n" +" ld.global.f32 %f57, [%rd32+8];\n" +" setp.gt.ftz.f32 %p4, %f57, %f56;\n" +" @!%p4 bra $Lt_0_26370;\n" +" rcp.approx.ftz.f32 %f58, %f56;\n" +" ld.global.f32 %f59, [%rd32+12];\n" +" setp.lt.ftz.f32 %p5, %f56, %f59;\n" +" @!%p5 bra $Lt_0_24578;\n" +" .loc 16 181 0\n" +" mul.ftz.f32 %f60, %f58, %f58;\n" +" mul.ftz.f32 %f61, %f58, %f60;\n" +" mov.f32 %f62, %f61;\n" +" .loc 16 182 0\n" +" mul.ftz.f32 %f63, %f61, %f39;\n" +" ld.global.v2.f32 {%f64,%f65}, [%rd32+0];\n" +" mul.ftz.f32 %f66, %f64, %f61;\n" +" sub.ftz.f32 %f67, %f66, %f65;\n" +" mul.ftz.f32 %f68, %f63, %f67;\n" +" bra.uni $Lt_0_24322;\n" +"$Lt_0_24578:\n" +" .loc 16 184 0\n" +" mov.f32 %f68, 0f00000000; \n" +"$Lt_0_24322:\n" +" ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_coulsq];\n" +" setp.gt.ftz.f32 %p6, %f69, %f56;\n" +" @!%p6 bra $Lt_0_25090;\n" +" .loc 16 191 0\n" +" sqrt.approx.ftz.f32 %f70, %f56;\n" +" ld.param.f32 %f71, [__cudaparm_kernel_pair_g_ewald];\n" +" mul.ftz.f32 %f72, %f71, %f70;\n" +" mul.ftz.f32 %f73, %f72, %f72;\n" +" mov.f32 %f74, 0f3f800000; \n" +" mov.f32 %f75, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f76, %f75, %f72, %f74;\n" +" neg.ftz.f32 %f77, %f73;\n" +" rcp.approx.ftz.f32 %f78, %f76;\n" +" mov.f32 %f79, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f80, %f77, %f79;\n" +" ex2.approx.ftz.f32 %f81, %f80;\n" +" mov.f32 %f82, 0f3e827906; \n" +" mov.f32 %f83, 0fbe91a98e; \n" +" mov.f32 %f84, 0f3fb5f0e3; \n" +" mov.f32 %f85, 0fbfba00e3; \n" +" mov.f32 %f86, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f87, %f86, %f78, %f85;\n" +" fma.rn.ftz.f32 %f88, %f78, %f87, %f84;\n" +" fma.rn.ftz.f32 %f89, %f78, %f88, %f83;\n" +" fma.rn.ftz.f32 %f90, %f78, %f89, %f82;\n" +" mul.ftz.f32 %f91, %f78, %f90;\n" +" mul.ftz.f32 %f92, %f81, %f91;\n" +" mov.f32 %f93, %f92;\n" +" .loc 16 192 0\n" +" mov.u32 %r49, %r39;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" 
tex.1d.v4.f32.s32 {%f94,%f95,%f96,%f97},[q_tex,{%r49,%r51,%r53,%r55}];\n" +" mov.f32 %f98, %f94;\n" +" ld.param.f32 %f99, [__cudaparm_kernel_pair_qqrd2e];\n" +" mul.ftz.f32 %f100, %f99, %f33;\n" +" mul.ftz.f32 %f101, %f100, %f98;\n" +" div.approx.ftz.f32 %f102, %f101, %f70;\n" +" mov.f32 %f103, %f102;\n" +" .loc 16 193 0\n" +" mov.f32 %f104, 0f3f906ebb; \n" +" mul.ftz.f32 %f105, %f72, %f104;\n" +" fma.rn.ftz.f32 %f106, %f81, %f105, %f92;\n" +" sub.ftz.f32 %f107, %f106, %f42;\n" +" mul.ftz.f32 %f108, %f102, %f107;\n" +" bra.uni $Lt_0_24834;\n" +"$Lt_0_25090:\n" +" .loc 16 195 0\n" +" mov.f32 %f108, 0f00000000; \n" +"$Lt_0_24834:\n" +" .loc 16 199 0\n" +" add.ftz.f32 %f109, %f108, %f68;\n" +" mul.ftz.f32 %f110, %f109, %f58;\n" +" fma.rn.ftz.f32 %f36, %f52, %f110, %f36;\n" +" .loc 16 200 0\n" +" fma.rn.ftz.f32 %f35, %f51, %f110, %f35;\n" +" .loc 16 201 0\n" +" fma.rn.ftz.f32 %f34, %f53, %f110, %f34;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p7, %r56, %r57;\n" +" @%p7 bra $Lt_0_25858;\n" +" .loc 16 204 0\n" +" mov.f32 %f111, %f103;\n" +" mov.f32 %f112, %f93;\n" +" sub.ftz.f32 %f113, %f112, %f42;\n" +" fma.rn.ftz.f32 %f114, %f111, %f113, %f37;\n" +" selp.f32 %f37, %f114, %f37, %p6;\n" +" @!%p5 bra $Lt_0_25858;\n" +" .loc 16 208 0\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n" +" add.u64 %rd34, %rd33, %rd31;\n" +" mov.f32 %f115, %f62;\n" +" ld.global.v4.f32 {%f116,%f117,%f118,_}, [%rd34+0];\n" +" mul.ftz.f32 %f119, %f116, %f115;\n" +" sub.ftz.f32 %f120, %f119, %f117;\n" +" mul.ftz.f32 %f121, %f115, %f120;\n" +" sub.ftz.f32 %f122, %f121, %f118;\n" +" fma.rn.ftz.f32 %f38, %f39, %f122, %f38;\n" +"$Lt_0_25858:\n" +"$Lt_0_25346:\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p8, %r58, %r59;\n" +" @%p8 bra $Lt_0_26370;\n" +" .loc 16 212 0\n" +" mov.f32 %f123, %f10;\n" +" mul.ftz.f32 %f124, %f52, %f52;\n" +" fma.rn.ftz.f32 %f125, %f110, %f124, %f123;\n" +" mov.f32 %f10, %f125;\n" +" .loc 16 213 0\n" +" mov.f32 %f126, %f12;\n" +" fma.rn.ftz.f32 %f127, %f110, %f54, %f126;\n" +" mov.f32 %f12, %f127;\n" +" .loc 16 214 0\n" +" mov.f32 %f128, %f14;\n" +" mul.ftz.f32 %f129, %f53, %f53;\n" +" fma.rn.ftz.f32 %f130, %f110, %f129, %f128;\n" +" mov.f32 %f14, %f130;\n" +" .loc 16 215 0\n" +" mov.f32 %f131, %f16;\n" +" mul.ftz.f32 %f132, %f51, %f52;\n" +" fma.rn.ftz.f32 %f133, %f110, %f132, %f131;\n" +" mov.f32 %f16, %f133;\n" +" .loc 16 216 0\n" +" mov.f32 %f134, %f18;\n" +" mul.ftz.f32 %f135, %f52, %f53;\n" +" fma.rn.ftz.f32 %f136, %f110, %f135, %f134;\n" +" mov.f32 %f18, %f136;\n" +" .loc 16 217 0\n" +" mul.ftz.f32 %f137, %f51, %f53;\n" +" fma.rn.ftz.f32 %f19, %f110, %f137, %f19;\n" +" mov.f32 %f20, %f19;\n" +"$Lt_0_26370:\n" +"$Lt_0_23810:\n" +" .loc 16 158 0\n" +" mul.lo.u64 %rd35, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd35;\n" +" setp.lt.u64 %p9, %rd16, %rd13;\n" +" @%p9 bra $Lt_0_23554;\n" +" bra.uni $Lt_0_22018;\n" +"$Lt_0_32770:\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" bra.uni $Lt_0_22018;\n" +"$Lt_0_22274:\n" +" mov.f32 %f34, 0f00000000; \n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +"$Lt_0_22018:\n" +" mov.u32 %r60, 1;\n" +" setp.le.s32 %p10, %r1, %r60;\n" +" @%p10 bra $Lt_0_29186;\n" +" .loc 16 228 0\n" +" mov.u64 %rd36, 
__cuda___cuda_local_var_32610_35_non_const_red_acc144;\n" +" cvt.s64.s32 %rd37, %r2;\n" +" mul.wide.s32 %rd38, %r2, 4;\n" +" add.u64 %rd39, %rd36, %rd38;\n" +" mov.f32 %f138, %f36;\n" +" st.shared.f32 [%rd39+0], %f138;\n" +" .loc 16 229 0\n" +" mov.f32 %f139, %f35;\n" +" st.shared.f32 [%rd39+512], %f139;\n" +" .loc 16 230 0\n" +" mov.f32 %f140, %f34;\n" +" st.shared.f32 [%rd39+1024], %f140;\n" +" .loc 16 231 0\n" +" mov.f32 %f141, %f38;\n" +" st.shared.f32 [%rd39+1536], %f141;\n" +" .loc 16 232 0\n" +" mov.f32 %f142, %f37;\n" +" st.shared.f32 [%rd39+2048], %f142;\n" +" .loc 16 234 0\n" +" shr.s32 %r61, %r1, 31;\n" +" mov.s32 %r62, 1;\n" +" and.b32 %r63, %r61, %r62;\n" +" add.s32 %r64, %r63, %r1;\n" +" shr.s32 %r65, %r64, 1;\n" +" mov.s32 %r66, %r65;\n" +" mov.u32 %r67, 0;\n" +" setp.ne.u32 %p11, %r65, %r67;\n" +" @!%p11 bra $Lt_0_27650;\n" +"$Lt_0_28162:\n" +" setp.ge.u32 %p12, %r6, %r66;\n" +" @%p12 bra $Lt_0_28418;\n" +" .loc 16 237 0\n" +" add.u32 %r68, %r2, %r66;\n" +" cvt.u64.u32 %rd40, %r68;\n" +" mul.wide.u32 %rd41, %r68, 4;\n" +" add.u64 %rd42, %rd36, %rd41;\n" +" ld.shared.f32 %f143, [%rd42+0];\n" +" add.ftz.f32 %f138, %f143, %f138;\n" +" st.shared.f32 [%rd39+0], %f138;\n" +" ld.shared.f32 %f144, [%rd42+512];\n" +" add.ftz.f32 %f139, %f144, %f139;\n" +" st.shared.f32 [%rd39+512], %f139;\n" +" ld.shared.f32 %f145, [%rd42+1024];\n" +" add.ftz.f32 %f140, %f145, %f140;\n" +" st.shared.f32 [%rd39+1024], %f140;\n" +" ld.shared.f32 %f146, [%rd42+1536];\n" +" add.ftz.f32 %f141, %f146, %f141;\n" +" st.shared.f32 [%rd39+1536], %f141;\n" +" ld.shared.f32 %f147, [%rd42+2048];\n" +" add.ftz.f32 %f142, %f147, %f142;\n" +" st.shared.f32 [%rd39+2048], %f142;\n" +"$Lt_0_28418:\n" +" .loc 16 234 0\n" +" shr.u32 %r66, %r66, 1;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p13, %r66, %r69;\n" +" @%p13 bra $Lt_0_28162;\n" +"$Lt_0_27650:\n" +" .loc 16 241 0\n" +" mov.f32 %f36, %f138;\n" +" .loc 16 242 0\n" +" mov.f32 %f35, %f139;\n" +" .loc 16 243 0\n" +" mov.f32 %f34, %f140;\n" +" .loc 16 244 0\n" +" mov.f32 %f38, %f141;\n" +" .loc 16 245 0\n" +" mov.f32 %f37, %f142;\n" +" ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r71, 0;\n" +" setp.le.s32 %p14, %r70, %r71;\n" +" @%p14 bra $Lt_0_29186;\n" +" .loc 16 249 0\n" +" mov.f32 %f138, %f10;\n" +" st.shared.f32 [%rd39+0], %f138;\n" +" mov.f32 %f139, %f12;\n" +" st.shared.f32 [%rd39+512], %f139;\n" +" mov.f32 %f140, %f14;\n" +" st.shared.f32 [%rd39+1024], %f140;\n" +" mov.f32 %f141, %f16;\n" +" st.shared.f32 [%rd39+1536], %f141;\n" +" mov.f32 %f142, %f18;\n" +" st.shared.f32 [%rd39+2048], %f142;\n" +" mov.f32 %f148, %f20;\n" +" st.shared.f32 [%rd39+2560], %f148;\n" +" .loc 16 251 0\n" +" mov.s32 %r72, %r65;\n" +" @!%p11 bra $Lt_0_29698;\n" +"$Lt_0_30210:\n" +" setp.ge.u32 %p15, %r6, %r72;\n" +" @%p15 bra $Lt_0_30466;\n" +" .loc 16 254 0\n" +" add.u32 %r73, %r2, %r72;\n" +" cvt.u64.u32 %rd43, %r73;\n" +" mul.wide.u32 %rd44, %r73, 4;\n" +" add.u64 %rd45, %rd36, %rd44;\n" +" ld.shared.f32 %f149, [%rd45+0];\n" +" add.ftz.f32 %f138, %f149, %f138;\n" +" st.shared.f32 [%rd39+0], %f138;\n" +" ld.shared.f32 %f150, [%rd45+512];\n" +" add.ftz.f32 %f139, %f150, %f139;\n" +" st.shared.f32 [%rd39+512], %f139;\n" +" ld.shared.f32 %f151, [%rd45+1024];\n" +" add.ftz.f32 %f140, %f151, %f140;\n" +" st.shared.f32 [%rd39+1024], %f140;\n" +" ld.shared.f32 %f152, [%rd45+1536];\n" +" add.ftz.f32 %f141, %f152, %f141;\n" +" st.shared.f32 [%rd39+1536], %f141;\n" +" ld.shared.f32 %f153, [%rd45+2048];\n" +" add.ftz.f32 %f142, %f153, %f142;\n" +" st.shared.f32 
[%rd39+2048], %f142;\n" +" ld.shared.f32 %f154, [%rd45+2560];\n" +" add.ftz.f32 %f148, %f154, %f148;\n" +" st.shared.f32 [%rd39+2560], %f148;\n" +"$Lt_0_30466:\n" +" .loc 16 251 0\n" +" shr.u32 %r72, %r72, 1;\n" +" mov.u32 %r74, 0;\n" +" setp.ne.u32 %p16, %r72, %r74;\n" +" @%p16 bra $Lt_0_30210;\n" +"$Lt_0_29698:\n" +" .loc 16 259 0\n" +" mov.f32 %f10, %f138;\n" +" mov.f32 %f12, %f139;\n" +" mov.f32 %f14, %f140;\n" +" mov.f32 %f16, %f141;\n" +" mov.f32 %f18, %f142;\n" +" mov.f32 %f20, %f148;\n" +"$Lt_0_29186:\n" +"$Lt_0_27138:\n" +" selp.s32 %r75, 1, 0, %p1;\n" +" mov.s32 %r76, 0;\n" +" set.eq.u32.s32 %r77, %r6, %r76;\n" +" neg.s32 %r78, %r77;\n" +" and.b32 %r79, %r75, %r78;\n" +" mov.u32 %r80, 0;\n" +" setp.eq.s32 %p17, %r79, %r80;\n" +" @%p17 bra $Lt_0_31234;\n" +" .loc 16 265 0\n" +" cvt.s64.s32 %rd46, %r9;\n" +" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd48, %r9, 4;\n" +" add.u64 %rd49, %rd47, %rd48;\n" +" ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r82, 0;\n" +" setp.le.s32 %p18, %r81, %r82;\n" +" @%p18 bra $Lt_0_31746;\n" +" .loc 16 267 0\n" +" st.global.f32 [%rd49+0], %f38;\n" +" .loc 16 268 0\n" +" cvt.s64.s32 %rd50, %r10;\n" +" mul.wide.s32 %rd51, %r10, 4;\n" +" add.u64 %rd52, %rd51, %rd49;\n" +" .loc 16 269 0\n" +" st.global.f32 [%rd52+0], %f37;\n" +" .loc 16 270 0\n" +" add.u64 %rd49, %rd51, %rd52;\n" +"$Lt_0_31746:\n" +" ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r84, 0;\n" +" setp.le.s32 %p19, %r83, %r84;\n" +" @%p19 bra $Lt_0_32258;\n" +" .loc 16 274 0\n" +" mov.f32 %f155, %f10;\n" +" st.global.f32 [%rd49+0], %f155;\n" +" .loc 16 275 0\n" +" cvt.s64.s32 %rd53, %r10;\n" +" mul.wide.s32 %rd54, %r10, 4;\n" +" add.u64 %rd55, %rd54, %rd49;\n" +" .loc 16 274 0\n" +" mov.f32 %f156, %f12;\n" +" st.global.f32 [%rd55+0], %f156;\n" +" .loc 16 275 0\n" +" add.u64 %rd56, %rd54, %rd55;\n" +" .loc 16 274 0\n" +" mov.f32 %f157, %f14;\n" +" st.global.f32 [%rd56+0], %f157;\n" +" .loc 16 275 0\n" +" add.u64 %rd57, %rd54, %rd56;\n" +" .loc 16 274 0\n" +" mov.f32 %f158, %f16;\n" +" st.global.f32 [%rd57+0], %f158;\n" +" .loc 16 275 0\n" +" add.u64 %rd49, %rd54, %rd57;\n" +" .loc 16 274 0\n" +" mov.f32 %f159, %f18;\n" +" st.global.f32 [%rd49+0], %f159;\n" +" mov.f32 %f160, %f20;\n" +" add.u64 %rd58, %rd54, %rd49;\n" +" st.global.f32 [%rd58+0], %f160;\n" +"$Lt_0_32258:\n" +" .loc 16 278 0\n" +" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd60, %rd46, 16;\n" +" add.u64 %rd61, %rd59, %rd60;\n" +" mov.f32 %f161, %f162;\n" +" st.global.v4.f32 [%rd61+0], {%f36,%f35,%f34,%f161};\n" +"$Lt_0_31234:\n" +" .loc 16 280 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" +" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" +" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" +" .param .f32 
__cudaparm_kernel_pair_fast_g_ewald,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<88>;\n" +" .reg .u64 %rd<75>;\n" +" .reg .f32 %f<167>;\n" +" .reg .pred %p<24>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32682_33_non_const_sp_lj3320[32];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32680_34_non_const_lj13360[1936];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32681_34_non_const_lj35296[1936];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32795_35_non_const_red_acc7232[3072];\n" +" .loc 16 290 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 7;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_24322;\n" +" .loc 16 300 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32682_33_non_const_sp_lj3320;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_24322:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32682_33_non_const_sp_lj3320;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_24834;\n" +" .loc 16 302 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32680_34_non_const_lj13360;\n" +" cvt.s64.s32 %rd8, %r1;\n" +" mul.wide.s32 %rd9, %r1, 16;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" +" add.u64 %rd11, %rd10, %rd9;\n" +" add.u64 %rd12, %rd9, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" +" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r5, 0;\n" +" setp.le.s32 %p3, %r4, %r5;\n" +" @%p3 bra $Lt_1_25346;\n" +" .loc 16 304 0\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32681_34_non_const_lj35296;\n" +" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" +" add.u64 %rd15, %rd14, %rd9;\n" +" add.u64 %rd16, %rd9, %rd13;\n" +" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" +" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" +"$Lt_1_25346:\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32681_34_non_const_lj35296;\n" +"$Lt_1_24834:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32680_34_non_const_lj13360;\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32681_34_non_const_lj35296;\n" +" .loc 16 315 0\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" mov.f32 %f20, 0f00000000; \n" +" mov.f32 %f21, %f20;\n" +" .loc 16 317 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r7, %r1, %r6;\n" +" cvt.s32.u32 %r8, %ntid.x;\n" +" div.s32 %r9, %r8, %r6;\n" +" rem.s32 %r10, %r1, %r6;\n" +" cvt.s32.u32 %r11, %ctaid.x;\n" +" mul.lo.s32 %r12, %r11, %r9;\n" +" add.s32 %r13, %r7, %r12;\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p4, %r13, %r14;\n" +" @!%p4 bra $Lt_1_26114;\n" +" .loc 16 321 0\n" +" cvt.s64.s32 %rd17, %r13;\n" +" mul.wide.s32 %rd18, %r13, 4;\n" +" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd20, %rd18, %rd19;\n" +" ld.global.s32 %r15, [%rd20+0];\n" +" .loc 16 323 0\n" +" ld.param.s32 %r16, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd21, %r16;\n" +" mul.wide.s32 
%rd22, %r16, 4;\n" +" add.u64 %rd23, %rd22, %rd20;\n" +" ld.global.s32 %r17, [%rd23+0];\n" +" add.u64 %rd24, %rd22, %rd23;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p5, %rd25, %rd19;\n" +" @%p5 bra $Lt_1_26626;\n" +" .loc 16 329 0\n" +" cvt.s32.s64 %r18, %rd21;\n" +" mul.lo.s32 %r19, %r18, %r17;\n" +" cvt.s64.s32 %rd26, %r19;\n" +" mul.wide.s32 %rd27, %r19, 4;\n" +" add.u64 %rd28, %rd24, %rd27;\n" +" .loc 16 330 0\n" +" mul.lo.s32 %r20, %r10, %r18;\n" +" cvt.s64.s32 %rd29, %r20;\n" +" mul.wide.s32 %rd30, %r20, 4;\n" +" add.u64 %rd31, %rd24, %rd30;\n" +" .loc 16 331 0\n" +" mul.lo.s32 %r21, %r18, %r6;\n" +" bra.uni $Lt_1_26370;\n" +"$Lt_1_26626:\n" +" .loc 16 333 0\n" +" ld.global.s32 %r22, [%rd24+0];\n" +" cvt.s64.s32 %rd32, %r22;\n" +" mul.wide.s32 %rd33, %r22, 4;\n" +" add.u64 %rd34, %rd25, %rd33;\n" +" .loc 16 334 0\n" +" cvt.s64.s32 %rd35, %r17;\n" +" mul.wide.s32 %rd36, %r17, 4;\n" +" add.u64 %rd28, %rd34, %rd36;\n" +" .loc 16 335 0\n" +" mov.s32 %r21, %r6;\n" +" .loc 16 336 0\n" +" cvt.s64.s32 %rd37, %r10;\n" +" mul.wide.s32 %rd38, %r10, 4;\n" +" add.u64 %rd31, %rd34, %rd38;\n" +"$Lt_1_26370:\n" +" .loc 16 339 0\n" +" mov.u32 %r23, %r15;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" mov.f32 %f28, %f24;\n" +" mov.f32 %f29, %f25;\n" +" .loc 16 340 0\n" +" mov.u32 %r30, %r15;\n" +" mov.s32 %r31, 0;\n" +" mov.u32 %r32, %r31;\n" +" mov.s32 %r33, 0;\n" +" mov.u32 %r34, %r33;\n" +" mov.s32 %r35, 0;\n" +" mov.u32 %r36, %r35;\n" +" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r30,%r32,%r34,%r36}];\n" +" mov.f32 %f34, %f30;\n" +" setp.ge.u64 %p6, %rd31, %rd28;\n" +" @%p6 bra $Lt_1_36610;\n" +" cvt.rzi.ftz.s32.f32 %r37, %f29;\n" +" cvt.s64.s32 %rd39, %r21;\n" +" mul.lo.s32 %r38, %r37, 11;\n" +" cvt.rn.f32.s32 %f35, %r38;\n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +"$Lt_1_27394:\n" +" .loc 16 345 0\n" +" ld.global.s32 %r39, [%rd31+0];\n" +" .loc 16 348 0\n" +" shr.s32 %r40, %r39, 30;\n" +" and.b32 %r41, %r40, 3;\n" +" cvt.s64.s32 %rd40, %r41;\n" +" mul.wide.s32 %rd41, %r41, 4;\n" +" add.u64 %rd42, %rd1, %rd41;\n" +" ld.shared.f32 %f41, [%rd42+0];\n" +" .loc 16 349 0\n" +" mov.f32 %f42, 0f3f800000; \n" +" ld.shared.f32 %f43, [%rd42+16];\n" +" sub.ftz.f32 %f44, %f42, %f43;\n" +" .loc 16 352 0\n" +" and.b32 %r42, %r39, 1073741823;\n" +" mov.u32 %r43, %r42;\n" +" mov.s32 %r44, 0;\n" +" mov.u32 %r45, %r44;\n" +" mov.s32 %r46, 0;\n" +" mov.u32 %r47, %r46;\n" +" mov.s32 %r48, 0;\n" +" mov.u32 %r49, %r48;\n" +" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r43,%r45,%r47,%r49}];\n" +" mov.f32 %f49, %f45;\n" +" mov.f32 %f50, %f46;\n" +" mov.f32 %f51, %f47;\n" +" mov.f32 %f52, %f48;\n" +" sub.ftz.f32 %f53, %f27, %f50;\n" +" sub.ftz.f32 %f54, %f26, %f49;\n" +" sub.ftz.f32 %f55, %f28, %f51;\n" +" mul.ftz.f32 %f56, %f53, %f53;\n" +" fma.rn.ftz.f32 %f57, %f54, %f54, %f56;\n" +" fma.rn.ftz.f32 %f58, %f55, %f55, %f57;\n" +" add.ftz.f32 %f59, %f35, %f52;\n" +" cvt.rzi.ftz.s32.f32 %r50, %f59;\n" +" cvt.s64.s32 %rd43, %r50;\n" +" mul.wide.s32 %rd44, %r50, 16;\n" +" add.u64 %rd45, %rd44, %rd7;\n" +" ld.shared.f32 %f60, [%rd45+8];\n" +" setp.gt.ftz.f32 %p7, %f60, %f58;\n" +" @!%p7 bra $Lt_1_30210;\n" +" 
rcp.approx.ftz.f32 %f61, %f58;\n" +" ld.shared.f32 %f62, [%rd45+12];\n" +" setp.lt.ftz.f32 %p8, %f58, %f62;\n" +" @!%p8 bra $Lt_1_28418;\n" +" .loc 16 366 0\n" +" mul.ftz.f32 %f63, %f61, %f61;\n" +" mul.ftz.f32 %f64, %f61, %f63;\n" +" mov.f32 %f65, %f64;\n" +" .loc 16 367 0\n" +" mul.ftz.f32 %f66, %f64, %f41;\n" +" ld.shared.v2.f32 {%f67,%f68}, [%rd45+0];\n" +" mul.ftz.f32 %f69, %f67, %f64;\n" +" sub.ftz.f32 %f70, %f69, %f68;\n" +" mul.ftz.f32 %f71, %f66, %f70;\n" +" bra.uni $Lt_1_28162;\n" +"$Lt_1_28418:\n" +" .loc 16 369 0\n" +" mov.f32 %f71, 0f00000000; \n" +"$Lt_1_28162:\n" +" ld.param.f32 %f72, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" +" setp.gt.ftz.f32 %p9, %f72, %f58;\n" +" @!%p9 bra $Lt_1_28930;\n" +" .loc 16 376 0\n" +" sqrt.approx.ftz.f32 %f73, %f58;\n" +" ld.param.f32 %f74, [__cudaparm_kernel_pair_fast_g_ewald];\n" +" mul.ftz.f32 %f75, %f74, %f73;\n" +" mul.ftz.f32 %f76, %f75, %f75;\n" +" mov.f32 %f77, 0f3f800000; \n" +" mov.f32 %f78, 0f3ea7ba05; \n" +" fma.rn.ftz.f32 %f79, %f78, %f75, %f77;\n" +" neg.ftz.f32 %f80, %f76;\n" +" rcp.approx.ftz.f32 %f81, %f79;\n" +" mov.f32 %f82, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f83, %f80, %f82;\n" +" ex2.approx.ftz.f32 %f84, %f83;\n" +" mov.f32 %f85, 0f3e827906; \n" +" mov.f32 %f86, 0fbe91a98e; \n" +" mov.f32 %f87, 0f3fb5f0e3; \n" +" mov.f32 %f88, 0fbfba00e3; \n" +" mov.f32 %f89, 0f3f87dc22; \n" +" fma.rn.ftz.f32 %f90, %f89, %f81, %f88;\n" +" fma.rn.ftz.f32 %f91, %f81, %f90, %f87;\n" +" fma.rn.ftz.f32 %f92, %f81, %f91, %f86;\n" +" fma.rn.ftz.f32 %f93, %f81, %f92, %f85;\n" +" mul.ftz.f32 %f94, %f81, %f93;\n" +" mul.ftz.f32 %f95, %f84, %f94;\n" +" mov.f32 %f96, %f95;\n" +" .loc 16 377 0\n" +" mov.u32 %r51, %r42;\n" +" mov.s32 %r52, 0;\n" +" mov.u32 %r53, %r52;\n" +" mov.s32 %r54, 0;\n" +" mov.u32 %r55, %r54;\n" +" mov.s32 %r56, 0;\n" +" mov.u32 %r57, %r56;\n" +" tex.1d.v4.f32.s32 {%f97,%f98,%f99,%f100},[q_tex,{%r51,%r53,%r55,%r57}];\n" +" mov.f32 %f101, %f97;\n" +" ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_qqrd2e];\n" +" mul.ftz.f32 %f103, %f102, %f34;\n" +" mul.ftz.f32 %f104, %f103, %f101;\n" +" div.approx.ftz.f32 %f105, %f104, %f73;\n" +" mov.f32 %f106, %f105;\n" +" .loc 16 378 0\n" +" mov.f32 %f107, 0f3f906ebb; \n" +" mul.ftz.f32 %f108, %f75, %f107;\n" +" fma.rn.ftz.f32 %f109, %f84, %f108, %f95;\n" +" sub.ftz.f32 %f110, %f109, %f44;\n" +" mul.ftz.f32 %f111, %f105, %f110;\n" +" bra.uni $Lt_1_28674;\n" +"$Lt_1_28930:\n" +" .loc 16 380 0\n" +" mov.f32 %f111, 0f00000000; \n" +"$Lt_1_28674:\n" +" .loc 16 384 0\n" +" add.ftz.f32 %f112, %f111, %f71;\n" +" mul.ftz.f32 %f113, %f112, %f61;\n" +" fma.rn.ftz.f32 %f38, %f54, %f113, %f38;\n" +" .loc 16 385 0\n" +" fma.rn.ftz.f32 %f37, %f53, %f113, %f37;\n" +" .loc 16 386 0\n" +" fma.rn.ftz.f32 %f36, %f55, %f113, %f36;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p10, %r58, %r59;\n" +" @%p10 bra $Lt_1_29698;\n" +" .loc 16 389 0\n" +" mov.f32 %f114, %f106;\n" +" mov.f32 %f115, %f96;\n" +" sub.ftz.f32 %f116, %f115, %f44;\n" +" fma.rn.ftz.f32 %f117, %f114, %f116, %f39;\n" +" selp.f32 %f39, %f117, %f39, %p9;\n" +" @!%p8 bra $Lt_1_29698;\n" +" .loc 16 392 0\n" +" add.u64 %rd46, %rd44, %rd13;\n" +" mov.f32 %f118, %f65;\n" +" ld.shared.v4.f32 {%f119,%f120,%f121,_}, [%rd46+0];\n" +" mul.ftz.f32 %f122, %f119, %f118;\n" +" sub.ftz.f32 %f123, %f122, %f120;\n" +" mul.ftz.f32 %f124, %f118, %f123;\n" +" .loc 16 393 0\n" +" sub.ftz.f32 %f125, %f124, %f121;\n" +" fma.rn.ftz.f32 %f40, %f41, %f125, %f40;\n" +"$Lt_1_29698:\n" +"$Lt_1_29186:\n" +" ld.param.s32 
%r60, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r61, 0;\n" +" setp.le.s32 %p11, %r60, %r61;\n" +" @%p11 bra $Lt_1_30210;\n" +" .loc 16 397 0\n" +" mov.f32 %f126, %f11;\n" +" mul.ftz.f32 %f127, %f54, %f54;\n" +" fma.rn.ftz.f32 %f128, %f113, %f127, %f126;\n" +" mov.f32 %f11, %f128;\n" +" .loc 16 398 0\n" +" mov.f32 %f129, %f13;\n" +" fma.rn.ftz.f32 %f130, %f113, %f56, %f129;\n" +" mov.f32 %f13, %f130;\n" +" .loc 16 399 0\n" +" mov.f32 %f131, %f15;\n" +" mul.ftz.f32 %f132, %f55, %f55;\n" +" fma.rn.ftz.f32 %f133, %f113, %f132, %f131;\n" +" mov.f32 %f15, %f133;\n" +" .loc 16 400 0\n" +" mov.f32 %f134, %f17;\n" +" mul.ftz.f32 %f135, %f53, %f54;\n" +" fma.rn.ftz.f32 %f136, %f113, %f135, %f134;\n" +" mov.f32 %f17, %f136;\n" +" .loc 16 401 0\n" +" mov.f32 %f137, %f19;\n" +" mul.ftz.f32 %f138, %f54, %f55;\n" +" fma.rn.ftz.f32 %f139, %f113, %f138, %f137;\n" +" mov.f32 %f19, %f139;\n" +" .loc 16 402 0\n" +" mul.ftz.f32 %f140, %f53, %f55;\n" +" fma.rn.ftz.f32 %f20, %f113, %f140, %f20;\n" +" mov.f32 %f21, %f20;\n" +"$Lt_1_30210:\n" +"$Lt_1_27650:\n" +" .loc 16 344 0\n" +" mul.lo.u64 %rd47, %rd39, 4;\n" +" add.u64 %rd31, %rd31, %rd47;\n" +" setp.lt.u64 %p12, %rd31, %rd28;\n" +" @%p12 bra $Lt_1_27394;\n" +" bra.uni $Lt_1_25858;\n" +"$Lt_1_36610:\n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +" bra.uni $Lt_1_25858;\n" +"$Lt_1_26114:\n" +" mov.f32 %f36, 0f00000000; \n" +" mov.f32 %f37, 0f00000000; \n" +" mov.f32 %f38, 0f00000000; \n" +" mov.f32 %f39, 0f00000000; \n" +" mov.f32 %f40, 0f00000000; \n" +"$Lt_1_25858:\n" +" mov.u32 %r62, 1;\n" +" setp.le.s32 %p13, %r6, %r62;\n" +" @%p13 bra $Lt_1_33026;\n" +" .loc 16 413 0\n" +" mov.u64 %rd48, __cuda___cuda_local_var_32795_35_non_const_red_acc7232;\n" +" cvt.s64.s32 %rd49, %r1;\n" +" mul.wide.s32 %rd50, %r1, 4;\n" +" add.u64 %rd51, %rd48, %rd50;\n" +" mov.f32 %f141, %f38;\n" +" st.shared.f32 [%rd51+0], %f141;\n" +" .loc 16 414 0\n" +" mov.f32 %f142, %f37;\n" +" st.shared.f32 [%rd51+512], %f142;\n" +" .loc 16 415 0\n" +" mov.f32 %f143, %f36;\n" +" st.shared.f32 [%rd51+1024], %f143;\n" +" .loc 16 416 0\n" +" mov.f32 %f144, %f40;\n" +" st.shared.f32 [%rd51+1536], %f144;\n" +" .loc 16 417 0\n" +" mov.f32 %f145, %f39;\n" +" st.shared.f32 [%rd51+2048], %f145;\n" +" .loc 16 419 0\n" +" shr.s32 %r63, %r6, 31;\n" +" mov.s32 %r64, 1;\n" +" and.b32 %r65, %r63, %r64;\n" +" add.s32 %r66, %r65, %r6;\n" +" shr.s32 %r67, %r66, 1;\n" +" mov.s32 %r68, %r67;\n" +" mov.u32 %r69, 0;\n" +" setp.ne.u32 %p14, %r67, %r69;\n" +" @!%p14 bra $Lt_1_31490;\n" +"$Lt_1_32002:\n" +" setp.ge.u32 %p15, %r10, %r68;\n" +" @%p15 bra $Lt_1_32258;\n" +" .loc 16 422 0\n" +" add.u32 %r70, %r1, %r68;\n" +" cvt.u64.u32 %rd52, %r70;\n" +" mul.wide.u32 %rd53, %r70, 4;\n" +" add.u64 %rd54, %rd48, %rd53;\n" +" ld.shared.f32 %f146, [%rd54+0];\n" +" add.ftz.f32 %f141, %f146, %f141;\n" +" st.shared.f32 [%rd51+0], %f141;\n" +" ld.shared.f32 %f147, [%rd54+512];\n" +" add.ftz.f32 %f142, %f147, %f142;\n" +" st.shared.f32 [%rd51+512], %f142;\n" +" ld.shared.f32 %f148, [%rd54+1024];\n" +" add.ftz.f32 %f143, %f148, %f143;\n" +" st.shared.f32 [%rd51+1024], %f143;\n" +" ld.shared.f32 %f149, [%rd54+1536];\n" +" add.ftz.f32 %f144, %f149, %f144;\n" +" st.shared.f32 [%rd51+1536], %f144;\n" +" ld.shared.f32 %f150, [%rd54+2048];\n" +" add.ftz.f32 %f145, %f150, %f145;\n" +" st.shared.f32 [%rd51+2048], %f145;\n" +"$Lt_1_32258:\n" +" .loc 16 419 0\n" +" shr.u32 %r68, %r68, 1;\n" +" mov.u32 %r71, 
0;\n" +" setp.ne.u32 %p16, %r68, %r71;\n" +" @%p16 bra $Lt_1_32002;\n" +"$Lt_1_31490:\n" +" .loc 16 426 0\n" +" mov.f32 %f38, %f141;\n" +" .loc 16 427 0\n" +" mov.f32 %f37, %f142;\n" +" .loc 16 428 0\n" +" mov.f32 %f36, %f143;\n" +" .loc 16 429 0\n" +" mov.f32 %f40, %f144;\n" +" .loc 16 430 0\n" +" mov.f32 %f39, %f145;\n" +" ld.param.s32 %r72, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r73, 0;\n" +" setp.le.s32 %p17, %r72, %r73;\n" +" @%p17 bra $Lt_1_33026;\n" +" .loc 16 434 0\n" +" mov.f32 %f141, %f11;\n" +" st.shared.f32 [%rd51+0], %f141;\n" +" mov.f32 %f142, %f13;\n" +" st.shared.f32 [%rd51+512], %f142;\n" +" mov.f32 %f143, %f15;\n" +" st.shared.f32 [%rd51+1024], %f143;\n" +" mov.f32 %f144, %f17;\n" +" st.shared.f32 [%rd51+1536], %f144;\n" +" mov.f32 %f145, %f19;\n" +" st.shared.f32 [%rd51+2048], %f145;\n" +" mov.f32 %f151, %f21;\n" +" st.shared.f32 [%rd51+2560], %f151;\n" +" .loc 16 436 0\n" +" mov.s32 %r74, %r67;\n" +" @!%p14 bra $Lt_1_33538;\n" +"$Lt_1_34050:\n" +" setp.ge.u32 %p18, %r10, %r74;\n" +" @%p18 bra $Lt_1_34306;\n" +" .loc 16 439 0\n" +" add.u32 %r75, %r1, %r74;\n" +" cvt.u64.u32 %rd55, %r75;\n" +" mul.wide.u32 %rd56, %r75, 4;\n" +" add.u64 %rd57, %rd48, %rd56;\n" +" ld.shared.f32 %f152, [%rd57+0];\n" +" add.ftz.f32 %f141, %f152, %f141;\n" +" st.shared.f32 [%rd51+0], %f141;\n" +" ld.shared.f32 %f153, [%rd57+512];\n" +" add.ftz.f32 %f142, %f153, %f142;\n" +" st.shared.f32 [%rd51+512], %f142;\n" +" ld.shared.f32 %f154, [%rd57+1024];\n" +" add.ftz.f32 %f143, %f154, %f143;\n" +" st.shared.f32 [%rd51+1024], %f143;\n" +" ld.shared.f32 %f155, [%rd57+1536];\n" +" add.ftz.f32 %f144, %f155, %f144;\n" +" st.shared.f32 [%rd51+1536], %f144;\n" +" ld.shared.f32 %f156, [%rd57+2048];\n" +" add.ftz.f32 %f145, %f156, %f145;\n" +" st.shared.f32 [%rd51+2048], %f145;\n" +" ld.shared.f32 %f157, [%rd57+2560];\n" +" add.ftz.f32 %f151, %f157, %f151;\n" +" st.shared.f32 [%rd51+2560], %f151;\n" +"$Lt_1_34306:\n" +" .loc 16 436 0\n" +" shr.u32 %r74, %r74, 1;\n" +" mov.u32 %r76, 0;\n" +" setp.ne.u32 %p19, %r74, %r76;\n" +" @%p19 bra $Lt_1_34050;\n" +"$Lt_1_33538:\n" +" .loc 16 444 0\n" +" mov.f32 %f11, %f141;\n" +" mov.f32 %f13, %f142;\n" +" mov.f32 %f15, %f143;\n" +" mov.f32 %f17, %f144;\n" +" mov.f32 %f19, %f145;\n" +" mov.f32 %f21, %f151;\n" +"$Lt_1_33026:\n" +"$Lt_1_30978:\n" +" selp.s32 %r77, 1, 0, %p4;\n" +" mov.s32 %r78, 0;\n" +" set.eq.u32.s32 %r79, %r10, %r78;\n" +" neg.s32 %r80, %r79;\n" +" and.b32 %r81, %r77, %r80;\n" +" mov.u32 %r82, 0;\n" +" setp.eq.s32 %p20, %r81, %r82;\n" +" @%p20 bra $Lt_1_35074;\n" +" .loc 16 450 0\n" +" cvt.s64.s32 %rd58, %r13;\n" +" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd60, %r13, 4;\n" +" add.u64 %rd61, %rd59, %rd60;\n" +" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r84, 0;\n" +" setp.le.s32 %p21, %r83, %r84;\n" +" @%p21 bra $Lt_1_35586;\n" +" .loc 16 452 0\n" +" st.global.f32 [%rd61+0], %f40;\n" +" .loc 16 453 0\n" +" cvt.s64.s32 %rd62, %r14;\n" +" mul.wide.s32 %rd63, %r14, 4;\n" +" add.u64 %rd64, %rd63, %rd61;\n" +" .loc 16 454 0\n" +" st.global.f32 [%rd64+0], %f39;\n" +" .loc 16 455 0\n" +" add.u64 %rd61, %rd63, %rd64;\n" +"$Lt_1_35586:\n" +" ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r86, 0;\n" +" setp.le.s32 %p22, %r85, %r86;\n" +" @%p22 bra $Lt_1_36098;\n" +" .loc 16 459 0\n" +" mov.f32 %f158, %f11;\n" +" st.global.f32 [%rd61+0], %f158;\n" +" .loc 16 460 0\n" +" cvt.s64.s32 %rd65, %r14;\n" +" mul.wide.s32 %rd66, %r14, 4;\n" +" add.u64 %rd67, %rd66, %rd61;\n" 
+" .loc 16 459 0\n" +" mov.f32 %f159, %f13;\n" +" st.global.f32 [%rd67+0], %f159;\n" +" .loc 16 460 0\n" +" add.u64 %rd68, %rd66, %rd67;\n" +" .loc 16 459 0\n" +" mov.f32 %f160, %f15;\n" +" st.global.f32 [%rd68+0], %f160;\n" +" .loc 16 460 0\n" +" add.u64 %rd69, %rd66, %rd68;\n" +" .loc 16 459 0\n" +" mov.f32 %f161, %f17;\n" +" st.global.f32 [%rd69+0], %f161;\n" +" .loc 16 460 0\n" +" add.u64 %rd61, %rd66, %rd69;\n" +" .loc 16 459 0\n" +" mov.f32 %f162, %f19;\n" +" st.global.f32 [%rd61+0], %f162;\n" +" mov.f32 %f163, %f21;\n" +" add.u64 %rd70, %rd66, %rd61;\n" +" st.global.f32 [%rd70+0], %f163;\n" +"$Lt_1_36098:\n" +" .loc 16 463 0\n" +" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd72, %rd58, 16;\n" +" add.u64 %rd73, %rd71, %rd72;\n" +" mov.f32 %f164, %f165;\n" +" st.global.v4.f32 [%rd73+0], {%f38,%f37,%f36,%f164};\n" +"$Lt_1_35074:\n" +" .loc 16 465 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/morse_gpu_kernel.ptx b/lib/gpu/morse_gpu_kernel.ptx new file mode 100644 index 000000000..614b69b1d --- /dev/null +++ b/lib/gpu/morse_gpu_kernel.ptx @@ -0,0 +1,999 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bf97_00000000-9_morse_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.pRrhev) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bf97_00000000-8_morse_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "morse_gpu_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + + .entry kernel_pair ( + .param .u64 __cudaparm_kernel_pair_x_, + .param .u64 __cudaparm_kernel_pair_mor1, + .param .u64 
__cudaparm_kernel_pair_mor2, + .param .s32 __cudaparm_kernel_pair_lj_types, + .param .u64 __cudaparm_kernel_pair_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_dev_nbor, + .param .u64 __cudaparm_kernel_pair_dev_packed, + .param .u64 __cudaparm_kernel_pair_ans, + .param .u64 __cudaparm_kernel_pair_engv, + .param .s32 __cudaparm_kernel_pair_eflag, + .param .s32 __cudaparm_kernel_pair_vflag, + .param .s32 __cudaparm_kernel_pair_inum, + .param .s32 __cudaparm_kernel_pair_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_t_per_atom) + { + .reg .u32 %r<72>; + .reg .u64 %rd<63>; + .reg .f32 %f<104>; + .reg .f64 %fd<10>; + .reg .pred %p<19>; + .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16]; + .shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072]; + // __cuda_local_var_32504_10_non_const_f = 48 + // __cuda_local_var_32508_9_non_const_virial = 16 + .loc 16 88 0 +$LDWbegin_kernel_pair: + .loc 16 95 0 + ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; + ldu.global.f32 %f1, [%rd1+0]; + .loc 16 96 0 + ld.global.f32 %f2, [%rd1+4]; + .loc 16 97 0 + ld.global.f32 %f3, [%rd1+8]; + .loc 16 98 0 + ld.global.f32 %f4, [%rd1+12]; + st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; + .loc 16 107 0 + mov.f32 %f5, 0f00000000; // 0 + mov.f32 %f6, %f5; + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, %f7; + mov.f32 %f9, 0f00000000; // 0 + mov.f32 %f10, %f9; + mov.f32 %f11, 0f00000000; // 0 + mov.f32 %f12, %f11; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, %f13; + mov.f32 %f15, 0f00000000; // 0 + mov.f32 %f16, %f15; + ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; + cvt.s32.u32 %r2, %tid.x; + div.s32 %r3, %r2, %r1; + cvt.s32.u32 %r4, %ntid.x; + div.s32 %r5, %r4, %r1; + rem.s32 %r6, %r2, %r1; + cvt.s32.u32 %r7, %ctaid.x; + mul.lo.s32 %r8, %r7, %r5; + add.s32 %r9, %r3, %r8; + ld.param.s32 %r10, [__cudaparm_kernel_pair_inum]; + setp.lt.s32 %p1, %r9, %r10; + @!%p1 bra $Lt_0_19202; + .loc 16 113 0 + ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; + cvt.s64.s32 %rd2, %r11; + mul.wide.s32 %rd3, %r11, 4; + cvt.s64.s32 %rd4, %r9; + mul.wide.s32 %rd5, %r9, 4; + ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; + add.u64 %rd7, %rd5, %rd6; + add.u64 %rd8, %rd3, %rd7; + ld.global.s32 %r12, [%rd8+0]; + add.u64 %rd9, %rd3, %rd8; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed]; + setp.ne.u64 %p2, %rd10, %rd6; + @%p2 bra $Lt_0_19714; + .loc 16 119 0 + cvt.s32.s64 %r13, %rd2; + mul.lo.s32 %r14, %r13, %r12; + cvt.s64.s32 %rd11, %r14; + mul.wide.s32 %rd12, %r14, 4; + add.u64 %rd13, %rd9, %rd12; + .loc 16 120 0 + mul.lo.s32 %r15, %r6, %r13; + cvt.s64.s32 %rd14, %r15; + mul.wide.s32 %rd15, %r15, 4; + add.u64 %rd16, %rd9, %rd15; + .loc 16 121 0 + mul.lo.s32 %r16, %r13, %r1; + bra.uni $Lt_0_19458; +$Lt_0_19714: + .loc 16 123 0 + ld.global.s32 %r17, [%rd9+0]; + cvt.s64.s32 %rd17, %r17; + mul.wide.s32 %rd18, %r17, 4; + add.u64 %rd19, %rd10, %rd18; + .loc 16 124 0 + cvt.s64.s32 %rd20, %r12; + mul.wide.s32 %rd21, %r12, 4; + add.u64 %rd13, %rd19, %rd21; + .loc 16 125 0 + mov.s32 %r16, %r1; + .loc 16 126 0 + cvt.s64.s32 %rd22, %r6; + mul.wide.s32 %rd23, %r6, 4; + add.u64 %rd16, %rd19, %rd23; +$Lt_0_19458: + .loc 16 129 0 + ld.global.s32 %r18, [%rd7+0]; + mov.u32 %r19, %r18; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}]; + mov.f32 %f21, %f17; + mov.f32 %f22, %f18; + 
mov.f32 %f23, %f19; + mov.f32 %f24, %f20; + setp.ge.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_28162; + cvt.rzi.ftz.s32.f32 %r26, %f24; + cvt.s64.s32 %rd24, %r16; + ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types]; + mul.lo.s32 %r28, %r27, %r26; + ld.param.u64 %rd25, [__cudaparm_kernel_pair_mor1]; + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92; +$Lt_0_20482: + //<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown + .loc 16 135 0 + ld.global.s32 %r29, [%rd16+0]; + .loc 16 136 0 + shr.s32 %r30, %r29, 30; + and.b32 %r31, %r30, 3; + cvt.s64.s32 %rd27, %r31; + mul.wide.s32 %rd28, %r31, 4; + add.u64 %rd29, %rd26, %rd28; + ld.shared.f32 %f29, [%rd29+0]; + .loc 16 139 0 + and.b32 %r32, %r29, 1073741823; + mov.u32 %r33, %r32; + mov.s32 %r34, 0; + mov.u32 %r35, %r34; + mov.s32 %r36, 0; + mov.u32 %r37, %r36; + mov.s32 %r38, 0; + mov.u32 %r39, %r38; + tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}]; + mov.f32 %f34, %f30; + mov.f32 %f35, %f31; + mov.f32 %f36, %f32; + mov.f32 %f37, %f33; + cvt.rzi.ftz.s32.f32 %r40, %f37; + sub.ftz.f32 %f38, %f22, %f35; + sub.ftz.f32 %f39, %f21, %f34; + sub.ftz.f32 %f40, %f23, %f36; + mul.ftz.f32 %f41, %f38, %f38; + fma.rn.ftz.f32 %f42, %f39, %f39, %f41; + add.s32 %r41, %r40, %r28; + cvt.s64.s32 %rd30, %r41; + fma.rn.ftz.f32 %f43, %f40, %f40, %f42; + mul.wide.s32 %rd31, %r41, 16; + add.u64 %rd32, %rd25, %rd31; + ld.global.f32 %f44, [%rd32+0]; + setp.gt.ftz.f32 %p4, %f44, %f43; + @!%p4 bra $Lt_0_21762; + .loc 16 152 0 + sqrt.approx.ftz.f32 %f45, %f43; + ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd32+0]; + sub.ftz.f32 %f49, %f45, %f47; + mul.ftz.f32 %f50, %f48, %f49; + neg.ftz.f32 %f51, %f50; + .loc 16 154 0 + mov.f32 %f52, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f53, %f51, %f52; + ex2.approx.ftz.f32 %f54, %f53; + mul.ftz.f32 %f55, %f54, %f54; + sub.ftz.f32 %f56, %f55, %f54; + mul.ftz.f32 %f57, %f46, %f56; + .loc 16 156 0 + div.approx.ftz.f32 %f58, %f57, %f45; + mul.ftz.f32 %f59, %f58, %f29; + fma.rn.ftz.f32 %f27, %f39, %f59, %f27; + .loc 16 157 0 + fma.rn.ftz.f32 %f26, %f38, %f59, %f26; + .loc 16 158 0 + fma.rn.ftz.f32 %f25, %f40, %f59, %f25; + ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r43, 0; + setp.le.s32 %p5, %r42, %r43; + @%p5 bra $Lt_0_21250; + .loc 16 162 0 + cvt.ftz.f64.f32 %fd1, %f54; + ld.param.u64 %rd33, [__cudaparm_kernel_pair_mor2]; + mul.lo.u64 %rd34, %rd30, 8; + add.u64 %rd35, %rd33, %rd34; + ld.global.v2.f32 {%f60,%f61}, [%rd35+0]; + cvt.ftz.f64.f32 %fd2, %f61; + cvt.ftz.f64.f32 %fd3, %f60; + mul.ftz.f32 %f62, %f54, %f54; + cvt.ftz.f64.f32 %fd4, %f62; + add.f64 %fd5, %fd1, %fd1; + sub.f64 %fd6, %fd4, %fd5; + mul.f64 %fd7, %fd3, %fd6; + sub.f64 %fd8, %fd7, %fd2; + cvt.rn.ftz.f32.f64 %f63, %fd8; + fma.rn.ftz.f32 %f28, %f29, %f63, %f28; +$Lt_0_21250: + ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r45, 0; + setp.le.s32 %p6, %r44, %r45; + @%p6 bra $Lt_0_21762; + .loc 16 165 0 + mov.f32 %f64, %f6; + mul.ftz.f32 %f65, %f39, %f39; + fma.rn.ftz.f32 %f66, %f59, %f65, %f64; + mov.f32 %f6, %f66; + .loc 16 166 0 + mov.f32 %f67, %f8; + fma.rn.ftz.f32 %f68, %f59, %f41, %f67; + mov.f32 %f8, %f68; + .loc 16 167 0 + mov.f32 %f69, %f10; + mul.ftz.f32 %f70, %f40, %f40; + fma.rn.ftz.f32 %f71, %f59, %f70, %f69; + mov.f32 %f10, %f71; + .loc 16 168 0 + mov.f32 %f72, %f12; + mul.ftz.f32 %f73, %f38, %f39; + fma.rn.ftz.f32 %f74, %f59, %f73, %f72; + mov.f32 
%f12, %f74; + .loc 16 169 0 + mov.f32 %f75, %f14; + mul.ftz.f32 %f76, %f39, %f40; + fma.rn.ftz.f32 %f77, %f59, %f76, %f75; + mov.f32 %f14, %f77; + .loc 16 170 0 + mul.ftz.f32 %f78, %f38, %f40; + fma.rn.ftz.f32 %f15, %f59, %f78, %f15; + mov.f32 %f16, %f15; +$Lt_0_21762: +$Lt_0_20738: + .loc 16 133 0 + mul.lo.u64 %rd36, %rd24, 4; + add.u64 %rd16, %rd16, %rd36; + setp.lt.u64 %p7, %rd16, %rd13; + @%p7 bra $Lt_0_20482; + bra.uni $Lt_0_18946; +$Lt_0_28162: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 + bra.uni $Lt_0_18946; +$Lt_0_19202: + mov.f32 %f25, 0f00000000; // 0 + mov.f32 %f26, 0f00000000; // 0 + mov.f32 %f27, 0f00000000; // 0 + mov.f32 %f28, 0f00000000; // 0 +$Lt_0_18946: + mov.u32 %r46, 1; + setp.le.s32 %p8, %r1, %r46; + @%p8 bra $Lt_0_24578; + .loc 16 181 0 + mov.u64 %rd37, __cuda___cuda_local_var_32582_35_non_const_red_acc108; + cvt.s64.s32 %rd38, %r2; + mul.wide.s32 %rd39, %r2, 4; + add.u64 %rd40, %rd37, %rd39; + mov.f32 %f79, %f27; + st.shared.f32 [%rd40+0], %f79; + .loc 16 182 0 + mov.f32 %f80, %f26; + st.shared.f32 [%rd40+512], %f80; + .loc 16 183 0 + mov.f32 %f81, %f25; + st.shared.f32 [%rd40+1024], %f81; + .loc 16 184 0 + mov.f32 %f82, %f28; + st.shared.f32 [%rd40+1536], %f82; + .loc 16 186 0 + shr.s32 %r47, %r1, 31; + mov.s32 %r48, 1; + and.b32 %r49, %r47, %r48; + add.s32 %r50, %r49, %r1; + shr.s32 %r51, %r50, 1; + mov.s32 %r52, %r51; + mov.u32 %r53, 0; + setp.ne.u32 %p9, %r51, %r53; + @!%p9 bra $Lt_0_23042; +$Lt_0_23554: + setp.ge.u32 %p10, %r6, %r52; + @%p10 bra $Lt_0_23810; + .loc 16 189 0 + add.u32 %r54, %r2, %r52; + cvt.u64.u32 %rd41, %r54; + mul.wide.u32 %rd42, %r54, 4; + add.u64 %rd43, %rd37, %rd42; + ld.shared.f32 %f83, [%rd43+0]; + add.ftz.f32 %f79, %f83, %f79; + st.shared.f32 [%rd40+0], %f79; + ld.shared.f32 %f84, [%rd43+512]; + add.ftz.f32 %f80, %f84, %f80; + st.shared.f32 [%rd40+512], %f80; + ld.shared.f32 %f85, [%rd43+1024]; + add.ftz.f32 %f81, %f85, %f81; + st.shared.f32 [%rd40+1024], %f81; + ld.shared.f32 %f86, [%rd43+1536]; + add.ftz.f32 %f82, %f86, %f82; + st.shared.f32 [%rd40+1536], %f82; +$Lt_0_23810: + .loc 16 186 0 + shr.u32 %r52, %r52, 1; + mov.u32 %r55, 0; + setp.ne.u32 %p11, %r52, %r55; + @%p11 bra $Lt_0_23554; +$Lt_0_23042: + .loc 16 193 0 + mov.f32 %f27, %f79; + .loc 16 194 0 + mov.f32 %f26, %f80; + .loc 16 195 0 + mov.f32 %f25, %f81; + .loc 16 196 0 + mov.f32 %f28, %f82; + ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r57, 0; + setp.le.s32 %p12, %r56, %r57; + @%p12 bra $Lt_0_24578; + .loc 16 200 0 + mov.f32 %f79, %f6; + st.shared.f32 [%rd40+0], %f79; + mov.f32 %f80, %f8; + st.shared.f32 [%rd40+512], %f80; + mov.f32 %f81, %f10; + st.shared.f32 [%rd40+1024], %f81; + mov.f32 %f82, %f12; + st.shared.f32 [%rd40+1536], %f82; + mov.f32 %f87, %f14; + st.shared.f32 [%rd40+2048], %f87; + mov.f32 %f88, %f16; + st.shared.f32 [%rd40+2560], %f88; + .loc 16 202 0 + mov.s32 %r58, %r51; + @!%p9 bra $Lt_0_25090; +$Lt_0_25602: + setp.ge.u32 %p13, %r6, %r58; + @%p13 bra $Lt_0_25858; + .loc 16 205 0 + add.u32 %r59, %r2, %r58; + cvt.u64.u32 %rd44, %r59; + mul.wide.u32 %rd45, %r59, 4; + add.u64 %rd46, %rd37, %rd45; + ld.shared.f32 %f89, [%rd46+0]; + add.ftz.f32 %f79, %f89, %f79; + st.shared.f32 [%rd40+0], %f79; + ld.shared.f32 %f90, [%rd46+512]; + add.ftz.f32 %f80, %f90, %f80; + st.shared.f32 [%rd40+512], %f80; + ld.shared.f32 %f91, [%rd46+1024]; + add.ftz.f32 %f81, %f91, %f81; + st.shared.f32 [%rd40+1024], %f81; + ld.shared.f32 %f92, [%rd46+1536]; + add.ftz.f32 %f82, 
%f92, %f82; + st.shared.f32 [%rd40+1536], %f82; + ld.shared.f32 %f93, [%rd46+2048]; + add.ftz.f32 %f87, %f93, %f87; + st.shared.f32 [%rd40+2048], %f87; + ld.shared.f32 %f94, [%rd46+2560]; + add.ftz.f32 %f88, %f94, %f88; + st.shared.f32 [%rd40+2560], %f88; +$Lt_0_25858: + .loc 16 202 0 + shr.u32 %r58, %r58, 1; + mov.u32 %r60, 0; + setp.ne.u32 %p14, %r58, %r60; + @%p14 bra $Lt_0_25602; +$Lt_0_25090: + .loc 16 210 0 + mov.f32 %f6, %f79; + mov.f32 %f8, %f80; + mov.f32 %f10, %f81; + mov.f32 %f12, %f82; + mov.f32 %f14, %f87; + mov.f32 %f16, %f88; +$Lt_0_24578: +$Lt_0_22530: + selp.s32 %r61, 1, 0, %p1; + mov.s32 %r62, 0; + set.eq.u32.s32 %r63, %r6, %r62; + neg.s32 %r64, %r63; + and.b32 %r65, %r61, %r64; + mov.u32 %r66, 0; + setp.eq.s32 %p15, %r65, %r66; + @%p15 bra $Lt_0_26626; + .loc 16 216 0 + cvt.s64.s32 %rd47, %r9; + ld.param.u64 %rd48, [__cudaparm_kernel_pair_engv]; + mul.wide.s32 %rd49, %r9, 4; + add.u64 %rd50, %rd48, %rd49; + ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; + mov.u32 %r68, 0; + setp.le.s32 %p16, %r67, %r68; + @%p16 bra $Lt_0_27138; + .loc 16 218 0 + st.global.f32 [%rd50+0], %f28; + .loc 16 219 0 + cvt.s64.s32 %rd51, %r10; + mul.wide.s32 %rd52, %r10, 4; + add.u64 %rd50, %rd50, %rd52; +$Lt_0_27138: + ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; + mov.u32 %r70, 0; + setp.le.s32 %p17, %r69, %r70; + @%p17 bra $Lt_0_27650; + .loc 16 223 0 + mov.f32 %f95, %f6; + st.global.f32 [%rd50+0], %f95; + .loc 16 224 0 + cvt.s64.s32 %rd53, %r10; + mul.wide.s32 %rd54, %r10, 4; + add.u64 %rd55, %rd54, %rd50; + .loc 16 223 0 + mov.f32 %f96, %f8; + st.global.f32 [%rd55+0], %f96; + .loc 16 224 0 + add.u64 %rd56, %rd54, %rd55; + .loc 16 223 0 + mov.f32 %f97, %f10; + st.global.f32 [%rd56+0], %f97; + .loc 16 224 0 + add.u64 %rd57, %rd54, %rd56; + .loc 16 223 0 + mov.f32 %f98, %f12; + st.global.f32 [%rd57+0], %f98; + .loc 16 224 0 + add.u64 %rd50, %rd54, %rd57; + .loc 16 223 0 + mov.f32 %f99, %f14; + st.global.f32 [%rd50+0], %f99; + mov.f32 %f100, %f16; + add.u64 %rd58, %rd54, %rd50; + st.global.f32 [%rd58+0], %f100; +$Lt_0_27650: + .loc 16 227 0 + ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans]; + mul.lo.u64 %rd60, %rd47, 16; + add.u64 %rd61, %rd59, %rd60; + mov.f32 %f101, %f102; + st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f101}; +$Lt_0_26626: + .loc 16 229 0 + exit; +$LDWend_kernel_pair: + } // kernel_pair + + .entry kernel_pair_fast ( + .param .u64 __cudaparm_kernel_pair_fast_x_, + .param .u64 __cudaparm_kernel_pair_fast_mor1_in, + .param .u64 __cudaparm_kernel_pair_fast_mor2_in, + .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, + .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, + .param .u64 __cudaparm_kernel_pair_fast_dev_packed, + .param .u64 __cudaparm_kernel_pair_fast_ans, + .param .u64 __cudaparm_kernel_pair_fast_engv, + .param .s32 __cudaparm_kernel_pair_fast_eflag, + .param .s32 __cudaparm_kernel_pair_fast_vflag, + .param .s32 __cudaparm_kernel_pair_fast_inum, + .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, + .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) + { + .reg .u32 %r<74>; + .reg .u64 %rd<76>; + .reg .f32 %f<110>; + .reg .pred %p<22>; + .shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16]; + .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_mor13296[1936]; + .shared .align 8 .b8 __cuda___cuda_local_var_32647_34_non_const_mor25232[968]; + .shared .align 4 .b8 __cuda___cuda_local_var_32738_35_non_const_red_acc6200[3072]; + // __cuda_local_var_32658_10_non_const_f = 48 + // __cuda_local_var_32662_9_non_const_virial = 16 
+ .loc 16 237 0 +$LDWbegin_kernel_pair_fast: + cvt.s32.u32 %r1, %tid.x; + mov.u32 %r2, 3; + setp.gt.s32 %p1, %r1, %r2; + @%p1 bra $Lt_1_21250; + .loc 16 247 0 + mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268; + cvt.s64.s32 %rd2, %r1; + mul.wide.s32 %rd3, %r1, 4; + ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_21250: + mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268; + mov.u32 %r3, 120; + setp.gt.s32 %p2, %r1, %r3; + @%p2 bra $Lt_1_21762; + .loc 16 249 0 + mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296; + cvt.s64.s32 %rd8, %r1; + mul.wide.s32 %rd9, %r1, 16; + ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in]; + add.u64 %rd11, %rd10, %rd9; + add.u64 %rd12, %rd9, %rd7; + ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; + st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; + ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r5, 0; + setp.le.s32 %p3, %r4, %r5; + @%p3 bra $Lt_1_22274; + .loc 16 251 0 + mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232; + mul.lo.u64 %rd14, %rd8, 8; + ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in]; + add.u64 %rd16, %rd15, %rd14; + add.u64 %rd17, %rd14, %rd13; + ld.global.v2.f32 {%f6,%f7}, [%rd16+0]; + st.shared.v2.f32 [%rd17+0], {%f6,%f7}; +$Lt_1_22274: + mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232; +$Lt_1_21762: + mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296; + mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232; + .loc 16 261 0 + mov.f32 %f8, 0f00000000; // 0 + mov.f32 %f9, %f8; + mov.f32 %f10, 0f00000000; // 0 + mov.f32 %f11, %f10; + mov.f32 %f12, 0f00000000; // 0 + mov.f32 %f13, %f12; + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, %f14; + mov.f32 %f16, 0f00000000; // 0 + mov.f32 %f17, %f16; + mov.f32 %f18, 0f00000000; // 0 + mov.f32 %f19, %f18; + .loc 16 263 0 + bar.sync 0; + ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; + div.s32 %r7, %r1, %r6; + cvt.s32.u32 %r8, %ntid.x; + div.s32 %r9, %r8, %r6; + rem.s32 %r10, %r1, %r6; + cvt.s32.u32 %r11, %ctaid.x; + mul.lo.s32 %r12, %r11, %r9; + add.s32 %r13, %r7, %r12; + ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum]; + setp.lt.s32 %p4, %r13, %r14; + @!%p4 bra $Lt_1_23042; + .loc 16 269 0 + ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; + cvt.s64.s32 %rd18, %r15; + mul.wide.s32 %rd19, %r15, 4; + cvt.s64.s32 %rd20, %r13; + mul.wide.s32 %rd21, %r13, 4; + ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor]; + add.u64 %rd23, %rd21, %rd22; + add.u64 %rd24, %rd19, %rd23; + ld.global.s32 %r16, [%rd24+0]; + add.u64 %rd25, %rd19, %rd24; + ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; + setp.ne.u64 %p5, %rd26, %rd22; + @%p5 bra $Lt_1_23554; + .loc 16 275 0 + cvt.s32.s64 %r17, %rd18; + mul.lo.s32 %r18, %r17, %r16; + cvt.s64.s32 %rd27, %r18; + mul.wide.s32 %rd28, %r18, 4; + add.u64 %rd29, %rd25, %rd28; + .loc 16 276 0 + mul.lo.s32 %r19, %r10, %r17; + cvt.s64.s32 %rd30, %r19; + mul.wide.s32 %rd31, %r19, 4; + add.u64 %rd32, %rd25, %rd31; + .loc 16 277 0 + mul.lo.s32 %r20, %r17, %r6; + bra.uni $Lt_1_23298; +$Lt_1_23554: + .loc 16 279 0 + ld.global.s32 %r21, [%rd25+0]; + cvt.s64.s32 %rd33, %r21; + mul.wide.s32 %rd34, %r21, 4; + add.u64 %rd35, %rd26, %rd34; + .loc 16 280 0 + cvt.s64.s32 %rd36, %r16; + mul.wide.s32 %rd37, %r16, 4; + add.u64 %rd29, %rd35, %rd37; + .loc 16 281 0 + 
mov.s32 %r20, %r6; + .loc 16 282 0 + cvt.s64.s32 %rd38, %r10; + mul.wide.s32 %rd39, %r10, 4; + add.u64 %rd32, %rd35, %rd39; +$Lt_1_23298: + .loc 16 285 0 + ld.global.s32 %r22, [%rd23+0]; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + mov.s32 %r28, 0; + mov.u32 %r29, %r28; + tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r23,%r25,%r27,%r29}]; + mov.f32 %f24, %f20; + mov.f32 %f25, %f21; + mov.f32 %f26, %f22; + mov.f32 %f27, %f23; + setp.ge.u64 %p6, %rd32, %rd29; + @%p6 bra $Lt_1_32002; + cvt.rzi.ftz.s32.f32 %r30, %f27; + cvt.s64.s32 %rd40, %r20; + mul.lo.s32 %r31, %r30, 11; + cvt.rn.f32.s32 %f28, %r31; + mov.f32 %f29, 0f00000000; // 0 + mov.f32 %f30, 0f00000000; // 0 + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 +$Lt_1_24322: + //<loop> Loop body line 285, nesting depth: 1, estimated iterations: unknown + .loc 16 292 0 + ld.global.s32 %r32, [%rd32+0]; + .loc 16 293 0 + shr.s32 %r33, %r32, 30; + and.b32 %r34, %r33, 3; + cvt.s64.s32 %rd41, %r34; + mul.wide.s32 %rd42, %r34, 4; + add.u64 %rd43, %rd1, %rd42; + ld.shared.f32 %f33, [%rd43+0]; + .loc 16 296 0 + and.b32 %r35, %r32, 1073741823; + mov.u32 %r36, %r35; + mov.s32 %r37, 0; + mov.u32 %r38, %r37; + mov.s32 %r39, 0; + mov.u32 %r40, %r39; + mov.s32 %r41, 0; + mov.u32 %r42, %r41; + tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r36,%r38,%r40,%r42}]; + mov.f32 %f38, %f34; + mov.f32 %f39, %f35; + mov.f32 %f40, %f36; + mov.f32 %f41, %f37; + sub.ftz.f32 %f42, %f25, %f39; + sub.ftz.f32 %f43, %f24, %f38; + sub.ftz.f32 %f44, %f26, %f40; + mul.ftz.f32 %f45, %f42, %f42; + fma.rn.ftz.f32 %f46, %f43, %f43, %f45; + fma.rn.ftz.f32 %f47, %f44, %f44, %f46; + add.ftz.f32 %f48, %f28, %f41; + cvt.rzi.ftz.s32.f32 %r43, %f48; + cvt.s64.s32 %rd44, %r43; + mul.wide.s32 %rd45, %r43, 16; + add.u64 %rd46, %rd7, %rd45; + ld.shared.f32 %f49, [%rd46+0]; + setp.gt.ftz.f32 %p7, %f49, %f47; + @!%p7 bra $Lt_1_25602; + .loc 16 307 0 + sqrt.approx.ftz.f32 %f50, %f47; + ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd46+0]; + sub.ftz.f32 %f54, %f50, %f52; + .loc 16 308 0 + mul.ftz.f32 %f55, %f53, %f54; + neg.ftz.f32 %f56, %f55; + .loc 16 310 0 + mov.f32 %f57, 0f3fb8aa3b; // 1.4427 + mul.ftz.f32 %f58, %f56, %f57; + ex2.approx.ftz.f32 %f59, %f58; + mul.ftz.f32 %f60, %f59, %f59; + sub.ftz.f32 %f61, %f60, %f59; + mul.ftz.f32 %f62, %f51, %f61; + .loc 16 312 0 + div.approx.ftz.f32 %f63, %f62, %f50; + mul.ftz.f32 %f64, %f63, %f33; + fma.rn.ftz.f32 %f31, %f43, %f64, %f31; + .loc 16 313 0 + fma.rn.ftz.f32 %f30, %f42, %f64, %f30; + .loc 16 314 0 + fma.rn.ftz.f32 %f29, %f44, %f64, %f29; + ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r45, 0; + setp.le.s32 %p8, %r44, %r45; + @%p8 bra $Lt_1_25090; + .loc 16 317 0 + mul.lo.u64 %rd47, %rd44, 8; + add.u64 %rd48, %rd13, %rd47; + ld.shared.v2.f32 {%f65,%f66}, [%rd48+0]; + sub.ftz.f32 %f67, %f61, %f59; + mul.ftz.f32 %f68, %f65, %f67; + sub.ftz.f32 %f69, %f68, %f66; + .loc 16 318 0 + fma.rn.ftz.f32 %f32, %f33, %f69, %f32; +$Lt_1_25090: + ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r47, 0; + setp.le.s32 %p9, %r46, %r47; + @%p9 bra $Lt_1_25602; + .loc 16 321 0 + mov.f32 %f70, %f9; + mul.ftz.f32 %f71, %f43, %f43; + fma.rn.ftz.f32 %f72, %f64, %f71, %f70; + mov.f32 %f9, %f72; + .loc 16 322 0 + mov.f32 %f73, %f11; + fma.rn.ftz.f32 %f74, %f64, %f45, %f73; + mov.f32 %f11, %f74; + .loc 16 323 0 + mov.f32 %f75, %f13; + mul.ftz.f32 %f76, %f44, %f44; + fma.rn.ftz.f32 %f77, %f64, %f76, %f75; + mov.f32 %f13, %f77; + .loc 16 324 0 + mov.f32 
%f78, %f15; + mul.ftz.f32 %f79, %f42, %f43; + fma.rn.ftz.f32 %f80, %f64, %f79, %f78; + mov.f32 %f15, %f80; + .loc 16 325 0 + mov.f32 %f81, %f17; + mul.ftz.f32 %f82, %f43, %f44; + fma.rn.ftz.f32 %f83, %f64, %f82, %f81; + mov.f32 %f17, %f83; + .loc 16 326 0 + mul.ftz.f32 %f84, %f42, %f44; + fma.rn.ftz.f32 %f18, %f64, %f84, %f18; + mov.f32 %f19, %f18; +$Lt_1_25602: +$Lt_1_24578: + .loc 16 290 0 + mul.lo.u64 %rd49, %rd40, 4; + add.u64 %rd32, %rd32, %rd49; + setp.lt.u64 %p10, %rd32, %rd29; + @%p10 bra $Lt_1_24322; + bra.uni $Lt_1_22786; +$Lt_1_32002: + mov.f32 %f29, 0f00000000; // 0 + mov.f32 %f30, 0f00000000; // 0 + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 + bra.uni $Lt_1_22786; +$Lt_1_23042: + mov.f32 %f29, 0f00000000; // 0 + mov.f32 %f30, 0f00000000; // 0 + mov.f32 %f31, 0f00000000; // 0 + mov.f32 %f32, 0f00000000; // 0 +$Lt_1_22786: + mov.u32 %r48, 1; + setp.le.s32 %p11, %r6, %r48; + @%p11 bra $Lt_1_28418; + .loc 16 337 0 + mov.u64 %rd50, __cuda___cuda_local_var_32738_35_non_const_red_acc6200; + cvt.s64.s32 %rd51, %r1; + mul.wide.s32 %rd52, %r1, 4; + add.u64 %rd53, %rd50, %rd52; + mov.f32 %f85, %f31; + st.shared.f32 [%rd53+0], %f85; + .loc 16 338 0 + mov.f32 %f86, %f30; + st.shared.f32 [%rd53+512], %f86; + .loc 16 339 0 + mov.f32 %f87, %f29; + st.shared.f32 [%rd53+1024], %f87; + .loc 16 340 0 + mov.f32 %f88, %f32; + st.shared.f32 [%rd53+1536], %f88; + .loc 16 342 0 + shr.s32 %r49, %r6, 31; + mov.s32 %r50, 1; + and.b32 %r51, %r49, %r50; + add.s32 %r52, %r51, %r6; + shr.s32 %r53, %r52, 1; + mov.s32 %r54, %r53; + mov.u32 %r55, 0; + setp.ne.u32 %p12, %r53, %r55; + @!%p12 bra $Lt_1_26882; +$Lt_1_27394: + setp.ge.u32 %p13, %r10, %r54; + @%p13 bra $Lt_1_27650; + .loc 16 345 0 + add.u32 %r56, %r1, %r54; + cvt.u64.u32 %rd54, %r56; + mul.wide.u32 %rd55, %r56, 4; + add.u64 %rd56, %rd50, %rd55; + ld.shared.f32 %f89, [%rd56+0]; + add.ftz.f32 %f85, %f89, %f85; + st.shared.f32 [%rd53+0], %f85; + ld.shared.f32 %f90, [%rd56+512]; + add.ftz.f32 %f86, %f90, %f86; + st.shared.f32 [%rd53+512], %f86; + ld.shared.f32 %f91, [%rd56+1024]; + add.ftz.f32 %f87, %f91, %f87; + st.shared.f32 [%rd53+1024], %f87; + ld.shared.f32 %f92, [%rd56+1536]; + add.ftz.f32 %f88, %f92, %f88; + st.shared.f32 [%rd53+1536], %f88; +$Lt_1_27650: + .loc 16 342 0 + shr.u32 %r54, %r54, 1; + mov.u32 %r57, 0; + setp.ne.u32 %p14, %r54, %r57; + @%p14 bra $Lt_1_27394; +$Lt_1_26882: + .loc 16 349 0 + mov.f32 %f31, %f85; + .loc 16 350 0 + mov.f32 %f30, %f86; + .loc 16 351 0 + mov.f32 %f29, %f87; + .loc 16 352 0 + mov.f32 %f32, %f88; + ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r59, 0; + setp.le.s32 %p15, %r58, %r59; + @%p15 bra $Lt_1_28418; + .loc 16 356 0 + mov.f32 %f85, %f9; + st.shared.f32 [%rd53+0], %f85; + mov.f32 %f86, %f11; + st.shared.f32 [%rd53+512], %f86; + mov.f32 %f87, %f13; + st.shared.f32 [%rd53+1024], %f87; + mov.f32 %f88, %f15; + st.shared.f32 [%rd53+1536], %f88; + mov.f32 %f93, %f17; + st.shared.f32 [%rd53+2048], %f93; + mov.f32 %f94, %f19; + st.shared.f32 [%rd53+2560], %f94; + .loc 16 358 0 + mov.s32 %r60, %r53; + @!%p12 bra $Lt_1_28930; +$Lt_1_29442: + setp.ge.u32 %p16, %r10, %r60; + @%p16 bra $Lt_1_29698; + .loc 16 361 0 + add.u32 %r61, %r1, %r60; + cvt.u64.u32 %rd57, %r61; + mul.wide.u32 %rd58, %r61, 4; + add.u64 %rd59, %rd50, %rd58; + ld.shared.f32 %f95, [%rd59+0]; + add.ftz.f32 %f85, %f95, %f85; + st.shared.f32 [%rd53+0], %f85; + ld.shared.f32 %f96, [%rd59+512]; + add.ftz.f32 %f86, %f96, %f86; + st.shared.f32 [%rd53+512], %f86; + ld.shared.f32 %f97, [%rd59+1024]; + add.ftz.f32 
%f87, %f97, %f87; + st.shared.f32 [%rd53+1024], %f87; + ld.shared.f32 %f98, [%rd59+1536]; + add.ftz.f32 %f88, %f98, %f88; + st.shared.f32 [%rd53+1536], %f88; + ld.shared.f32 %f99, [%rd59+2048]; + add.ftz.f32 %f93, %f99, %f93; + st.shared.f32 [%rd53+2048], %f93; + ld.shared.f32 %f100, [%rd59+2560]; + add.ftz.f32 %f94, %f100, %f94; + st.shared.f32 [%rd53+2560], %f94; +$Lt_1_29698: + .loc 16 358 0 + shr.u32 %r60, %r60, 1; + mov.u32 %r62, 0; + setp.ne.u32 %p17, %r60, %r62; + @%p17 bra $Lt_1_29442; +$Lt_1_28930: + .loc 16 366 0 + mov.f32 %f9, %f85; + mov.f32 %f11, %f86; + mov.f32 %f13, %f87; + mov.f32 %f15, %f88; + mov.f32 %f17, %f93; + mov.f32 %f19, %f94; +$Lt_1_28418: +$Lt_1_26370: + selp.s32 %r63, 1, 0, %p4; + mov.s32 %r64, 0; + set.eq.u32.s32 %r65, %r10, %r64; + neg.s32 %r66, %r65; + and.b32 %r67, %r63, %r66; + mov.u32 %r68, 0; + setp.eq.s32 %p18, %r67, %r68; + @%p18 bra $Lt_1_30466; + .loc 16 372 0 + cvt.s64.s32 %rd60, %r13; + ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast_engv]; + mul.wide.s32 %rd62, %r13, 4; + add.u64 %rd63, %rd61, %rd62; + ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; + mov.u32 %r70, 0; + setp.le.s32 %p19, %r69, %r70; + @%p19 bra $Lt_1_30978; + .loc 16 374 0 + st.global.f32 [%rd63+0], %f32; + .loc 16 375 0 + cvt.s64.s32 %rd64, %r14; + mul.wide.s32 %rd65, %r14, 4; + add.u64 %rd63, %rd63, %rd65; +$Lt_1_30978: + ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; + mov.u32 %r72, 0; + setp.le.s32 %p20, %r71, %r72; + @%p20 bra $Lt_1_31490; + .loc 16 379 0 + mov.f32 %f101, %f9; + st.global.f32 [%rd63+0], %f101; + .loc 16 380 0 + cvt.s64.s32 %rd66, %r14; + mul.wide.s32 %rd67, %r14, 4; + add.u64 %rd68, %rd67, %rd63; + .loc 16 379 0 + mov.f32 %f102, %f11; + st.global.f32 [%rd68+0], %f102; + .loc 16 380 0 + add.u64 %rd69, %rd67, %rd68; + .loc 16 379 0 + mov.f32 %f103, %f13; + st.global.f32 [%rd69+0], %f103; + .loc 16 380 0 + add.u64 %rd70, %rd67, %rd69; + .loc 16 379 0 + mov.f32 %f104, %f15; + st.global.f32 [%rd70+0], %f104; + .loc 16 380 0 + add.u64 %rd63, %rd67, %rd70; + .loc 16 379 0 + mov.f32 %f105, %f17; + st.global.f32 [%rd63+0], %f105; + mov.f32 %f106, %f19; + add.u64 %rd71, %rd67, %rd63; + st.global.f32 [%rd71+0], %f106; +$Lt_1_31490: + .loc 16 383 0 + ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans]; + mul.lo.u64 %rd73, %rd60, 16; + add.u64 %rd74, %rd72, %rd73; + mov.f32 %f107, %f108; + st.global.v4.f32 [%rd74+0], {%f31,%f30,%f29,%f107}; +$Lt_1_30466: + .loc 16 385 0 + exit; +$LDWend_kernel_pair_fast: + } // kernel_pair_fast + diff --git a/lib/gpu/morse_gpu_ptx.h b/lib/gpu/morse_gpu_ptx.h new file mode 100644 index 000000000..063a0c811 --- /dev/null +++ b/lib/gpu/morse_gpu_ptx.h @@ -0,0 +1,947 @@ +const char * morse_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .entry kernel_pair (\n" +" .param .u64 __cudaparm_kernel_pair_x_,\n" +" .param .u64 __cudaparm_kernel_pair_mor1,\n" +" .param .u64 __cudaparm_kernel_pair_mor2,\n" +" .param .s32 __cudaparm_kernel_pair_lj_types,\n" +" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" +" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_ans,\n" +" .param .u64 __cudaparm_kernel_pair_engv,\n" +" .param .s32 __cudaparm_kernel_pair_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_inum,\n" +" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" +" {\n" +" .reg .u32 
%r<72>;\n" +" .reg .u64 %rd<63>;\n" +" .reg .f32 %f<104>;\n" +" .reg .f64 %fd<10>;\n" +" .reg .pred %p<19>;\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];\n" +" .loc 16 88 0\n" +"$LDWbegin_kernel_pair:\n" +" .loc 16 95 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" +" ldu.global.f32 %f1, [%rd1+0];\n" +" .loc 16 96 0\n" +" ld.global.f32 %f2, [%rd1+4];\n" +" .loc 16 97 0\n" +" ld.global.f32 %f3, [%rd1+8];\n" +" .loc 16 98 0\n" +" ld.global.f32 %f4, [%rd1+12];\n" +" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" +" .loc 16 107 0\n" +" mov.f32 %f5, 0f00000000; \n" +" mov.f32 %f6, %f5;\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, %f7;\n" +" mov.f32 %f9, 0f00000000; \n" +" mov.f32 %f10, %f9;\n" +" mov.f32 %f11, 0f00000000; \n" +" mov.f32 %f12, %f11;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, %f13;\n" +" mov.f32 %f15, 0f00000000; \n" +" mov.f32 %f16, %f15;\n" +" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" +" cvt.s32.u32 %r2, %tid.x;\n" +" div.s32 %r3, %r2, %r1;\n" +" cvt.s32.u32 %r4, %ntid.x;\n" +" div.s32 %r5, %r4, %r1;\n" +" rem.s32 %r6, %r2, %r1;\n" +" cvt.s32.u32 %r7, %ctaid.x;\n" +" mul.lo.s32 %r8, %r7, %r5;\n" +" add.s32 %r9, %r3, %r8;\n" +" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n" +" setp.lt.s32 %p1, %r9, %r10;\n" +" @!%p1 bra $Lt_0_19202;\n" +" .loc 16 113 0\n" +" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" +" cvt.s64.s32 %rd2, %r11;\n" +" mul.wide.s32 %rd3, %r11, 4;\n" +" cvt.s64.s32 %rd4, %r9;\n" +" mul.wide.s32 %rd5, %r9, 4;\n" +" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" +" add.u64 %rd7, %rd5, %rd6;\n" +" add.u64 %rd8, %rd3, %rd7;\n" +" ld.global.s32 %r12, [%rd8+0];\n" +" add.u64 %rd9, %rd3, %rd8;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n" +" setp.ne.u64 %p2, %rd10, %rd6;\n" +" @%p2 bra $Lt_0_19714;\n" +" .loc 16 119 0\n" +" cvt.s32.s64 %r13, %rd2;\n" +" mul.lo.s32 %r14, %r13, %r12;\n" +" cvt.s64.s32 %rd11, %r14;\n" +" mul.wide.s32 %rd12, %r14, 4;\n" +" add.u64 %rd13, %rd9, %rd12;\n" +" .loc 16 120 0\n" +" mul.lo.s32 %r15, %r6, %r13;\n" +" cvt.s64.s32 %rd14, %r15;\n" +" mul.wide.s32 %rd15, %r15, 4;\n" +" add.u64 %rd16, %rd9, %rd15;\n" +" .loc 16 121 0\n" +" mul.lo.s32 %r16, %r13, %r1;\n" +" bra.uni $Lt_0_19458;\n" +"$Lt_0_19714:\n" +" .loc 16 123 0\n" +" ld.global.s32 %r17, [%rd9+0];\n" +" cvt.s64.s32 %rd17, %r17;\n" +" mul.wide.s32 %rd18, %r17, 4;\n" +" add.u64 %rd19, %rd10, %rd18;\n" +" .loc 16 124 0\n" +" cvt.s64.s32 %rd20, %r12;\n" +" mul.wide.s32 %rd21, %r12, 4;\n" +" add.u64 %rd13, %rd19, %rd21;\n" +" .loc 16 125 0\n" +" mov.s32 %r16, %r1;\n" +" .loc 16 126 0\n" +" cvt.s64.s32 %rd22, %r6;\n" +" mul.wide.s32 %rd23, %r6, 4;\n" +" add.u64 %rd16, %rd19, %rd23;\n" +"$Lt_0_19458:\n" +" .loc 16 129 0\n" +" ld.global.s32 %r18, [%rd7+0];\n" +" mov.u32 %r19, %r18;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n" +" mov.f32 %f21, %f17;\n" +" mov.f32 %f22, %f18;\n" +" mov.f32 %f23, %f19;\n" +" mov.f32 %f24, %f20;\n" +" setp.ge.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_28162;\n" +" cvt.rzi.ftz.s32.f32 %r26, %f24;\n" +" cvt.s64.s32 %rd24, %r16;\n" +" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n" +" mul.lo.s32 %r28, %r27, 
%r26;\n" +" ld.param.u64 %rd25, [__cudaparm_kernel_pair_mor1];\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n" +"$Lt_0_20482:\n" +" .loc 16 135 0\n" +" ld.global.s32 %r29, [%rd16+0];\n" +" .loc 16 136 0\n" +" shr.s32 %r30, %r29, 30;\n" +" and.b32 %r31, %r30, 3;\n" +" cvt.s64.s32 %rd27, %r31;\n" +" mul.wide.s32 %rd28, %r31, 4;\n" +" add.u64 %rd29, %rd26, %rd28;\n" +" ld.shared.f32 %f29, [%rd29+0];\n" +" .loc 16 139 0\n" +" and.b32 %r32, %r29, 1073741823;\n" +" mov.u32 %r33, %r32;\n" +" mov.s32 %r34, 0;\n" +" mov.u32 %r35, %r34;\n" +" mov.s32 %r36, 0;\n" +" mov.u32 %r37, %r36;\n" +" mov.s32 %r38, 0;\n" +" mov.u32 %r39, %r38;\n" +" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n" +" mov.f32 %f34, %f30;\n" +" mov.f32 %f35, %f31;\n" +" mov.f32 %f36, %f32;\n" +" mov.f32 %f37, %f33;\n" +" cvt.rzi.ftz.s32.f32 %r40, %f37;\n" +" sub.ftz.f32 %f38, %f22, %f35;\n" +" sub.ftz.f32 %f39, %f21, %f34;\n" +" sub.ftz.f32 %f40, %f23, %f36;\n" +" mul.ftz.f32 %f41, %f38, %f38;\n" +" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" +" add.s32 %r41, %r40, %r28;\n" +" cvt.s64.s32 %rd30, %r41;\n" +" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" +" mul.wide.s32 %rd31, %r41, 16;\n" +" add.u64 %rd32, %rd25, %rd31;\n" +" ld.global.f32 %f44, [%rd32+0];\n" +" setp.gt.ftz.f32 %p4, %f44, %f43;\n" +" @!%p4 bra $Lt_0_21762;\n" +" .loc 16 152 0\n" +" sqrt.approx.ftz.f32 %f45, %f43;\n" +" ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd32+0];\n" +" sub.ftz.f32 %f49, %f45, %f47;\n" +" mul.ftz.f32 %f50, %f48, %f49;\n" +" neg.ftz.f32 %f51, %f50;\n" +" .loc 16 154 0\n" +" mov.f32 %f52, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f53, %f51, %f52;\n" +" ex2.approx.ftz.f32 %f54, %f53;\n" +" mul.ftz.f32 %f55, %f54, %f54;\n" +" sub.ftz.f32 %f56, %f55, %f54;\n" +" mul.ftz.f32 %f57, %f46, %f56;\n" +" .loc 16 156 0\n" +" div.approx.ftz.f32 %f58, %f57, %f45;\n" +" mul.ftz.f32 %f59, %f58, %f29;\n" +" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n" +" .loc 16 157 0\n" +" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n" +" .loc 16 158 0\n" +" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n" +" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r43, 0;\n" +" setp.le.s32 %p5, %r42, %r43;\n" +" @%p5 bra $Lt_0_21250;\n" +" .loc 16 162 0\n" +" cvt.ftz.f64.f32 %fd1, %f54;\n" +" ld.param.u64 %rd33, [__cudaparm_kernel_pair_mor2];\n" +" mul.lo.u64 %rd34, %rd30, 8;\n" +" add.u64 %rd35, %rd33, %rd34;\n" +" ld.global.v2.f32 {%f60,%f61}, [%rd35+0];\n" +" cvt.ftz.f64.f32 %fd2, %f61;\n" +" cvt.ftz.f64.f32 %fd3, %f60;\n" +" mul.ftz.f32 %f62, %f54, %f54;\n" +" cvt.ftz.f64.f32 %fd4, %f62;\n" +" add.f64 %fd5, %fd1, %fd1;\n" +" sub.f64 %fd6, %fd4, %fd5;\n" +" mul.f64 %fd7, %fd3, %fd6;\n" +" sub.f64 %fd8, %fd7, %fd2;\n" +" cvt.rn.ftz.f32.f64 %f63, %fd8;\n" +" fma.rn.ftz.f32 %f28, %f29, %f63, %f28;\n" +"$Lt_0_21250:\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p6, %r44, %r45;\n" +" @%p6 bra $Lt_0_21762;\n" +" .loc 16 165 0\n" +" mov.f32 %f64, %f6;\n" +" mul.ftz.f32 %f65, %f39, %f39;\n" +" fma.rn.ftz.f32 %f66, %f59, %f65, %f64;\n" +" mov.f32 %f6, %f66;\n" +" .loc 16 166 0\n" +" mov.f32 %f67, %f8;\n" +" fma.rn.ftz.f32 %f68, %f59, %f41, %f67;\n" +" mov.f32 %f8, %f68;\n" +" .loc 16 167 0\n" +" mov.f32 %f69, %f10;\n" +" mul.ftz.f32 %f70, %f40, %f40;\n" +" fma.rn.ftz.f32 %f71, %f59, %f70, %f69;\n" +" mov.f32 %f10, %f71;\n" +" .loc 16 168 0\n" +" mov.f32 %f72, %f12;\n" 
+" mul.ftz.f32 %f73, %f38, %f39;\n" +" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n" +" mov.f32 %f12, %f74;\n" +" .loc 16 169 0\n" +" mov.f32 %f75, %f14;\n" +" mul.ftz.f32 %f76, %f39, %f40;\n" +" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n" +" mov.f32 %f14, %f77;\n" +" .loc 16 170 0\n" +" mul.ftz.f32 %f78, %f38, %f40;\n" +" fma.rn.ftz.f32 %f15, %f59, %f78, %f15;\n" +" mov.f32 %f16, %f15;\n" +"$Lt_0_21762:\n" +"$Lt_0_20738:\n" +" .loc 16 133 0\n" +" mul.lo.u64 %rd36, %rd24, 4;\n" +" add.u64 %rd16, %rd16, %rd36;\n" +" setp.lt.u64 %p7, %rd16, %rd13;\n" +" @%p7 bra $Lt_0_20482;\n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_28162:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +" bra.uni $Lt_0_18946;\n" +"$Lt_0_19202:\n" +" mov.f32 %f25, 0f00000000; \n" +" mov.f32 %f26, 0f00000000; \n" +" mov.f32 %f27, 0f00000000; \n" +" mov.f32 %f28, 0f00000000; \n" +"$Lt_0_18946:\n" +" mov.u32 %r46, 1;\n" +" setp.le.s32 %p8, %r1, %r46;\n" +" @%p8 bra $Lt_0_24578;\n" +" .loc 16 181 0\n" +" mov.u64 %rd37, __cuda___cuda_local_var_32582_35_non_const_red_acc108;\n" +" cvt.s64.s32 %rd38, %r2;\n" +" mul.wide.s32 %rd39, %r2, 4;\n" +" add.u64 %rd40, %rd37, %rd39;\n" +" mov.f32 %f79, %f27;\n" +" st.shared.f32 [%rd40+0], %f79;\n" +" .loc 16 182 0\n" +" mov.f32 %f80, %f26;\n" +" st.shared.f32 [%rd40+512], %f80;\n" +" .loc 16 183 0\n" +" mov.f32 %f81, %f25;\n" +" st.shared.f32 [%rd40+1024], %f81;\n" +" .loc 16 184 0\n" +" mov.f32 %f82, %f28;\n" +" st.shared.f32 [%rd40+1536], %f82;\n" +" .loc 16 186 0\n" +" shr.s32 %r47, %r1, 31;\n" +" mov.s32 %r48, 1;\n" +" and.b32 %r49, %r47, %r48;\n" +" add.s32 %r50, %r49, %r1;\n" +" shr.s32 %r51, %r50, 1;\n" +" mov.s32 %r52, %r51;\n" +" mov.u32 %r53, 0;\n" +" setp.ne.u32 %p9, %r51, %r53;\n" +" @!%p9 bra $Lt_0_23042;\n" +"$Lt_0_23554:\n" +" setp.ge.u32 %p10, %r6, %r52;\n" +" @%p10 bra $Lt_0_23810;\n" +" .loc 16 189 0\n" +" add.u32 %r54, %r2, %r52;\n" +" cvt.u64.u32 %rd41, %r54;\n" +" mul.wide.u32 %rd42, %r54, 4;\n" +" add.u64 %rd43, %rd37, %rd42;\n" +" ld.shared.f32 %f83, [%rd43+0];\n" +" add.ftz.f32 %f79, %f83, %f79;\n" +" st.shared.f32 [%rd40+0], %f79;\n" +" ld.shared.f32 %f84, [%rd43+512];\n" +" add.ftz.f32 %f80, %f84, %f80;\n" +" st.shared.f32 [%rd40+512], %f80;\n" +" ld.shared.f32 %f85, [%rd43+1024];\n" +" add.ftz.f32 %f81, %f85, %f81;\n" +" st.shared.f32 [%rd40+1024], %f81;\n" +" ld.shared.f32 %f86, [%rd43+1536];\n" +" add.ftz.f32 %f82, %f86, %f82;\n" +" st.shared.f32 [%rd40+1536], %f82;\n" +"$Lt_0_23810:\n" +" .loc 16 186 0\n" +" shr.u32 %r52, %r52, 1;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p11, %r52, %r55;\n" +" @%p11 bra $Lt_0_23554;\n" +"$Lt_0_23042:\n" +" .loc 16 193 0\n" +" mov.f32 %f27, %f79;\n" +" .loc 16 194 0\n" +" mov.f32 %f26, %f80;\n" +" .loc 16 195 0\n" +" mov.f32 %f25, %f81;\n" +" .loc 16 196 0\n" +" mov.f32 %f28, %f82;\n" +" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r57, 0;\n" +" setp.le.s32 %p12, %r56, %r57;\n" +" @%p12 bra $Lt_0_24578;\n" +" .loc 16 200 0\n" +" mov.f32 %f79, %f6;\n" +" st.shared.f32 [%rd40+0], %f79;\n" +" mov.f32 %f80, %f8;\n" +" st.shared.f32 [%rd40+512], %f80;\n" +" mov.f32 %f81, %f10;\n" +" st.shared.f32 [%rd40+1024], %f81;\n" +" mov.f32 %f82, %f12;\n" +" st.shared.f32 [%rd40+1536], %f82;\n" +" mov.f32 %f87, %f14;\n" +" st.shared.f32 [%rd40+2048], %f87;\n" +" mov.f32 %f88, %f16;\n" +" st.shared.f32 [%rd40+2560], %f88;\n" +" .loc 16 202 0\n" +" mov.s32 %r58, %r51;\n" +" @!%p9 bra $Lt_0_25090;\n" +"$Lt_0_25602:\n" +" setp.ge.u32 %p13, %r6, 
%r58;\n" +" @%p13 bra $Lt_0_25858;\n" +" .loc 16 205 0\n" +" add.u32 %r59, %r2, %r58;\n" +" cvt.u64.u32 %rd44, %r59;\n" +" mul.wide.u32 %rd45, %r59, 4;\n" +" add.u64 %rd46, %rd37, %rd45;\n" +" ld.shared.f32 %f89, [%rd46+0];\n" +" add.ftz.f32 %f79, %f89, %f79;\n" +" st.shared.f32 [%rd40+0], %f79;\n" +" ld.shared.f32 %f90, [%rd46+512];\n" +" add.ftz.f32 %f80, %f90, %f80;\n" +" st.shared.f32 [%rd40+512], %f80;\n" +" ld.shared.f32 %f91, [%rd46+1024];\n" +" add.ftz.f32 %f81, %f91, %f81;\n" +" st.shared.f32 [%rd40+1024], %f81;\n" +" ld.shared.f32 %f92, [%rd46+1536];\n" +" add.ftz.f32 %f82, %f92, %f82;\n" +" st.shared.f32 [%rd40+1536], %f82;\n" +" ld.shared.f32 %f93, [%rd46+2048];\n" +" add.ftz.f32 %f87, %f93, %f87;\n" +" st.shared.f32 [%rd40+2048], %f87;\n" +" ld.shared.f32 %f94, [%rd46+2560];\n" +" add.ftz.f32 %f88, %f94, %f88;\n" +" st.shared.f32 [%rd40+2560], %f88;\n" +"$Lt_0_25858:\n" +" .loc 16 202 0\n" +" shr.u32 %r58, %r58, 1;\n" +" mov.u32 %r60, 0;\n" +" setp.ne.u32 %p14, %r58, %r60;\n" +" @%p14 bra $Lt_0_25602;\n" +"$Lt_0_25090:\n" +" .loc 16 210 0\n" +" mov.f32 %f6, %f79;\n" +" mov.f32 %f8, %f80;\n" +" mov.f32 %f10, %f81;\n" +" mov.f32 %f12, %f82;\n" +" mov.f32 %f14, %f87;\n" +" mov.f32 %f16, %f88;\n" +"$Lt_0_24578:\n" +"$Lt_0_22530:\n" +" selp.s32 %r61, 1, 0, %p1;\n" +" mov.s32 %r62, 0;\n" +" set.eq.u32.s32 %r63, %r6, %r62;\n" +" neg.s32 %r64, %r63;\n" +" and.b32 %r65, %r61, %r64;\n" +" mov.u32 %r66, 0;\n" +" setp.eq.s32 %p15, %r65, %r66;\n" +" @%p15 bra $Lt_0_26626;\n" +" .loc 16 216 0\n" +" cvt.s64.s32 %rd47, %r9;\n" +" ld.param.u64 %rd48, [__cudaparm_kernel_pair_engv];\n" +" mul.wide.s32 %rd49, %r9, 4;\n" +" add.u64 %rd50, %rd48, %rd49;\n" +" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" +" mov.u32 %r68, 0;\n" +" setp.le.s32 %p16, %r67, %r68;\n" +" @%p16 bra $Lt_0_27138;\n" +" .loc 16 218 0\n" +" st.global.f32 [%rd50+0], %f28;\n" +" .loc 16 219 0\n" +" cvt.s64.s32 %rd51, %r10;\n" +" mul.wide.s32 %rd52, %r10, 4;\n" +" add.u64 %rd50, %rd50, %rd52;\n" +"$Lt_0_27138:\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" +" mov.u32 %r70, 0;\n" +" setp.le.s32 %p17, %r69, %r70;\n" +" @%p17 bra $Lt_0_27650;\n" +" .loc 16 223 0\n" +" mov.f32 %f95, %f6;\n" +" st.global.f32 [%rd50+0], %f95;\n" +" .loc 16 224 0\n" +" cvt.s64.s32 %rd53, %r10;\n" +" mul.wide.s32 %rd54, %r10, 4;\n" +" add.u64 %rd55, %rd54, %rd50;\n" +" .loc 16 223 0\n" +" mov.f32 %f96, %f8;\n" +" st.global.f32 [%rd55+0], %f96;\n" +" .loc 16 224 0\n" +" add.u64 %rd56, %rd54, %rd55;\n" +" .loc 16 223 0\n" +" mov.f32 %f97, %f10;\n" +" st.global.f32 [%rd56+0], %f97;\n" +" .loc 16 224 0\n" +" add.u64 %rd57, %rd54, %rd56;\n" +" .loc 16 223 0\n" +" mov.f32 %f98, %f12;\n" +" st.global.f32 [%rd57+0], %f98;\n" +" .loc 16 224 0\n" +" add.u64 %rd50, %rd54, %rd57;\n" +" .loc 16 223 0\n" +" mov.f32 %f99, %f14;\n" +" st.global.f32 [%rd50+0], %f99;\n" +" mov.f32 %f100, %f16;\n" +" add.u64 %rd58, %rd54, %rd50;\n" +" st.global.f32 [%rd58+0], %f100;\n" +"$Lt_0_27650:\n" +" .loc 16 227 0\n" +" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n" +" mul.lo.u64 %rd60, %rd47, 16;\n" +" add.u64 %rd61, %rd59, %rd60;\n" +" mov.f32 %f101, %f102;\n" +" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f101};\n" +"$Lt_0_26626:\n" +" .loc 16 229 0\n" +" exit;\n" +"$LDWend_kernel_pair:\n" +" }\n" +" .entry kernel_pair_fast (\n" +" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" +" .param .u64 __cudaparm_kernel_pair_fast_mor1_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_mor2_in,\n" +" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" +" .param 
.u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" +" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" +" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" +" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" +" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" +" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" +" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" +" {\n" +" .reg .u32 %r<74>;\n" +" .reg .u64 %rd<76>;\n" +" .reg .f32 %f<110>;\n" +" .reg .pred %p<22>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_mor13296[1936];\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32647_34_non_const_mor25232[968];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32738_35_non_const_red_acc6200[3072];\n" +" .loc 16 237 0\n" +"$LDWbegin_kernel_pair_fast:\n" +" cvt.s32.u32 %r1, %tid.x;\n" +" mov.u32 %r2, 3;\n" +" setp.gt.s32 %p1, %r1, %r2;\n" +" @%p1 bra $Lt_1_21250;\n" +" .loc 16 247 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;\n" +" cvt.s64.s32 %rd2, %r1;\n" +" mul.wide.s32 %rd3, %r1, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_21250:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;\n" +" mov.u32 %r3, 120;\n" +" setp.gt.s32 %p2, %r1, %r3;\n" +" @%p2 bra $Lt_1_21762;\n" +" .loc 16 249 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296;\n" +" cvt.s64.s32 %rd8, %r1;\n" +" mul.wide.s32 %rd9, %r1, 16;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in];\n" +" add.u64 %rd11, %rd10, %rd9;\n" +" add.u64 %rd12, %rd9, %rd7;\n" +" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" +" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" +" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r5, 0;\n" +" setp.le.s32 %p3, %r4, %r5;\n" +" @%p3 bra $Lt_1_22274;\n" +" .loc 16 251 0\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;\n" +" mul.lo.u64 %rd14, %rd8, 8;\n" +" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in];\n" +" add.u64 %rd16, %rd15, %rd14;\n" +" add.u64 %rd17, %rd14, %rd13;\n" +" ld.global.v2.f32 {%f6,%f7}, [%rd16+0];\n" +" st.shared.v2.f32 [%rd17+0], {%f6,%f7};\n" +"$Lt_1_22274:\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;\n" +"$Lt_1_21762:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296;\n" +" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;\n" +" .loc 16 261 0\n" +" mov.f32 %f8, 0f00000000; \n" +" mov.f32 %f9, %f8;\n" +" mov.f32 %f10, 0f00000000; \n" +" mov.f32 %f11, %f10;\n" +" mov.f32 %f12, 0f00000000; \n" +" mov.f32 %f13, %f12;\n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, %f14;\n" +" mov.f32 %f16, 0f00000000; \n" +" mov.f32 %f17, %f16;\n" +" mov.f32 %f18, 0f00000000; \n" +" mov.f32 %f19, %f18;\n" +" .loc 16 263 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" +" div.s32 %r7, %r1, %r6;\n" +" cvt.s32.u32 %r8, %ntid.x;\n" +" div.s32 %r9, %r8, %r6;\n" +" rem.s32 %r10, %r1, %r6;\n" +" cvt.s32.u32 %r11, %ctaid.x;\n" +" mul.lo.s32 %r12, %r11, %r9;\n" +" add.s32 %r13, %r7, %r12;\n" +" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n" +" setp.lt.s32 %p4, 
%r13, %r14;\n" +" @!%p4 bra $Lt_1_23042;\n" +" .loc 16 269 0\n" +" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" +" cvt.s64.s32 %rd18, %r15;\n" +" mul.wide.s32 %rd19, %r15, 4;\n" +" cvt.s64.s32 %rd20, %r13;\n" +" mul.wide.s32 %rd21, %r13, 4;\n" +" ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor];\n" +" add.u64 %rd23, %rd21, %rd22;\n" +" add.u64 %rd24, %rd19, %rd23;\n" +" ld.global.s32 %r16, [%rd24+0];\n" +" add.u64 %rd25, %rd19, %rd24;\n" +" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" +" setp.ne.u64 %p5, %rd26, %rd22;\n" +" @%p5 bra $Lt_1_23554;\n" +" .loc 16 275 0\n" +" cvt.s32.s64 %r17, %rd18;\n" +" mul.lo.s32 %r18, %r17, %r16;\n" +" cvt.s64.s32 %rd27, %r18;\n" +" mul.wide.s32 %rd28, %r18, 4;\n" +" add.u64 %rd29, %rd25, %rd28;\n" +" .loc 16 276 0\n" +" mul.lo.s32 %r19, %r10, %r17;\n" +" cvt.s64.s32 %rd30, %r19;\n" +" mul.wide.s32 %rd31, %r19, 4;\n" +" add.u64 %rd32, %rd25, %rd31;\n" +" .loc 16 277 0\n" +" mul.lo.s32 %r20, %r17, %r6;\n" +" bra.uni $Lt_1_23298;\n" +"$Lt_1_23554:\n" +" .loc 16 279 0\n" +" ld.global.s32 %r21, [%rd25+0];\n" +" cvt.s64.s32 %rd33, %r21;\n" +" mul.wide.s32 %rd34, %r21, 4;\n" +" add.u64 %rd35, %rd26, %rd34;\n" +" .loc 16 280 0\n" +" cvt.s64.s32 %rd36, %r16;\n" +" mul.wide.s32 %rd37, %r16, 4;\n" +" add.u64 %rd29, %rd35, %rd37;\n" +" .loc 16 281 0\n" +" mov.s32 %r20, %r6;\n" +" .loc 16 282 0\n" +" cvt.s64.s32 %rd38, %r10;\n" +" mul.wide.s32 %rd39, %r10, 4;\n" +" add.u64 %rd32, %rd35, %rd39;\n" +"$Lt_1_23298:\n" +" .loc 16 285 0\n" +" ld.global.s32 %r22, [%rd23+0];\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" mov.s32 %r28, 0;\n" +" mov.u32 %r29, %r28;\n" +" tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r23,%r25,%r27,%r29}];\n" +" mov.f32 %f24, %f20;\n" +" mov.f32 %f25, %f21;\n" +" mov.f32 %f26, %f22;\n" +" mov.f32 %f27, %f23;\n" +" setp.ge.u64 %p6, %rd32, %rd29;\n" +" @%p6 bra $Lt_1_32002;\n" +" cvt.rzi.ftz.s32.f32 %r30, %f27;\n" +" cvt.s64.s32 %rd40, %r20;\n" +" mul.lo.s32 %r31, %r30, 11;\n" +" cvt.rn.f32.s32 %f28, %r31;\n" +" mov.f32 %f29, 0f00000000; \n" +" mov.f32 %f30, 0f00000000; \n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +"$Lt_1_24322:\n" +" .loc 16 292 0\n" +" ld.global.s32 %r32, [%rd32+0];\n" +" .loc 16 293 0\n" +" shr.s32 %r33, %r32, 30;\n" +" and.b32 %r34, %r33, 3;\n" +" cvt.s64.s32 %rd41, %r34;\n" +" mul.wide.s32 %rd42, %r34, 4;\n" +" add.u64 %rd43, %rd1, %rd42;\n" +" ld.shared.f32 %f33, [%rd43+0];\n" +" .loc 16 296 0\n" +" and.b32 %r35, %r32, 1073741823;\n" +" mov.u32 %r36, %r35;\n" +" mov.s32 %r37, 0;\n" +" mov.u32 %r38, %r37;\n" +" mov.s32 %r39, 0;\n" +" mov.u32 %r40, %r39;\n" +" mov.s32 %r41, 0;\n" +" mov.u32 %r42, %r41;\n" +" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r36,%r38,%r40,%r42}];\n" +" mov.f32 %f38, %f34;\n" +" mov.f32 %f39, %f35;\n" +" mov.f32 %f40, %f36;\n" +" mov.f32 %f41, %f37;\n" +" sub.ftz.f32 %f42, %f25, %f39;\n" +" sub.ftz.f32 %f43, %f24, %f38;\n" +" sub.ftz.f32 %f44, %f26, %f40;\n" +" mul.ftz.f32 %f45, %f42, %f42;\n" +" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n" +" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n" +" add.ftz.f32 %f48, %f28, %f41;\n" +" cvt.rzi.ftz.s32.f32 %r43, %f48;\n" +" cvt.s64.s32 %rd44, %r43;\n" +" mul.wide.s32 %rd45, %r43, 16;\n" +" add.u64 %rd46, %rd7, %rd45;\n" +" ld.shared.f32 %f49, [%rd46+0];\n" +" setp.gt.ftz.f32 %p7, %f49, %f47;\n" +" @!%p7 bra $Lt_1_25602;\n" +" .loc 16 307 0\n" +" sqrt.approx.ftz.f32 %f50, %f47;\n" +" ld.shared.v4.f32 
{_,%f51,%f52,%f53}, [%rd46+0];\n" +" sub.ftz.f32 %f54, %f50, %f52;\n" +" .loc 16 308 0\n" +" mul.ftz.f32 %f55, %f53, %f54;\n" +" neg.ftz.f32 %f56, %f55;\n" +" .loc 16 310 0\n" +" mov.f32 %f57, 0f3fb8aa3b; \n" +" mul.ftz.f32 %f58, %f56, %f57;\n" +" ex2.approx.ftz.f32 %f59, %f58;\n" +" mul.ftz.f32 %f60, %f59, %f59;\n" +" sub.ftz.f32 %f61, %f60, %f59;\n" +" mul.ftz.f32 %f62, %f51, %f61;\n" +" .loc 16 312 0\n" +" div.approx.ftz.f32 %f63, %f62, %f50;\n" +" mul.ftz.f32 %f64, %f63, %f33;\n" +" fma.rn.ftz.f32 %f31, %f43, %f64, %f31;\n" +" .loc 16 313 0\n" +" fma.rn.ftz.f32 %f30, %f42, %f64, %f30;\n" +" .loc 16 314 0\n" +" fma.rn.ftz.f32 %f29, %f44, %f64, %f29;\n" +" ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r45, 0;\n" +" setp.le.s32 %p8, %r44, %r45;\n" +" @%p8 bra $Lt_1_25090;\n" +" .loc 16 317 0\n" +" mul.lo.u64 %rd47, %rd44, 8;\n" +" add.u64 %rd48, %rd13, %rd47;\n" +" ld.shared.v2.f32 {%f65,%f66}, [%rd48+0];\n" +" sub.ftz.f32 %f67, %f61, %f59;\n" +" mul.ftz.f32 %f68, %f65, %f67;\n" +" sub.ftz.f32 %f69, %f68, %f66;\n" +" .loc 16 318 0\n" +" fma.rn.ftz.f32 %f32, %f33, %f69, %f32;\n" +"$Lt_1_25090:\n" +" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r47, 0;\n" +" setp.le.s32 %p9, %r46, %r47;\n" +" @%p9 bra $Lt_1_25602;\n" +" .loc 16 321 0\n" +" mov.f32 %f70, %f9;\n" +" mul.ftz.f32 %f71, %f43, %f43;\n" +" fma.rn.ftz.f32 %f72, %f64, %f71, %f70;\n" +" mov.f32 %f9, %f72;\n" +" .loc 16 322 0\n" +" mov.f32 %f73, %f11;\n" +" fma.rn.ftz.f32 %f74, %f64, %f45, %f73;\n" +" mov.f32 %f11, %f74;\n" +" .loc 16 323 0\n" +" mov.f32 %f75, %f13;\n" +" mul.ftz.f32 %f76, %f44, %f44;\n" +" fma.rn.ftz.f32 %f77, %f64, %f76, %f75;\n" +" mov.f32 %f13, %f77;\n" +" .loc 16 324 0\n" +" mov.f32 %f78, %f15;\n" +" mul.ftz.f32 %f79, %f42, %f43;\n" +" fma.rn.ftz.f32 %f80, %f64, %f79, %f78;\n" +" mov.f32 %f15, %f80;\n" +" .loc 16 325 0\n" +" mov.f32 %f81, %f17;\n" +" mul.ftz.f32 %f82, %f43, %f44;\n" +" fma.rn.ftz.f32 %f83, %f64, %f82, %f81;\n" +" mov.f32 %f17, %f83;\n" +" .loc 16 326 0\n" +" mul.ftz.f32 %f84, %f42, %f44;\n" +" fma.rn.ftz.f32 %f18, %f64, %f84, %f18;\n" +" mov.f32 %f19, %f18;\n" +"$Lt_1_25602:\n" +"$Lt_1_24578:\n" +" .loc 16 290 0\n" +" mul.lo.u64 %rd49, %rd40, 4;\n" +" add.u64 %rd32, %rd32, %rd49;\n" +" setp.lt.u64 %p10, %rd32, %rd29;\n" +" @%p10 bra $Lt_1_24322;\n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_32002:\n" +" mov.f32 %f29, 0f00000000; \n" +" mov.f32 %f30, 0f00000000; \n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +" bra.uni $Lt_1_22786;\n" +"$Lt_1_23042:\n" +" mov.f32 %f29, 0f00000000; \n" +" mov.f32 %f30, 0f00000000; \n" +" mov.f32 %f31, 0f00000000; \n" +" mov.f32 %f32, 0f00000000; \n" +"$Lt_1_22786:\n" +" mov.u32 %r48, 1;\n" +" setp.le.s32 %p11, %r6, %r48;\n" +" @%p11 bra $Lt_1_28418;\n" +" .loc 16 337 0\n" +" mov.u64 %rd50, __cuda___cuda_local_var_32738_35_non_const_red_acc6200;\n" +" cvt.s64.s32 %rd51, %r1;\n" +" mul.wide.s32 %rd52, %r1, 4;\n" +" add.u64 %rd53, %rd50, %rd52;\n" +" mov.f32 %f85, %f31;\n" +" st.shared.f32 [%rd53+0], %f85;\n" +" .loc 16 338 0\n" +" mov.f32 %f86, %f30;\n" +" st.shared.f32 [%rd53+512], %f86;\n" +" .loc 16 339 0\n" +" mov.f32 %f87, %f29;\n" +" st.shared.f32 [%rd53+1024], %f87;\n" +" .loc 16 340 0\n" +" mov.f32 %f88, %f32;\n" +" st.shared.f32 [%rd53+1536], %f88;\n" +" .loc 16 342 0\n" +" shr.s32 %r49, %r6, 31;\n" +" mov.s32 %r50, 1;\n" +" and.b32 %r51, %r49, %r50;\n" +" add.s32 %r52, %r51, %r6;\n" +" shr.s32 %r53, %r52, 1;\n" +" mov.s32 %r54, %r53;\n" +" mov.u32 %r55, 0;\n" +" setp.ne.u32 %p12, %r53, 
%r55;\n" +" @!%p12 bra $Lt_1_26882;\n" +"$Lt_1_27394:\n" +" setp.ge.u32 %p13, %r10, %r54;\n" +" @%p13 bra $Lt_1_27650;\n" +" .loc 16 345 0\n" +" add.u32 %r56, %r1, %r54;\n" +" cvt.u64.u32 %rd54, %r56;\n" +" mul.wide.u32 %rd55, %r56, 4;\n" +" add.u64 %rd56, %rd50, %rd55;\n" +" ld.shared.f32 %f89, [%rd56+0];\n" +" add.ftz.f32 %f85, %f89, %f85;\n" +" st.shared.f32 [%rd53+0], %f85;\n" +" ld.shared.f32 %f90, [%rd56+512];\n" +" add.ftz.f32 %f86, %f90, %f86;\n" +" st.shared.f32 [%rd53+512], %f86;\n" +" ld.shared.f32 %f91, [%rd56+1024];\n" +" add.ftz.f32 %f87, %f91, %f87;\n" +" st.shared.f32 [%rd53+1024], %f87;\n" +" ld.shared.f32 %f92, [%rd56+1536];\n" +" add.ftz.f32 %f88, %f92, %f88;\n" +" st.shared.f32 [%rd53+1536], %f88;\n" +"$Lt_1_27650:\n" +" .loc 16 342 0\n" +" shr.u32 %r54, %r54, 1;\n" +" mov.u32 %r57, 0;\n" +" setp.ne.u32 %p14, %r54, %r57;\n" +" @%p14 bra $Lt_1_27394;\n" +"$Lt_1_26882:\n" +" .loc 16 349 0\n" +" mov.f32 %f31, %f85;\n" +" .loc 16 350 0\n" +" mov.f32 %f30, %f86;\n" +" .loc 16 351 0\n" +" mov.f32 %f29, %f87;\n" +" .loc 16 352 0\n" +" mov.f32 %f32, %f88;\n" +" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r59, 0;\n" +" setp.le.s32 %p15, %r58, %r59;\n" +" @%p15 bra $Lt_1_28418;\n" +" .loc 16 356 0\n" +" mov.f32 %f85, %f9;\n" +" st.shared.f32 [%rd53+0], %f85;\n" +" mov.f32 %f86, %f11;\n" +" st.shared.f32 [%rd53+512], %f86;\n" +" mov.f32 %f87, %f13;\n" +" st.shared.f32 [%rd53+1024], %f87;\n" +" mov.f32 %f88, %f15;\n" +" st.shared.f32 [%rd53+1536], %f88;\n" +" mov.f32 %f93, %f17;\n" +" st.shared.f32 [%rd53+2048], %f93;\n" +" mov.f32 %f94, %f19;\n" +" st.shared.f32 [%rd53+2560], %f94;\n" +" .loc 16 358 0\n" +" mov.s32 %r60, %r53;\n" +" @!%p12 bra $Lt_1_28930;\n" +"$Lt_1_29442:\n" +" setp.ge.u32 %p16, %r10, %r60;\n" +" @%p16 bra $Lt_1_29698;\n" +" .loc 16 361 0\n" +" add.u32 %r61, %r1, %r60;\n" +" cvt.u64.u32 %rd57, %r61;\n" +" mul.wide.u32 %rd58, %r61, 4;\n" +" add.u64 %rd59, %rd50, %rd58;\n" +" ld.shared.f32 %f95, [%rd59+0];\n" +" add.ftz.f32 %f85, %f95, %f85;\n" +" st.shared.f32 [%rd53+0], %f85;\n" +" ld.shared.f32 %f96, [%rd59+512];\n" +" add.ftz.f32 %f86, %f96, %f86;\n" +" st.shared.f32 [%rd53+512], %f86;\n" +" ld.shared.f32 %f97, [%rd59+1024];\n" +" add.ftz.f32 %f87, %f97, %f87;\n" +" st.shared.f32 [%rd53+1024], %f87;\n" +" ld.shared.f32 %f98, [%rd59+1536];\n" +" add.ftz.f32 %f88, %f98, %f88;\n" +" st.shared.f32 [%rd53+1536], %f88;\n" +" ld.shared.f32 %f99, [%rd59+2048];\n" +" add.ftz.f32 %f93, %f99, %f93;\n" +" st.shared.f32 [%rd53+2048], %f93;\n" +" ld.shared.f32 %f100, [%rd59+2560];\n" +" add.ftz.f32 %f94, %f100, %f94;\n" +" st.shared.f32 [%rd53+2560], %f94;\n" +"$Lt_1_29698:\n" +" .loc 16 358 0\n" +" shr.u32 %r60, %r60, 1;\n" +" mov.u32 %r62, 0;\n" +" setp.ne.u32 %p17, %r60, %r62;\n" +" @%p17 bra $Lt_1_29442;\n" +"$Lt_1_28930:\n" +" .loc 16 366 0\n" +" mov.f32 %f9, %f85;\n" +" mov.f32 %f11, %f86;\n" +" mov.f32 %f13, %f87;\n" +" mov.f32 %f15, %f88;\n" +" mov.f32 %f17, %f93;\n" +" mov.f32 %f19, %f94;\n" +"$Lt_1_28418:\n" +"$Lt_1_26370:\n" +" selp.s32 %r63, 1, 0, %p4;\n" +" mov.s32 %r64, 0;\n" +" set.eq.u32.s32 %r65, %r10, %r64;\n" +" neg.s32 %r66, %r65;\n" +" and.b32 %r67, %r63, %r66;\n" +" mov.u32 %r68, 0;\n" +" setp.eq.s32 %p18, %r67, %r68;\n" +" @%p18 bra $Lt_1_30466;\n" +" .loc 16 372 0\n" +" cvt.s64.s32 %rd60, %r13;\n" +" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast_engv];\n" +" mul.wide.s32 %rd62, %r13, 4;\n" +" add.u64 %rd63, %rd61, %rd62;\n" +" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" +" mov.u32 %r70, 0;\n" +" 
setp.le.s32 %p19, %r69, %r70;\n" +" @%p19 bra $Lt_1_30978;\n" +" .loc 16 374 0\n" +" st.global.f32 [%rd63+0], %f32;\n" +" .loc 16 375 0\n" +" cvt.s64.s32 %rd64, %r14;\n" +" mul.wide.s32 %rd65, %r14, 4;\n" +" add.u64 %rd63, %rd63, %rd65;\n" +"$Lt_1_30978:\n" +" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" +" mov.u32 %r72, 0;\n" +" setp.le.s32 %p20, %r71, %r72;\n" +" @%p20 bra $Lt_1_31490;\n" +" .loc 16 379 0\n" +" mov.f32 %f101, %f9;\n" +" st.global.f32 [%rd63+0], %f101;\n" +" .loc 16 380 0\n" +" cvt.s64.s32 %rd66, %r14;\n" +" mul.wide.s32 %rd67, %r14, 4;\n" +" add.u64 %rd68, %rd67, %rd63;\n" +" .loc 16 379 0\n" +" mov.f32 %f102, %f11;\n" +" st.global.f32 [%rd68+0], %f102;\n" +" .loc 16 380 0\n" +" add.u64 %rd69, %rd67, %rd68;\n" +" .loc 16 379 0\n" +" mov.f32 %f103, %f13;\n" +" st.global.f32 [%rd69+0], %f103;\n" +" .loc 16 380 0\n" +" add.u64 %rd70, %rd67, %rd69;\n" +" .loc 16 379 0\n" +" mov.f32 %f104, %f15;\n" +" st.global.f32 [%rd70+0], %f104;\n" +" .loc 16 380 0\n" +" add.u64 %rd63, %rd67, %rd70;\n" +" .loc 16 379 0\n" +" mov.f32 %f105, %f17;\n" +" st.global.f32 [%rd63+0], %f105;\n" +" mov.f32 %f106, %f19;\n" +" add.u64 %rd71, %rd67, %rd63;\n" +" st.global.f32 [%rd71+0], %f106;\n" +"$Lt_1_31490:\n" +" .loc 16 383 0\n" +" ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans];\n" +" mul.lo.u64 %rd73, %rd60, 16;\n" +" add.u64 %rd74, %rd72, %rd73;\n" +" mov.f32 %f107, %f108;\n" +" st.global.v4.f32 [%rd74+0], {%f31,%f30,%f29,%f107};\n" +"$Lt_1_30466:\n" +" .loc 16 385 0\n" +" exit;\n" +"$LDWend_kernel_pair_fast:\n" +" }\n" +; diff --git a/lib/gpu/pair_gpu_atom_kernel.ptx b/lib/gpu/pair_gpu_atom_kernel.ptx new file mode 100644 index 000000000..4132c0543 --- /dev/null +++ b/lib/gpu/pair_gpu_atom_kernel.ptx @@ -0,0 +1,101 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bafa_00000000-9_pair_gpu_atom_kernel.cpp3.i (/home/sjplimp/ccBI#.kAZxYr) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bafa_00000000-8_pair_gpu_atom_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "pair_gpu_atom_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 
"/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + + .entry kernel_cast_x ( + .param .u64 __cudaparm_kernel_cast_x_x_type, + .param .u64 __cudaparm_kernel_cast_x_x, + .param .u64 __cudaparm_kernel_cast_x_type, + .param .s32 __cudaparm_kernel_cast_x_nall) + { + .reg .u32 %r<10>; + .reg .u64 %rd<13>; + .reg .f32 %f<6>; + .reg .f64 %fd<5>; + .reg .pred %p<3>; + .loc 16 34 0 +$LDWbegin_kernel_cast_x: + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mul.lo.u32 %r3, %r1, %r2; + mov.u32 %r4, %tid.x; + add.u32 %r5, %r4, %r3; + ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall]; + setp.le.s32 %p1, %r6, %r5; + @%p1 bra $Lt_0_1026; + .loc 16 39 0 + cvt.s64.s32 %rd1, %r5; + ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type]; + mul.wide.s32 %rd3, %r5, 4; + add.u64 %rd4, %rd2, %rd3; + ld.global.s32 %r7, [%rd4+0]; + cvt.rn.f32.s32 %f1, %r7; + .loc 16 42 0 + ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x]; + mul.lo.s32 %r8, %r5, 3; + cvt.s64.s32 %rd6, %r8; + mul.wide.s32 %rd7, %r8, 8; + add.u64 %rd8, %rd5, %rd7; + ld.global.f64 %fd1, [%rd8+8]; + cvt.rn.ftz.f32.f64 %f2, %fd1; + .loc 16 43 0 + ld.global.f64 %fd2, [%rd8+16]; + cvt.rn.ftz.f32.f64 %f3, %fd2; + .loc 16 44 0 + ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type]; + mul.wide.s32 %rd10, %r5, 16; + add.u64 %rd11, %rd9, %rd10; + ld.global.f64 %fd3, [%rd8+0]; + cvt.rn.ftz.f32.f64 %f4, %fd3; + st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1}; +$Lt_0_1026: + .loc 16 46 0 + exit; +$LDWend_kernel_cast_x: + } // kernel_cast_x + diff --git a/lib/gpu/pair_gpu_atom_ptx.h b/lib/gpu/pair_gpu_atom_ptx.h new file mode 100644 index 000000000..5a41225d7 --- /dev/null +++ b/lib/gpu/pair_gpu_atom_ptx.h @@ -0,0 +1,56 @@ +const char * pair_gpu_atom_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .entry kernel_cast_x (\n" +" .param .u64 __cudaparm_kernel_cast_x_x_type,\n" +" .param .u64 __cudaparm_kernel_cast_x_x,\n" +" .param .u64 __cudaparm_kernel_cast_x_type,\n" +" .param .s32 __cudaparm_kernel_cast_x_nall)\n" +" {\n" +" .reg .u32 %r<10>;\n" +" .reg .u64 %rd<13>;\n" +" .reg .f32 %f<6>;\n" +" .reg .f64 %fd<5>;\n" +" .reg .pred %p<3>;\n" +" .loc 16 34 0\n" +"$LDWbegin_kernel_cast_x:\n" +" mov.u32 %r1, %ctaid.x;\n" +" mov.u32 %r2, %ntid.x;\n" +" mul.lo.u32 %r3, %r1, %r2;\n" +" mov.u32 %r4, %tid.x;\n" +" add.u32 %r5, %r4, %r3;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall];\n" +" setp.le.s32 %p1, %r6, %r5;\n" +" @%p1 bra $Lt_0_1026;\n" +" .loc 16 39 0\n" +" cvt.s64.s32 %rd1, %r5;\n" +" ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type];\n" +" mul.wide.s32 %rd3, %r5, 4;\n" +" add.u64 %rd4, %rd2, %rd3;\n" +" ld.global.s32 %r7, [%rd4+0];\n" +" cvt.rn.f32.s32 %f1, %r7;\n" +" .loc 16 42 0\n" +" ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x];\n" +" mul.lo.s32 %r8, %r5, 3;\n" +" cvt.s64.s32 %rd6, %r8;\n" +" mul.wide.s32 %rd7, %r8, 8;\n" +" add.u64 %rd8, %rd5, %rd7;\n" +" ld.global.f64 %fd1, [%rd8+8];\n" +" cvt.rn.ftz.f32.f64 %f2, %fd1;\n" +" .loc 16 43 0\n" +" ld.global.f64 %fd2, [%rd8+16];\n" +" cvt.rn.ftz.f32.f64 %f3, %fd2;\n" +" .loc 16 44 0\n" +" ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type];\n" +" mul.wide.s32 
%rd10, %r5, 16;\n" +" add.u64 %rd11, %rd9, %rd10;\n" +" ld.global.f64 %fd3, [%rd8+0];\n" +" cvt.rn.ftz.f32.f64 %f4, %fd3;\n" +" st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1};\n" +"$Lt_0_1026:\n" +" .loc 16 46 0\n" +" exit;\n" +"$LDWend_kernel_cast_x:\n" +" }\n" +; diff --git a/lib/gpu/pair_gpu_build_kernel.ptx b/lib/gpu/pair_gpu_build_kernel.ptx new file mode 100644 index 000000000..d31109539 --- /dev/null +++ b/lib/gpu/pair_gpu_build_kernel.ptx @@ -0,0 +1,833 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bb79_00000000-9_pair_gpu_build_kernel.cpp3.i (/home/sjplimp/ccBI#.mdgTku) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bb79_00000000-8_pair_gpu_build_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "pair_gpu_build_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + + .entry transpose ( + .param .u64 __cudaparm_transpose_out, + .param .u64 __cudaparm_transpose_in, + .param .s32 __cudaparm_transpose_columns_in, + .param .s32 __cudaparm_transpose_rows_in) + { + .reg .u32 %r<32>; + .reg .u64 %rd<23>; + .reg .f32 %f<4>; + .reg .pred %p<4>; + .shared .align 4 .b8 __cuda___cuda_local_var_32483_32_non_const_block24[288]; + .loc 16 64 0 +$LDWbegin_transpose: + mov.u32 %r1, %ctaid.x; + mul.lo.u32 %r2, %r1, 8; + mov.u32 %r3, %ctaid.y; + mul.lo.u32 %r4, %r3, 8; + mov.u32 %r5, %tid.x; + add.u32 %r6, %r2, %r5; + mov.u32 %r7, %tid.y; + add.u32 %r8, %r4, %r7; + ld.param.s32 %r9, [__cudaparm_transpose_rows_in]; + ld.param.s32 %r10, [__cudaparm_transpose_columns_in]; + set.gt.u32.u32 %r11, %r9, %r8; + neg.s32 %r12, %r11; + set.gt.u32.u32 %r13, %r10, %r6; + neg.s32 %r14, %r13; + and.b32 %r15, 
%r12, %r14; + mov.u32 %r16, 0; + setp.eq.s32 %p1, %r15, %r16; + @%p1 bra $Lt_0_2306; + .loc 16 76 0 + mov.u64 %rd1, __cuda___cuda_local_var_32483_32_non_const_block24; + ld.param.u64 %rd2, [__cudaparm_transpose_in]; + mul.lo.u32 %r17, %r10, %r8; + add.u32 %r18, %r6, %r17; + cvt.u64.u32 %rd3, %r18; + mul.wide.u32 %rd4, %r18, 4; + add.u64 %rd5, %rd2, %rd4; + ld.global.s32 %r19, [%rd5+0]; + cvt.rn.f32.s32 %f1, %r19; + cvt.u64.u32 %rd6, %r5; + cvt.u64.u32 %rd7, %r7; + mul.wide.u32 %rd8, %r7, 9; + add.u64 %rd9, %rd6, %rd8; + mul.lo.u64 %rd10, %rd9, 4; + add.u64 %rd11, %rd1, %rd10; + st.shared.f32 [%rd11+0], %f1; +$Lt_0_2306: + mov.u64 %rd1, __cuda___cuda_local_var_32483_32_non_const_block24; + .loc 16 78 0 + bar.sync 0; + add.u32 %r20, %r2, %r7; + add.u32 %r21, %r4, %r5; + set.gt.u32.u32 %r22, %r9, %r21; + neg.s32 %r23, %r22; + set.gt.u32.u32 %r24, %r10, %r20; + neg.s32 %r25, %r24; + and.b32 %r26, %r23, %r25; + mov.u32 %r27, 0; + setp.eq.s32 %p2, %r26, %r27; + @%p2 bra $Lt_0_2818; + .loc 16 83 0 + cvt.u64.u32 %rd12, %r7; + cvt.u64.u32 %rd13, %r5; + mul.wide.u32 %rd14, %r5, 9; + add.u64 %rd15, %rd12, %rd14; + mul.lo.u64 %rd16, %rd15, 4; + add.u64 %rd17, %rd1, %rd16; + ld.shared.f32 %f2, [%rd17+0]; + cvt.rzi.ftz.s32.f32 %r28, %f2; + ld.param.u64 %rd18, [__cudaparm_transpose_out]; + mul.lo.u32 %r29, %r9, %r20; + add.u32 %r30, %r21, %r29; + cvt.u64.u32 %rd19, %r30; + mul.wide.u32 %rd20, %r30, 4; + add.u64 %rd21, %rd18, %rd20; + st.global.s32 [%rd21+0], %r28; +$Lt_0_2818: + .loc 16 84 0 + exit; +$LDWend_transpose: + } // transpose + .global .texref neigh_tex; + + .entry calc_cell_id ( + .param .u64 __cudaparm_calc_cell_id_pos, + .param .u64 __cudaparm_calc_cell_id_cell_id, + .param .u64 __cudaparm_calc_cell_id_particle_id, + .param .f32 __cudaparm_calc_cell_id_boxlo0, + .param .f32 __cudaparm_calc_cell_id_boxlo1, + .param .f32 __cudaparm_calc_cell_id_boxlo2, + .param .f32 __cudaparm_calc_cell_id_boxhi0, + .param .f32 __cudaparm_calc_cell_id_boxhi1, + .param .f32 __cudaparm_calc_cell_id_boxhi2, + .param .f32 __cudaparm_calc_cell_id_cell_size, + .param .s32 __cudaparm_calc_cell_id_ncellx, + .param .s32 __cudaparm_calc_cell_id_ncelly, + .param .s32 __cudaparm_calc_cell_id_nall) + { + .reg .u32 %r<25>; + .reg .u64 %rd<8>; + .reg .f32 %f<35>; + .reg .f64 %fd<11>; + .reg .pred %p<3>; + .loc 16 90 0 +$LDWbegin_calc_cell_id: + mov.u32 %r1, %tid.x; + mov.u32 %r2, %ctaid.x; + mov.u32 %r3, %ntid.x; + mul.lo.u32 %r4, %r2, %r3; + add.u32 %r5, %r1, %r4; + ld.param.s32 %r6, [__cudaparm_calc_cell_id_nall]; + setp.le.s32 %p1, %r6, %r5; + @%p1 bra $Lt_1_1026; + .loc 16 94 0 + mov.u32 %r7, %r5; + mov.s32 %r8, 0; + mov.u32 %r9, %r8; + mov.s32 %r10, 0; + mov.u32 %r11, %r10; + mov.s32 %r12, 0; + mov.u32 %r13, %r12; + tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}]; + mov.f32 %f5, %f1; + mov.f32 %f6, %f2; + mov.f32 %f7, %f3; + .loc 16 107 0 + ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size]; + neg.ftz.f32 %f9, %f8; + ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0]; + ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2]; + ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1]; + ld.param.s32 %r14, [__cudaparm_calc_cell_id_ncellx]; + ld.param.s32 %r15, [__cudaparm_calc_cell_id_ncelly]; + ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2]; + sub.ftz.f32 %f14, %f13, %f11; + add.ftz.f32 %f15, %f8, %f14; + sub.ftz.f32 %f16, %f7, %f11; + max.ftz.f32 %f17, %f9, %f16; + min.ftz.f32 %f18, %f15, %f17; + div.approx.ftz.f32 %f19, %f18, %f8; + cvt.ftz.f64.f32 %fd1, %f19; + mov.f64 %fd2, 0d3ff0000000000000; // 
1 + add.f64 %fd3, %fd1, %fd2; + cvt.rzi.u32.f64 %r16, %fd3; + mul.lo.u32 %r17, %r14, %r16; + mul.lo.u32 %r18, %r15, %r17; + ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1]; + sub.ftz.f32 %f21, %f20, %f12; + add.ftz.f32 %f22, %f8, %f21; + sub.ftz.f32 %f23, %f6, %f12; + max.ftz.f32 %f24, %f9, %f23; + min.ftz.f32 %f25, %f22, %f24; + div.approx.ftz.f32 %f26, %f25, %f8; + cvt.ftz.f64.f32 %fd4, %f26; + mov.f64 %fd5, 0d3ff0000000000000; // 1 + add.f64 %fd6, %fd4, %fd5; + cvt.rzi.u32.f64 %r19, %fd6; + mul.lo.u32 %r20, %r14, %r19; + add.u32 %r21, %r18, %r20; + ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0]; + sub.ftz.f32 %f28, %f27, %f10; + add.ftz.f32 %f29, %f8, %f28; + sub.ftz.f32 %f30, %f5, %f10; + max.ftz.f32 %f31, %f9, %f30; + min.ftz.f32 %f32, %f29, %f31; + div.approx.ftz.f32 %f33, %f32, %f8; + cvt.ftz.f64.f32 %fd7, %f33; + mov.f64 %fd8, 0d3ff0000000000000; // 1 + add.f64 %fd9, %fd7, %fd8; + cvt.rzi.u32.f64 %r22, %fd9; + add.u32 %r23, %r21, %r22; + .loc 16 111 0 + cvt.s64.s32 %rd1, %r5; + mul.wide.s32 %rd2, %r5, 4; + ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id]; + add.u64 %rd4, %rd3, %rd2; + st.global.u32 [%rd4+0], %r23; + .loc 16 112 0 + ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id]; + add.u64 %rd6, %rd5, %rd2; + st.global.s32 [%rd6+0], %r5; +$Lt_1_1026: + .loc 16 114 0 + exit; +$LDWend_calc_cell_id: + } // calc_cell_id + + .entry kernel_calc_cell_counts ( + .param .u64 __cudaparm_kernel_calc_cell_counts_cell_id, + .param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts, + .param .s32 __cudaparm_kernel_calc_cell_counts_nall, + .param .s32 __cudaparm_kernel_calc_cell_counts_ncell) + { + .reg .u32 %r<33>; + .reg .u64 %rd<15>; + .reg .pred %p<13>; + .loc 16 117 0 +$LDWbegin_kernel_calc_cell_counts: + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mul.lo.u32 %r3, %r1, %r2; + mov.u32 %r4, %tid.x; + add.u32 %r5, %r4, %r3; + ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall]; + setp.gt.s32 %p1, %r6, %r5; + @!%p1 bra $Lt_2_7426; + .loc 16 120 0 + ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id]; + cvt.s64.s32 %rd2, %r5; + mul.wide.s32 %rd3, %r5, 4; + add.u64 %rd4, %rd1, %rd3; + ld.global.u32 %r7, [%rd4+0]; + mov.u32 %r8, 0; + setp.ne.s32 %p2, %r5, %r8; + @%p2 bra $Lt_2_7938; + add.s32 %r9, %r7, 1; + mov.u32 %r10, 0; + setp.le.s32 %p3, %r9, %r10; + @%p3 bra $Lt_2_8450; + mov.s32 %r11, %r9; + ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts]; + mov.s32 %r12, 0; + mov.s32 %r13, %r11; +$Lt_2_8962: + //<loop> Loop body line 120, nesting depth: 1, estimated iterations: unknown + .loc 16 125 0 + mov.s32 %r14, 0; + st.global.s32 [%rd5+0], %r14; + add.s32 %r12, %r12, 1; + add.u64 %rd5, %rd5, 4; + setp.ne.s32 %p4, %r9, %r12; + @%p4 bra $Lt_2_8962; +$Lt_2_8450: +$Lt_2_7938: + sub.s32 %r15, %r6, 1; + setp.ne.s32 %p5, %r5, %r15; + @%p5 bra $Lt_2_9474; + .loc 16 128 0 + add.s32 %r9, %r7, 1; + mov.s32 %r16, %r9; + ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell]; + setp.gt.s32 %p6, %r9, %r17; + @%p6 bra $Lt_2_9986; + sub.s32 %r18, %r17, %r7; + add.s32 %r19, %r17, 1; + ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts]; + cvt.s64.s32 %rd7, %r9; + mul.wide.s32 %rd8, %r9, 4; + add.u64 %rd9, %rd6, %rd8; + mov.s32 %r20, %r18; +$Lt_2_10498: + //<loop> Loop body line 128, nesting depth: 1, estimated iterations: unknown + .loc 16 129 0 + st.global.s32 [%rd9+0], %r6; + add.s32 %r16, %r16, 1; + add.u64 %rd9, %rd9, 4; + setp.ne.s32 %p7, %r19, %r16; + @%p7 bra $Lt_2_10498; +$Lt_2_9986: +$Lt_2_9474: + selp.s32 %r21, 1, 0, 
%p1; + mov.s32 %r22, 0; + set.gt.u32.s32 %r23, %r5, %r22; + neg.s32 %r24, %r23; + and.b32 %r25, %r21, %r24; + mov.u32 %r26, 0; + setp.eq.s32 %p8, %r25, %r26; + @%p8 bra $Lt_2_11010; + .loc 16 133 0 + ld.global.u32 %r27, [%rd4+-4]; + setp.eq.s32 %p9, %r7, %r27; + @%p9 bra $Lt_2_11522; + .loc 16 135 0 + add.s32 %r28, %r27, 1; + mov.s32 %r29, %r28; + setp.gt.s32 %p10, %r28, %r7; + @%p10 bra $Lt_2_12034; + sub.s32 %r30, %r7, %r27; + add.s32 %r9, %r7, 1; + ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts]; + cvt.s64.s32 %rd11, %r28; + mul.wide.s32 %rd12, %r28, 4; + add.u64 %rd13, %rd10, %rd12; + mov.s32 %r31, %r30; +$Lt_2_12546: + //<loop> Loop body line 135, nesting depth: 1, estimated iterations: unknown + .loc 16 136 0 + st.global.s32 [%rd13+0], %r5; + add.s32 %r29, %r29, 1; + add.u64 %rd13, %rd13, 4; + setp.ne.s32 %p11, %r9, %r29; + @%p11 bra $Lt_2_12546; +$Lt_2_12034: +$Lt_2_11522: +$Lt_2_11010: +$Lt_2_7426: + .loc 16 140 0 + exit; +$LDWend_kernel_calc_cell_counts: + } // kernel_calc_cell_counts + + .entry calc_neigh_list_cell ( + .param .u64 __cudaparm_calc_neigh_list_cell_pos, + .param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id, + .param .u64 __cudaparm_calc_neigh_list_cell_cell_counts, + .param .u64 __cudaparm_calc_neigh_list_cell_nbor_list, + .param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list, + .param .u64 __cudaparm_calc_neigh_list_cell_host_numj, + .param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size, + .param .f32 __cudaparm_calc_neigh_list_cell_cell_size, + .param .s32 __cudaparm_calc_neigh_list_cell_ncellx, + .param .s32 __cudaparm_calc_neigh_list_cell_ncelly, + .param .s32 __cudaparm_calc_neigh_list_cell_ncellz, + .param .s32 __cudaparm_calc_neigh_list_cell_inum, + .param .s32 __cudaparm_calc_neigh_list_cell_nt, + .param .s32 __cudaparm_calc_neigh_list_cell_nall) + { + .reg .u32 %r<106>; + .reg .u64 %rd<46>; + .reg .f32 %f<43>; + .reg .f64 %fd<4>; + .reg .pred %p<22>; + .shared .align 4 .b8 __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480[512]; + .shared .align 16 .b8 __cuda___cuda_local_var_32578_34_non_const_pos_sh992[2048]; + // __cuda_local_var_32592_12_non_const_atom_i = 16 + .loc 16 151 0 +$LDWbegin_calc_neigh_list_cell: + .loc 16 163 0 + ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly]; + mov.u32 %r2, %ctaid.y; + rem.u32 %r3, %r2, %r1; + div.u32 %r4, %r2, %r1; + ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx]; + mul.lo.s32 %r6, %r5, %r3; + mul.lo.s32 %r7, %r5, %r4; + mul.lo.s32 %r8, %r7, %r1; + cvt.s32.u32 %r9, %ctaid.x; + ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts]; + add.s32 %r10, %r6, %r8; + add.s32 %r11, %r9, %r10; + cvt.s64.s32 %rd2, %r11; + mul.wide.s32 %rd3, %r11, 4; + add.u64 %rd4, %rd1, %rd3; + ldu.global.s32 %r12, [%rd4+0]; + .loc 16 164 0 + ldu.global.s32 %r13, [%rd4+4]; + .loc 16 172 0 + sub.s32 %r14, %r13, %r12; + mov.u32 %r15, %ntid.x; + cvt.rn.f32.u32 %f1, %r15; + cvt.rn.f32.s32 %f2, %r14; + div.approx.ftz.f32 %f3, %f2, %f1; + cvt.rpi.ftz.f32.f32 %f4, %f3; + mov.f32 %f5, 0f00000000; // 0 + setp.gt.ftz.f32 %p1, %f4, %f5; + @!%p1 bra $Lt_3_13314; + sub.s32 %r16, %r3, 1; + mov.s32 %r17, 0; + max.s32 %r18, %r16, %r17; + sub.s32 %r19, %r1, 1; + add.s32 %r20, %r3, 1; + min.s32 %r21, %r19, %r20; + ld.param.s32 %r22, [__cudaparm_calc_neigh_list_cell_ncellz]; + sub.s32 %r23, %r22, 1; + add.s32 %r24, %r4, 1; + min.s32 %r25, %r23, %r24; + sub.s32 %r26, %r9, 1; + mov.s32 %r27, 0; + max.s32 %r28, %r26, %r27; + add.s32 %r29, %r9, 1; + sub.s32 %r30, %r5, 1; + min.s32 %r31, 
%r29, %r30; + cvt.s32.u32 %r32, %tid.x; + add.s32 %r33, %r12, %r32; + mov.u32 %r34, 0; + ld.param.s32 %r35, [__cudaparm_calc_neigh_list_cell_inum]; + cvt.s64.s32 %rd5, %r35; + sub.s32 %r36, %r4, 1; + mov.s32 %r37, %r33; + mul.wide.s32 %rd6, %r35, 4; + mov.s32 %r38, 0; + max.s32 %r39, %r36, %r38; + setp.ge.s32 %p2, %r25, %r39; + ld.param.s32 %r40, [__cudaparm_calc_neigh_list_cell_nt]; + ld.param.s32 %r41, [__cudaparm_calc_neigh_list_cell_nall]; + mov.s32 %r42, 0; + mov.u64 %rd7, __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480; + mov.u64 %rd8, __cuda___cuda_local_var_32578_34_non_const_pos_sh992; +$Lt_3_13826: + //<loop> Loop body line 172, nesting depth: 1, estimated iterations: unknown + .loc 16 174 0 + mov.s32 %r43, %r41; + setp.ge.s32 %p3, %r37, %r13; + @%p3 bra $Lt_3_14082; + .loc 16 180 0 + ld.param.u64 %rd9, [__cudaparm_calc_neigh_list_cell_cell_particle_id]; + add.u32 %r44, %r33, %r34; + cvt.s64.s32 %rd10, %r44; + mul.wide.s32 %rd11, %r44, 4; + add.u64 %rd12, %rd9, %rd11; + ld.global.s32 %r43, [%rd12+0]; +$Lt_3_14082: + setp.lt.s32 %p4, %r43, %r40; + @!%p4 bra $Lt_3_14594; + .loc 16 183 0 + mov.u32 %r45, %r43; + mov.s32 %r46, 0; + mov.u32 %r47, %r46; + mov.s32 %r48, 0; + mov.u32 %r49, %r48; + mov.s32 %r50, 0; + mov.u32 %r51, %r50; + tex.1d.v4.f32.s32 {%f6,%f7,%f8,%f9},[neigh_tex,{%r45,%r47,%r49,%r51}]; + mov.f32 %f10, %f6; + mov.f32 %f11, %f7; + mov.f32 %f12, %f8; + mov.f32 %f13, %f10; + mov.f32 %f14, %f11; + mov.f32 %f15, %f12; +$Lt_3_14594: + cvt.s64.s32 %rd13, %r43; + mul.wide.s32 %rd14, %r43, 4; + setp.ge.s32 %p5, %r43, %r35; + @%p5 bra $Lt_3_15362; + .loc 16 186 0 + mov.s32 %r52, %r35; + .loc 16 187 0 + ld.param.u64 %rd15, [__cudaparm_calc_neigh_list_cell_nbor_list]; + add.u64 %rd16, %rd13, %rd5; + mul.lo.u64 %rd17, %rd16, 4; + add.u64 %rd18, %rd15, %rd17; + mov.s64 %rd19, %rd18; + .loc 16 188 0 + add.u64 %rd20, %rd6, %rd18; + .loc 16 189 0 + add.u64 %rd21, %rd14, %rd15; + st.global.s32 [%rd21+0], %r43; + bra.uni $Lt_3_15106; +$Lt_3_15362: + .loc 16 192 0 + ld.param.u64 %rd22, [__cudaparm_calc_neigh_list_cell_host_numj]; + add.u64 %rd23, %rd22, %rd14; + sub.u64 %rd19, %rd23, %rd6; + .loc 16 193 0 + ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_nbor_list]; + ld.param.s32 %r53, [__cudaparm_calc_neigh_list_cell_neigh_bin_size]; + sub.s32 %r54, %r43, %r35; + mul.lo.s32 %r55, %r53, %r54; + cvt.s64.s32 %rd25, %r55; + mul.wide.s32 %rd26, %r55, 4; + add.u64 %rd20, %rd24, %rd26; + mov.s32 %r52, 1; +$Lt_3_15106: + .loc 16 198 0 + mov.s32 %r56, %r39; + @!%p2 bra $Lt_3_23298; + sub.s32 %r57, %r25, %r39; + add.s32 %r58, %r57, 1; + setp.le.s32 %p6, %r18, %r21; + add.s32 %r59, %r25, 1; + mov.s32 %r60, 0; + mov.s32 %r61, %r58; +$Lt_3_16130: + //<loop> Loop body line 198, nesting depth: 1, estimated iterations: unknown + .loc 16 199 0 + mov.s32 %r62, %r18; + @!%p6 bra $Lt_3_16386; + sub.s32 %r63, %r21, %r18; + add.s32 %r64, %r63, 1; + setp.ge.s32 %p7, %r31, %r28; + add.s32 %r65, %r21, 1; + mov.s32 %r66, %r64; +$Lt_3_16898: + //<loop> Loop body line 199, nesting depth: 2, estimated iterations: unknown + @!%p7 bra $Lt_3_17154; + sub.s32 %r67, %r31, %r28; + add.s32 %r68, %r67, 1; + mul.lo.s32 %r69, %r62, %r5; + mul.lo.s32 %r70, %r56, %r5; + mul.lo.s32 %r71, %r70, %r1; + add.s32 %r72, %r31, 1; + add.s32 %r73, %r69, %r71; + add.s32 %r74, %r73, %r28; + add.s32 %r75, %r72, %r73; + cvt.s64.s32 %rd27, %r74; + mul.wide.s32 %rd28, %r74, 4; + add.u64 %rd29, %rd1, %rd28; + mov.s32 %r76, %r68; +$Lt_3_17666: + //<loop> Loop body line 199, nesting depth: 3, estimated iterations: unknown + 
.loc 16 204 0 + ld.global.s32 %r77, [%rd29+0]; + .loc 16 205 0 + ld.global.s32 %r78, [%rd29+4]; + .loc 16 209 0 + sub.s32 %r79, %r78, %r77; + cvt.rn.f32.s32 %f16, %r79; + mov.f32 %f17, 0f43000000; // 128 + div.approx.ftz.f32 %f18, %f16, %f17; + cvt.rpi.ftz.f32.f32 %f19, %f18; + cvt.rzi.ftz.s32.f32 %r80, %f19; + mov.u32 %r81, 0; + setp.le.s32 %p8, %r80, %r81; + @%p8 bra $Lt_3_17922; + mov.s32 %r82, %r80; + mov.s32 %r83, 0; + setp.lt.s32 %p9, %r43, %r40; + mul.lo.s32 %r84, %r80, 128; + mov.s32 %r85, %r82; +$Lt_3_18434: + //<loop> Loop body line 209, nesting depth: 4, estimated iterations: unknown + sub.s32 %r86, %r79, %r83; + mov.s32 %r87, 128; + min.s32 %r88, %r86, %r87; + setp.le.s32 %p10, %r88, %r32; + @%p10 bra $Lt_3_18690; + .loc 16 215 0 + ld.param.u64 %rd30, [__cudaparm_calc_neigh_list_cell_cell_particle_id]; + add.s32 %r89, %r83, %r32; + add.s32 %r90, %r77, %r89; + cvt.s64.s32 %rd31, %r90; + mul.wide.s32 %rd32, %r90, 4; + add.u64 %rd33, %rd30, %rd32; + ld.global.s32 %r91, [%rd33+0]; + .loc 16 216 0 + cvt.s64.s32 %rd34, %r32; + mul.wide.s32 %rd35, %r32, 4; + add.u64 %rd36, %rd7, %rd35; + st.shared.s32 [%rd36+0], %r91; + .loc 16 217 0 + mov.u32 %r92, %r91; + mov.s32 %r93, 0; + mov.u32 %r94, %r93; + mov.s32 %r95, 0; + mov.u32 %r96, %r95; + mov.s32 %r97, 0; + mov.u32 %r98, %r97; + tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[neigh_tex,{%r92,%r94,%r96,%r98}]; + mov.f32 %f24, %f20; + mov.f32 %f25, %f21; + mov.f32 %f26, %f22; + .loc 16 218 0 + mul.lo.u64 %rd37, %rd34, 16; + add.u64 %rd38, %rd8, %rd37; + st.shared.v2.f32 [%rd38+0], {%f24,%f25}; + .loc 16 220 0 + st.shared.f32 [%rd38+8], %f26; +$Lt_3_18690: + .loc 16 222 0 + bar.sync 0; + @!%p9 bra $Lt_3_19714; + mov.u32 %r99, 0; + setp.le.s32 %p11, %r88, %r99; + @%p11 bra $Lt_3_19714; + mov.s32 %r100, %r88; + mov.s64 %rd39, 0; + ld.param.f32 %f27, [__cudaparm_calc_neigh_list_cell_cell_size]; + mul.ftz.f32 %f28, %f27, %f27; + mov.s64 %rd40, %rd8; + mov.f32 %f29, %f15; + mov.f32 %f30, %f14; + mov.f32 %f31, %f13; + mov.s32 %r101, 0; + mov.s32 %r102, %r100; +$Lt_3_20226: + //<loop> Loop body line 222, nesting depth: 5, estimated iterations: unknown + ld.shared.v4.f32 {%f32,%f33,%f34,_}, [%rd40+0]; + .loc 16 228 0 + sub.ftz.f32 %f35, %f31, %f32; + .loc 16 229 0 + sub.ftz.f32 %f36, %f30, %f33; + .loc 16 230 0 + sub.ftz.f32 %f37, %f29, %f34; + .loc 16 227 0 + mul.ftz.f32 %f38, %f36, %f36; + fma.rn.ftz.f32 %f39, %f35, %f35, %f38; + fma.rn.ftz.f32 %f40, %f37, %f37, %f39; + setp.gt.ftz.f32 %p12, %f28, %f40; + @!%p12 bra $Lt_3_24578; + cvt.ftz.f64.f32 %fd1, %f40; + mov.f64 %fd2, 0d3ee4f8b588e368f1; // 1e-05 + setp.gt.f64 %p13, %fd1, %fd2; + @!%p13 bra $Lt_3_24578; + ld.param.s32 %r103, [__cudaparm_calc_neigh_list_cell_neigh_bin_size]; + setp.le.s32 %p14, %r103, %r60; + @%p14 bra $Lt_3_20482; + .loc 16 235 0 + mul.lo.u64 %rd41, %rd39, 4; + add.u64 %rd42, %rd7, %rd41; + ld.shared.s32 %r104, [%rd42+0]; + st.global.s32 [%rd20+0], %r104; + .loc 16 236 0 + cvt.s64.s32 %rd43, %r52; + mul.wide.s32 %rd44, %r52, 4; + add.u64 %rd20, %rd20, %rd44; +$Lt_3_20482: + .loc 16 238 0 + add.s32 %r60, %r60, 1; +$Lt_3_24578: +$L_3_12802: + add.s32 %r101, %r101, 1; + add.s64 %rd39, %rd39, 1; + add.u64 %rd40, %rd40, 16; + setp.ne.s32 %p15, %r88, %r101; + @%p15 bra $Lt_3_20226; +$Lt_3_19714: +$Lt_3_19202: + .loc 16 242 0 + bar.sync 0; + add.s32 %r83, %r83, 128; + setp.ne.s32 %p16, %r83, %r84; + @%p16 bra $Lt_3_18434; +$Lt_3_17922: + add.s32 %r74, %r74, 1; + add.u64 %rd29, %rd29, 4; + setp.ne.s32 %p17, %r74, %r75; + @%p17 bra $Lt_3_17666; +$Lt_3_17154: + add.s32 %r62, %r62, 1; + 
setp.ne.s32 %p18, %r65, %r62; + @%p18 bra $Lt_3_16898; +$Lt_3_16386: + add.s32 %r56, %r56, 1; + setp.ne.s32 %p19, %r59, %r56; + @%p19 bra $Lt_3_16130; + bra.uni $Lt_3_15618; +$Lt_3_23298: + mov.s32 %r60, 0; +$Lt_3_15618: + @!%p4 bra $Lt_3_22274; + .loc 16 248 0 + st.global.s32 [%rd19+0], %r60; +$Lt_3_22274: + .loc 16 172 0 + add.s32 %r42, %r42, 1; + add.u32 %r34, %r34, %r15; + add.s32 %r37, %r37, %r15; + cvt.rn.f32.s32 %f41, %r42; + setp.lt.ftz.f32 %p20, %f41, %f4; + @%p20 bra $Lt_3_13826; +$Lt_3_13314: + .loc 16 250 0 + exit; +$LDWend_calc_neigh_list_cell: + } // calc_neigh_list_cell + + .entry kernel_special ( + .param .u64 __cudaparm_kernel_special_dev_nbor, + .param .u64 __cudaparm_kernel_special_host_nbor_list, + .param .u64 __cudaparm_kernel_special_host_numj, + .param .u64 __cudaparm_kernel_special_tag, + .param .u64 __cudaparm_kernel_special_nspecial, + .param .u64 __cudaparm_kernel_special_special, + .param .s32 __cudaparm_kernel_special_inum, + .param .s32 __cudaparm_kernel_special_nt, + .param .s32 __cudaparm_kernel_special_max_nbors) + { + .reg .u32 %r<34>; + .reg .u64 %rd<36>; + .reg .pred %p<11>; + .loc 16 256 0 +$LDWbegin_kernel_special: + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mul.lo.u32 %r3, %r1, %r2; + mov.u32 %r4, %tid.x; + add.u32 %r5, %r4, %r3; + ld.param.s32 %r6, [__cudaparm_kernel_special_nt]; + setp.le.s32 %p1, %r6, %r5; + @%p1 bra $Lt_4_6146; + .loc 16 264 0 + ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial]; + mul.lo.s32 %r7, %r5, 3; + cvt.s64.s32 %rd2, %r7; + mul.wide.s32 %rd3, %r7, 4; + add.u64 %rd4, %rd1, %rd3; + ld.global.s32 %r8, [%rd4+0]; + .loc 16 265 0 + ld.global.s32 %r9, [%rd4+4]; + .loc 16 266 0 + ld.global.s32 %r10, [%rd4+8]; + ld.param.s32 %r11, [__cudaparm_kernel_special_inum]; + setp.le.s32 %p2, %r11, %r5; + @%p2 bra $Lt_4_6914; + .loc 16 270 0 + mov.s32 %r12, %r11; + .loc 16 272 0 + cvt.s64.s32 %rd5, %r11; + ld.param.u64 %rd6, [__cudaparm_kernel_special_dev_nbor]; + cvt.s64.s32 %rd7, %r5; + add.u64 %rd8, %rd7, %rd5; + mul.lo.u64 %rd9, %rd8, 4; + add.u64 %rd10, %rd6, %rd9; + ld.global.s32 %r13, [%rd10+0]; + .loc 16 273 0 + mul.wide.s32 %rd11, %r11, 4; + add.u64 %rd12, %rd10, %rd11; + bra.uni $Lt_4_6658; +$Lt_4_6914: + .loc 16 276 0 + sub.s32 %r14, %r5, %r11; + ld.param.u64 %rd13, [__cudaparm_kernel_special_host_nbor_list]; + ld.param.s32 %r15, [__cudaparm_kernel_special_max_nbors]; + mul.lo.s32 %r16, %r15, %r14; + cvt.s64.s32 %rd14, %r16; + mul.wide.s32 %rd15, %r16, 4; + add.u64 %rd12, %rd13, %rd15; + .loc 16 277 0 + ld.param.u64 %rd16, [__cudaparm_kernel_special_host_numj]; + cvt.s64.s32 %rd17, %r14; + mul.wide.s32 %rd18, %r14, 4; + add.u64 %rd19, %rd16, %rd18; + ld.global.s32 %r13, [%rd19+0]; + mov.s32 %r12, 1; +$Lt_4_6658: + .loc 16 279 0 + mul.lo.s32 %r17, %r13, %r12; + cvt.s64.s32 %rd20, %r17; + mul.wide.s32 %rd21, %r17, 4; + add.u64 %rd22, %rd12, %rd21; + setp.le.u64 %p3, %rd22, %rd12; + @%p3 bra $Lt_4_7170; + mov.s32 %r18, 0; + setp.gt.s32 %p4, %r10, %r18; + cvt.s64.s32 %rd23, %r12; + ld.param.u64 %rd24, [__cudaparm_kernel_special_tag]; +$Lt_4_7682: + //<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown + .loc 16 282 0 + ld.global.s32 %r19, [%rd12+0]; + .loc 16 283 0 + cvt.s64.s32 %rd25, %r19; + mul.wide.s32 %rd26, %r19, 4; + add.u64 %rd27, %rd24, %rd26; + ld.global.s32 %r20, [%rd27+0]; + @!%p4 bra $Lt_4_7938; + mov.s32 %r21, %r10; + cvt.s64.s32 %rd28, %r5; + cvt.s64.s32 %rd29, %r6; + mul.wide.s32 %rd30, %r6, 4; + ld.param.u64 %rd31, [__cudaparm_kernel_special_special]; + mul.wide.s32 %rd32, %r5, 4; + 
add.u64 %rd33, %rd31, %rd32; + mov.s32 %r22, 0; + mov.s32 %r23, %r21; +$Lt_4_8450: + //<loop> Loop body line 283, nesting depth: 1, estimated iterations: unknown + ld.global.s32 %r24, [%rd33+0]; + setp.ne.s32 %p5, %r24, %r20; + @%p5 bra $Lt_4_8706; + .loc 16 293 0 + setp.le.s32 %p6, %r8, %r22; + mov.s32 %r25, 3; + mov.s32 %r26, 2; + selp.s32 %r27, %r25, %r26, %p6; + mov.s32 %r28, 2; + mov.s32 %r29, 1; + selp.s32 %r30, %r28, %r29, %p6; + setp.le.s32 %p7, %r9, %r22; + selp.s32 %r31, %r27, %r30, %p7; + shl.b32 %r32, %r31, 30; + xor.b32 %r19, %r19, %r32; + .loc 16 294 0 + st.global.s32 [%rd12+0], %r19; +$Lt_4_8706: + add.s32 %r22, %r22, 1; + add.u64 %rd33, %rd30, %rd33; + setp.ne.s32 %p8, %r10, %r22; + @%p8 bra $Lt_4_8450; +$Lt_4_7938: + .loc 16 281 0 + mul.lo.u64 %rd34, %rd23, 4; + add.u64 %rd12, %rd12, %rd34; + setp.gt.u64 %p9, %rd22, %rd12; + @%p9 bra $Lt_4_7682; +$Lt_4_7170: +$Lt_4_6146: + .loc 16 300 0 + exit; +$LDWend_kernel_special: + } // kernel_special + diff --git a/lib/gpu/pair_gpu_build_ptx.h b/lib/gpu/pair_gpu_build_ptx.h new file mode 100644 index 000000000..e806ab861 --- /dev/null +++ b/lib/gpu/pair_gpu_build_ptx.h @@ -0,0 +1,772 @@ +const char * pair_gpu_build_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .entry transpose (\n" +" .param .u64 __cudaparm_transpose_out,\n" +" .param .u64 __cudaparm_transpose_in,\n" +" .param .s32 __cudaparm_transpose_columns_in,\n" +" .param .s32 __cudaparm_transpose_rows_in)\n" +" {\n" +" .reg .u32 %r<32>;\n" +" .reg .u64 %rd<23>;\n" +" .reg .f32 %f<4>;\n" +" .reg .pred %p<4>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32483_32_non_const_block24[288];\n" +" .loc 16 64 0\n" +"$LDWbegin_transpose:\n" +" mov.u32 %r1, %ctaid.x;\n" +" mul.lo.u32 %r2, %r1, 8;\n" +" mov.u32 %r3, %ctaid.y;\n" +" mul.lo.u32 %r4, %r3, 8;\n" +" mov.u32 %r5, %tid.x;\n" +" add.u32 %r6, %r2, %r5;\n" +" mov.u32 %r7, %tid.y;\n" +" add.u32 %r8, %r4, %r7;\n" +" ld.param.s32 %r9, [__cudaparm_transpose_rows_in];\n" +" ld.param.s32 %r10, [__cudaparm_transpose_columns_in];\n" +" set.gt.u32.u32 %r11, %r9, %r8;\n" +" neg.s32 %r12, %r11;\n" +" set.gt.u32.u32 %r13, %r10, %r6;\n" +" neg.s32 %r14, %r13;\n" +" and.b32 %r15, %r12, %r14;\n" +" mov.u32 %r16, 0;\n" +" setp.eq.s32 %p1, %r15, %r16;\n" +" @%p1 bra $Lt_0_2306;\n" +" .loc 16 76 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32483_32_non_const_block24;\n" +" ld.param.u64 %rd2, [__cudaparm_transpose_in];\n" +" mul.lo.u32 %r17, %r10, %r8;\n" +" add.u32 %r18, %r6, %r17;\n" +" cvt.u64.u32 %rd3, %r18;\n" +" mul.wide.u32 %rd4, %r18, 4;\n" +" add.u64 %rd5, %rd2, %rd4;\n" +" ld.global.s32 %r19, [%rd5+0];\n" +" cvt.rn.f32.s32 %f1, %r19;\n" +" cvt.u64.u32 %rd6, %r5;\n" +" cvt.u64.u32 %rd7, %r7;\n" +" mul.wide.u32 %rd8, %r7, 9;\n" +" add.u64 %rd9, %rd6, %rd8;\n" +" mul.lo.u64 %rd10, %rd9, 4;\n" +" add.u64 %rd11, %rd1, %rd10;\n" +" st.shared.f32 [%rd11+0], %f1;\n" +"$Lt_0_2306:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32483_32_non_const_block24;\n" +" .loc 16 78 0\n" +" bar.sync 0;\n" +" add.u32 %r20, %r2, %r7;\n" +" add.u32 %r21, %r4, %r5;\n" +" set.gt.u32.u32 %r22, %r9, %r21;\n" +" neg.s32 %r23, %r22;\n" +" set.gt.u32.u32 %r24, %r10, %r20;\n" +" neg.s32 %r25, %r24;\n" +" and.b32 %r26, %r23, %r25;\n" +" mov.u32 %r27, 0;\n" +" setp.eq.s32 %p2, %r26, %r27;\n" +" @%p2 bra $Lt_0_2818;\n" +" .loc 16 83 0\n" +" cvt.u64.u32 %rd12, %r7;\n" +" cvt.u64.u32 %rd13, %r5;\n" +" mul.wide.u32 %rd14, %r5, 9;\n" +" add.u64 %rd15, %rd12, %rd14;\n" +" mul.lo.u64 %rd16, %rd15, 4;\n" +" add.u64 %rd17, %rd1, %rd16;\n" +" 
ld.shared.f32 %f2, [%rd17+0];\n" +" cvt.rzi.ftz.s32.f32 %r28, %f2;\n" +" ld.param.u64 %rd18, [__cudaparm_transpose_out];\n" +" mul.lo.u32 %r29, %r9, %r20;\n" +" add.u32 %r30, %r21, %r29;\n" +" cvt.u64.u32 %rd19, %r30;\n" +" mul.wide.u32 %rd20, %r30, 4;\n" +" add.u64 %rd21, %rd18, %rd20;\n" +" st.global.s32 [%rd21+0], %r28;\n" +"$Lt_0_2818:\n" +" .loc 16 84 0\n" +" exit;\n" +"$LDWend_transpose:\n" +" }\n" +" .global .texref neigh_tex;\n" +" .entry calc_cell_id (\n" +" .param .u64 __cudaparm_calc_cell_id_pos,\n" +" .param .u64 __cudaparm_calc_cell_id_cell_id,\n" +" .param .u64 __cudaparm_calc_cell_id_particle_id,\n" +" .param .f32 __cudaparm_calc_cell_id_boxlo0,\n" +" .param .f32 __cudaparm_calc_cell_id_boxlo1,\n" +" .param .f32 __cudaparm_calc_cell_id_boxlo2,\n" +" .param .f32 __cudaparm_calc_cell_id_boxhi0,\n" +" .param .f32 __cudaparm_calc_cell_id_boxhi1,\n" +" .param .f32 __cudaparm_calc_cell_id_boxhi2,\n" +" .param .f32 __cudaparm_calc_cell_id_cell_size,\n" +" .param .s32 __cudaparm_calc_cell_id_ncellx,\n" +" .param .s32 __cudaparm_calc_cell_id_ncelly,\n" +" .param .s32 __cudaparm_calc_cell_id_nall)\n" +" {\n" +" .reg .u32 %r<25>;\n" +" .reg .u64 %rd<8>;\n" +" .reg .f32 %f<35>;\n" +" .reg .f64 %fd<11>;\n" +" .reg .pred %p<3>;\n" +" .loc 16 90 0\n" +"$LDWbegin_calc_cell_id:\n" +" mov.u32 %r1, %tid.x;\n" +" mov.u32 %r2, %ctaid.x;\n" +" mov.u32 %r3, %ntid.x;\n" +" mul.lo.u32 %r4, %r2, %r3;\n" +" add.u32 %r5, %r1, %r4;\n" +" ld.param.s32 %r6, [__cudaparm_calc_cell_id_nall];\n" +" setp.le.s32 %p1, %r6, %r5;\n" +" @%p1 bra $Lt_1_1026;\n" +" .loc 16 94 0\n" +" mov.u32 %r7, %r5;\n" +" mov.s32 %r8, 0;\n" +" mov.u32 %r9, %r8;\n" +" mov.s32 %r10, 0;\n" +" mov.u32 %r11, %r10;\n" +" mov.s32 %r12, 0;\n" +" mov.u32 %r13, %r12;\n" +" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}];\n" +" mov.f32 %f5, %f1;\n" +" mov.f32 %f6, %f2;\n" +" mov.f32 %f7, %f3;\n" +" .loc 16 107 0\n" +" ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size];\n" +" neg.ftz.f32 %f9, %f8;\n" +" ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0];\n" +" ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2];\n" +" ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1];\n" +" ld.param.s32 %r14, [__cudaparm_calc_cell_id_ncellx];\n" +" ld.param.s32 %r15, [__cudaparm_calc_cell_id_ncelly];\n" +" ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2];\n" +" sub.ftz.f32 %f14, %f13, %f11;\n" +" add.ftz.f32 %f15, %f8, %f14;\n" +" sub.ftz.f32 %f16, %f7, %f11;\n" +" max.ftz.f32 %f17, %f9, %f16;\n" +" min.ftz.f32 %f18, %f15, %f17;\n" +" div.approx.ftz.f32 %f19, %f18, %f8;\n" +" cvt.ftz.f64.f32 %fd1, %f19;\n" +" mov.f64 %fd2, 0d3ff0000000000000; \n" +" add.f64 %fd3, %fd1, %fd2;\n" +" cvt.rzi.u32.f64 %r16, %fd3;\n" +" mul.lo.u32 %r17, %r14, %r16;\n" +" mul.lo.u32 %r18, %r15, %r17;\n" +" ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1];\n" +" sub.ftz.f32 %f21, %f20, %f12;\n" +" add.ftz.f32 %f22, %f8, %f21;\n" +" sub.ftz.f32 %f23, %f6, %f12;\n" +" max.ftz.f32 %f24, %f9, %f23;\n" +" min.ftz.f32 %f25, %f22, %f24;\n" +" div.approx.ftz.f32 %f26, %f25, %f8;\n" +" cvt.ftz.f64.f32 %fd4, %f26;\n" +" mov.f64 %fd5, 0d3ff0000000000000; \n" +" add.f64 %fd6, %fd4, %fd5;\n" +" cvt.rzi.u32.f64 %r19, %fd6;\n" +" mul.lo.u32 %r20, %r14, %r19;\n" +" add.u32 %r21, %r18, %r20;\n" +" ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0];\n" +" sub.ftz.f32 %f28, %f27, %f10;\n" +" add.ftz.f32 %f29, %f8, %f28;\n" +" sub.ftz.f32 %f30, %f5, %f10;\n" +" max.ftz.f32 %f31, %f9, %f30;\n" +" min.ftz.f32 %f32, %f29, %f31;\n" +" div.approx.ftz.f32 %f33, %f32, %f8;\n" +" 
cvt.ftz.f64.f32 %fd7, %f33;\n" +" mov.f64 %fd8, 0d3ff0000000000000; \n" +" add.f64 %fd9, %fd7, %fd8;\n" +" cvt.rzi.u32.f64 %r22, %fd9;\n" +" add.u32 %r23, %r21, %r22;\n" +" .loc 16 111 0\n" +" cvt.s64.s32 %rd1, %r5;\n" +" mul.wide.s32 %rd2, %r5, 4;\n" +" ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id];\n" +" add.u64 %rd4, %rd3, %rd2;\n" +" st.global.u32 [%rd4+0], %r23;\n" +" .loc 16 112 0\n" +" ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id];\n" +" add.u64 %rd6, %rd5, %rd2;\n" +" st.global.s32 [%rd6+0], %r5;\n" +"$Lt_1_1026:\n" +" .loc 16 114 0\n" +" exit;\n" +"$LDWend_calc_cell_id:\n" +" }\n" +" .entry kernel_calc_cell_counts (\n" +" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,\n" +" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,\n" +" .param .s32 __cudaparm_kernel_calc_cell_counts_nall,\n" +" .param .s32 __cudaparm_kernel_calc_cell_counts_ncell)\n" +" {\n" +" .reg .u32 %r<33>;\n" +" .reg .u64 %rd<15>;\n" +" .reg .pred %p<13>;\n" +" .loc 16 117 0\n" +"$LDWbegin_kernel_calc_cell_counts:\n" +" mov.u32 %r1, %ctaid.x;\n" +" mov.u32 %r2, %ntid.x;\n" +" mul.lo.u32 %r3, %r1, %r2;\n" +" mov.u32 %r4, %tid.x;\n" +" add.u32 %r5, %r4, %r3;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall];\n" +" setp.gt.s32 %p1, %r6, %r5;\n" +" @!%p1 bra $Lt_2_7426;\n" +" .loc 16 120 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];\n" +" cvt.s64.s32 %rd2, %r5;\n" +" mul.wide.s32 %rd3, %r5, 4;\n" +" add.u64 %rd4, %rd1, %rd3;\n" +" ld.global.u32 %r7, [%rd4+0];\n" +" mov.u32 %r8, 0;\n" +" setp.ne.s32 %p2, %r5, %r8;\n" +" @%p2 bra $Lt_2_7938;\n" +" add.s32 %r9, %r7, 1;\n" +" mov.u32 %r10, 0;\n" +" setp.le.s32 %p3, %r9, %r10;\n" +" @%p3 bra $Lt_2_8450;\n" +" mov.s32 %r11, %r9;\n" +" ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n" +" mov.s32 %r12, 0;\n" +" mov.s32 %r13, %r11;\n" +"$Lt_2_8962:\n" +" .loc 16 125 0\n" +" mov.s32 %r14, 0;\n" +" st.global.s32 [%rd5+0], %r14;\n" +" add.s32 %r12, %r12, 1;\n" +" add.u64 %rd5, %rd5, 4;\n" +" setp.ne.s32 %p4, %r9, %r12;\n" +" @%p4 bra $Lt_2_8962;\n" +"$Lt_2_8450:\n" +"$Lt_2_7938:\n" +" sub.s32 %r15, %r6, 1;\n" +" setp.ne.s32 %p5, %r5, %r15;\n" +" @%p5 bra $Lt_2_9474;\n" +" .loc 16 128 0\n" +" add.s32 %r9, %r7, 1;\n" +" mov.s32 %r16, %r9;\n" +" ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell];\n" +" setp.gt.s32 %p6, %r9, %r17;\n" +" @%p6 bra $Lt_2_9986;\n" +" sub.s32 %r18, %r17, %r7;\n" +" add.s32 %r19, %r17, 1;\n" +" ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n" +" cvt.s64.s32 %rd7, %r9;\n" +" mul.wide.s32 %rd8, %r9, 4;\n" +" add.u64 %rd9, %rd6, %rd8;\n" +" mov.s32 %r20, %r18;\n" +"$Lt_2_10498:\n" +" .loc 16 129 0\n" +" st.global.s32 [%rd9+0], %r6;\n" +" add.s32 %r16, %r16, 1;\n" +" add.u64 %rd9, %rd9, 4;\n" +" setp.ne.s32 %p7, %r19, %r16;\n" +" @%p7 bra $Lt_2_10498;\n" +"$Lt_2_9986:\n" +"$Lt_2_9474:\n" +" selp.s32 %r21, 1, 0, %p1;\n" +" mov.s32 %r22, 0;\n" +" set.gt.u32.s32 %r23, %r5, %r22;\n" +" neg.s32 %r24, %r23;\n" +" and.b32 %r25, %r21, %r24;\n" +" mov.u32 %r26, 0;\n" +" setp.eq.s32 %p8, %r25, %r26;\n" +" @%p8 bra $Lt_2_11010;\n" +" .loc 16 133 0\n" +" ld.global.u32 %r27, [%rd4+-4];\n" +" setp.eq.s32 %p9, %r7, %r27;\n" +" @%p9 bra $Lt_2_11522;\n" +" .loc 16 135 0\n" +" add.s32 %r28, %r27, 1;\n" +" mov.s32 %r29, %r28;\n" +" setp.gt.s32 %p10, %r28, %r7;\n" +" @%p10 bra $Lt_2_12034;\n" +" sub.s32 %r30, %r7, %r27;\n" +" add.s32 %r9, %r7, 1;\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n" +" cvt.s64.s32 %rd11, 
%r28;\n" +" mul.wide.s32 %rd12, %r28, 4;\n" +" add.u64 %rd13, %rd10, %rd12;\n" +" mov.s32 %r31, %r30;\n" +"$Lt_2_12546:\n" +" .loc 16 136 0\n" +" st.global.s32 [%rd13+0], %r5;\n" +" add.s32 %r29, %r29, 1;\n" +" add.u64 %rd13, %rd13, 4;\n" +" setp.ne.s32 %p11, %r9, %r29;\n" +" @%p11 bra $Lt_2_12546;\n" +"$Lt_2_12034:\n" +"$Lt_2_11522:\n" +"$Lt_2_11010:\n" +"$Lt_2_7426:\n" +" .loc 16 140 0\n" +" exit;\n" +"$LDWend_kernel_calc_cell_counts:\n" +" }\n" +" .entry calc_neigh_list_cell (\n" +" .param .u64 __cudaparm_calc_neigh_list_cell_pos,\n" +" .param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,\n" +" .param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,\n" +" .param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,\n" +" .param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,\n" +" .param .u64 __cudaparm_calc_neigh_list_cell_host_numj,\n" +" .param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,\n" +" .param .f32 __cudaparm_calc_neigh_list_cell_cell_size,\n" +" .param .s32 __cudaparm_calc_neigh_list_cell_ncellx,\n" +" .param .s32 __cudaparm_calc_neigh_list_cell_ncelly,\n" +" .param .s32 __cudaparm_calc_neigh_list_cell_ncellz,\n" +" .param .s32 __cudaparm_calc_neigh_list_cell_inum,\n" +" .param .s32 __cudaparm_calc_neigh_list_cell_nt,\n" +" .param .s32 __cudaparm_calc_neigh_list_cell_nall)\n" +" {\n" +" .reg .u32 %r<106>;\n" +" .reg .u64 %rd<46>;\n" +" .reg .f32 %f<43>;\n" +" .reg .f64 %fd<4>;\n" +" .reg .pred %p<22>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480[512];\n" +" .shared .align 16 .b8 __cuda___cuda_local_var_32578_34_non_const_pos_sh992[2048];\n" +" .loc 16 151 0\n" +"$LDWbegin_calc_neigh_list_cell:\n" +" .loc 16 163 0\n" +" ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];\n" +" mov.u32 %r2, %ctaid.y;\n" +" rem.u32 %r3, %r2, %r1;\n" +" div.u32 %r4, %r2, %r1;\n" +" ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];\n" +" mul.lo.s32 %r6, %r5, %r3;\n" +" mul.lo.s32 %r7, %r5, %r4;\n" +" mul.lo.s32 %r8, %r7, %r1;\n" +" cvt.s32.u32 %r9, %ctaid.x;\n" +" ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];\n" +" add.s32 %r10, %r6, %r8;\n" +" add.s32 %r11, %r9, %r10;\n" +" cvt.s64.s32 %rd2, %r11;\n" +" mul.wide.s32 %rd3, %r11, 4;\n" +" add.u64 %rd4, %rd1, %rd3;\n" +" ldu.global.s32 %r12, [%rd4+0];\n" +" .loc 16 164 0\n" +" ldu.global.s32 %r13, [%rd4+4];\n" +" .loc 16 172 0\n" +" sub.s32 %r14, %r13, %r12;\n" +" mov.u32 %r15, %ntid.x;\n" +" cvt.rn.f32.u32 %f1, %r15;\n" +" cvt.rn.f32.s32 %f2, %r14;\n" +" div.approx.ftz.f32 %f3, %f2, %f1;\n" +" cvt.rpi.ftz.f32.f32 %f4, %f3;\n" +" mov.f32 %f5, 0f00000000; \n" +" setp.gt.ftz.f32 %p1, %f4, %f5;\n" +" @!%p1 bra $Lt_3_13314;\n" +" sub.s32 %r16, %r3, 1;\n" +" mov.s32 %r17, 0;\n" +" max.s32 %r18, %r16, %r17;\n" +" sub.s32 %r19, %r1, 1;\n" +" add.s32 %r20, %r3, 1;\n" +" min.s32 %r21, %r19, %r20;\n" +" ld.param.s32 %r22, [__cudaparm_calc_neigh_list_cell_ncellz];\n" +" sub.s32 %r23, %r22, 1;\n" +" add.s32 %r24, %r4, 1;\n" +" min.s32 %r25, %r23, %r24;\n" +" sub.s32 %r26, %r9, 1;\n" +" mov.s32 %r27, 0;\n" +" max.s32 %r28, %r26, %r27;\n" +" add.s32 %r29, %r9, 1;\n" +" sub.s32 %r30, %r5, 1;\n" +" min.s32 %r31, %r29, %r30;\n" +" cvt.s32.u32 %r32, %tid.x;\n" +" add.s32 %r33, %r12, %r32;\n" +" mov.u32 %r34, 0;\n" +" ld.param.s32 %r35, [__cudaparm_calc_neigh_list_cell_inum];\n" +" cvt.s64.s32 %rd5, %r35;\n" +" sub.s32 %r36, %r4, 1;\n" +" mov.s32 %r37, %r33;\n" +" mul.wide.s32 %rd6, %r35, 4;\n" +" mov.s32 %r38, 0;\n" +" max.s32 %r39, %r36, %r38;\n" +" setp.ge.s32 %p2, 
%r25, %r39;\n" +" ld.param.s32 %r40, [__cudaparm_calc_neigh_list_cell_nt];\n" +" ld.param.s32 %r41, [__cudaparm_calc_neigh_list_cell_nall];\n" +" mov.s32 %r42, 0;\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32578_34_non_const_pos_sh992;\n" +"$Lt_3_13826:\n" +" .loc 16 174 0\n" +" mov.s32 %r43, %r41;\n" +" setp.ge.s32 %p3, %r37, %r13;\n" +" @%p3 bra $Lt_3_14082;\n" +" .loc 16 180 0\n" +" ld.param.u64 %rd9, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n" +" add.u32 %r44, %r33, %r34;\n" +" cvt.s64.s32 %rd10, %r44;\n" +" mul.wide.s32 %rd11, %r44, 4;\n" +" add.u64 %rd12, %rd9, %rd11;\n" +" ld.global.s32 %r43, [%rd12+0];\n" +"$Lt_3_14082:\n" +" setp.lt.s32 %p4, %r43, %r40;\n" +" @!%p4 bra $Lt_3_14594;\n" +" .loc 16 183 0\n" +" mov.u32 %r45, %r43;\n" +" mov.s32 %r46, 0;\n" +" mov.u32 %r47, %r46;\n" +" mov.s32 %r48, 0;\n" +" mov.u32 %r49, %r48;\n" +" mov.s32 %r50, 0;\n" +" mov.u32 %r51, %r50;\n" +" tex.1d.v4.f32.s32 {%f6,%f7,%f8,%f9},[neigh_tex,{%r45,%r47,%r49,%r51}];\n" +" mov.f32 %f10, %f6;\n" +" mov.f32 %f11, %f7;\n" +" mov.f32 %f12, %f8;\n" +" mov.f32 %f13, %f10;\n" +" mov.f32 %f14, %f11;\n" +" mov.f32 %f15, %f12;\n" +"$Lt_3_14594:\n" +" cvt.s64.s32 %rd13, %r43;\n" +" mul.wide.s32 %rd14, %r43, 4;\n" +" setp.ge.s32 %p5, %r43, %r35;\n" +" @%p5 bra $Lt_3_15362;\n" +" .loc 16 186 0\n" +" mov.s32 %r52, %r35;\n" +" .loc 16 187 0\n" +" ld.param.u64 %rd15, [__cudaparm_calc_neigh_list_cell_nbor_list];\n" +" add.u64 %rd16, %rd13, %rd5;\n" +" mul.lo.u64 %rd17, %rd16, 4;\n" +" add.u64 %rd18, %rd15, %rd17;\n" +" mov.s64 %rd19, %rd18;\n" +" .loc 16 188 0\n" +" add.u64 %rd20, %rd6, %rd18;\n" +" .loc 16 189 0\n" +" add.u64 %rd21, %rd14, %rd15;\n" +" st.global.s32 [%rd21+0], %r43;\n" +" bra.uni $Lt_3_15106;\n" +"$Lt_3_15362:\n" +" .loc 16 192 0\n" +" ld.param.u64 %rd22, [__cudaparm_calc_neigh_list_cell_host_numj];\n" +" add.u64 %rd23, %rd22, %rd14;\n" +" sub.u64 %rd19, %rd23, %rd6;\n" +" .loc 16 193 0\n" +" ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_nbor_list];\n" +" ld.param.s32 %r53, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n" +" sub.s32 %r54, %r43, %r35;\n" +" mul.lo.s32 %r55, %r53, %r54;\n" +" cvt.s64.s32 %rd25, %r55;\n" +" mul.wide.s32 %rd26, %r55, 4;\n" +" add.u64 %rd20, %rd24, %rd26;\n" +" mov.s32 %r52, 1;\n" +"$Lt_3_15106:\n" +" .loc 16 198 0\n" +" mov.s32 %r56, %r39;\n" +" @!%p2 bra $Lt_3_23298;\n" +" sub.s32 %r57, %r25, %r39;\n" +" add.s32 %r58, %r57, 1;\n" +" setp.le.s32 %p6, %r18, %r21;\n" +" add.s32 %r59, %r25, 1;\n" +" mov.s32 %r60, 0;\n" +" mov.s32 %r61, %r58;\n" +"$Lt_3_16130:\n" +" .loc 16 199 0\n" +" mov.s32 %r62, %r18;\n" +" @!%p6 bra $Lt_3_16386;\n" +" sub.s32 %r63, %r21, %r18;\n" +" add.s32 %r64, %r63, 1;\n" +" setp.ge.s32 %p7, %r31, %r28;\n" +" add.s32 %r65, %r21, 1;\n" +" mov.s32 %r66, %r64;\n" +"$Lt_3_16898:\n" +" @!%p7 bra $Lt_3_17154;\n" +" sub.s32 %r67, %r31, %r28;\n" +" add.s32 %r68, %r67, 1;\n" +" mul.lo.s32 %r69, %r62, %r5;\n" +" mul.lo.s32 %r70, %r56, %r5;\n" +" mul.lo.s32 %r71, %r70, %r1;\n" +" add.s32 %r72, %r31, 1;\n" +" add.s32 %r73, %r69, %r71;\n" +" add.s32 %r74, %r73, %r28;\n" +" add.s32 %r75, %r72, %r73;\n" +" cvt.s64.s32 %rd27, %r74;\n" +" mul.wide.s32 %rd28, %r74, 4;\n" +" add.u64 %rd29, %rd1, %rd28;\n" +" mov.s32 %r76, %r68;\n" +"$Lt_3_17666:\n" +" .loc 16 204 0\n" +" ld.global.s32 %r77, [%rd29+0];\n" +" .loc 16 205 0\n" +" ld.global.s32 %r78, [%rd29+4];\n" +" .loc 16 209 0\n" +" sub.s32 %r79, %r78, %r77;\n" +" cvt.rn.f32.s32 %f16, %r79;\n" +" mov.f32 %f17, 
0f43000000; \n" +" div.approx.ftz.f32 %f18, %f16, %f17;\n" +" cvt.rpi.ftz.f32.f32 %f19, %f18;\n" +" cvt.rzi.ftz.s32.f32 %r80, %f19;\n" +" mov.u32 %r81, 0;\n" +" setp.le.s32 %p8, %r80, %r81;\n" +" @%p8 bra $Lt_3_17922;\n" +" mov.s32 %r82, %r80;\n" +" mov.s32 %r83, 0;\n" +" setp.lt.s32 %p9, %r43, %r40;\n" +" mul.lo.s32 %r84, %r80, 128;\n" +" mov.s32 %r85, %r82;\n" +"$Lt_3_18434:\n" +" sub.s32 %r86, %r79, %r83;\n" +" mov.s32 %r87, 128;\n" +" min.s32 %r88, %r86, %r87;\n" +" setp.le.s32 %p10, %r88, %r32;\n" +" @%p10 bra $Lt_3_18690;\n" +" .loc 16 215 0\n" +" ld.param.u64 %rd30, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n" +" add.s32 %r89, %r83, %r32;\n" +" add.s32 %r90, %r77, %r89;\n" +" cvt.s64.s32 %rd31, %r90;\n" +" mul.wide.s32 %rd32, %r90, 4;\n" +" add.u64 %rd33, %rd30, %rd32;\n" +" ld.global.s32 %r91, [%rd33+0];\n" +" .loc 16 216 0\n" +" cvt.s64.s32 %rd34, %r32;\n" +" mul.wide.s32 %rd35, %r32, 4;\n" +" add.u64 %rd36, %rd7, %rd35;\n" +" st.shared.s32 [%rd36+0], %r91;\n" +" .loc 16 217 0\n" +" mov.u32 %r92, %r91;\n" +" mov.s32 %r93, 0;\n" +" mov.u32 %r94, %r93;\n" +" mov.s32 %r95, 0;\n" +" mov.u32 %r96, %r95;\n" +" mov.s32 %r97, 0;\n" +" mov.u32 %r98, %r97;\n" +" tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[neigh_tex,{%r92,%r94,%r96,%r98}];\n" +" mov.f32 %f24, %f20;\n" +" mov.f32 %f25, %f21;\n" +" mov.f32 %f26, %f22;\n" +" .loc 16 218 0\n" +" mul.lo.u64 %rd37, %rd34, 16;\n" +" add.u64 %rd38, %rd8, %rd37;\n" +" st.shared.v2.f32 [%rd38+0], {%f24,%f25};\n" +" .loc 16 220 0\n" +" st.shared.f32 [%rd38+8], %f26;\n" +"$Lt_3_18690:\n" +" .loc 16 222 0\n" +" bar.sync 0;\n" +" @!%p9 bra $Lt_3_19714;\n" +" mov.u32 %r99, 0;\n" +" setp.le.s32 %p11, %r88, %r99;\n" +" @%p11 bra $Lt_3_19714;\n" +" mov.s32 %r100, %r88;\n" +" mov.s64 %rd39, 0;\n" +" ld.param.f32 %f27, [__cudaparm_calc_neigh_list_cell_cell_size];\n" +" mul.ftz.f32 %f28, %f27, %f27;\n" +" mov.s64 %rd40, %rd8;\n" +" mov.f32 %f29, %f15;\n" +" mov.f32 %f30, %f14;\n" +" mov.f32 %f31, %f13;\n" +" mov.s32 %r101, 0;\n" +" mov.s32 %r102, %r100;\n" +"$Lt_3_20226:\n" +" ld.shared.v4.f32 {%f32,%f33,%f34,_}, [%rd40+0];\n" +" .loc 16 228 0\n" +" sub.ftz.f32 %f35, %f31, %f32;\n" +" .loc 16 229 0\n" +" sub.ftz.f32 %f36, %f30, %f33;\n" +" .loc 16 230 0\n" +" sub.ftz.f32 %f37, %f29, %f34;\n" +" .loc 16 227 0\n" +" mul.ftz.f32 %f38, %f36, %f36;\n" +" fma.rn.ftz.f32 %f39, %f35, %f35, %f38;\n" +" fma.rn.ftz.f32 %f40, %f37, %f37, %f39;\n" +" setp.gt.ftz.f32 %p12, %f28, %f40;\n" +" @!%p12 bra $Lt_3_24578;\n" +" cvt.ftz.f64.f32 %fd1, %f40;\n" +" mov.f64 %fd2, 0d3ee4f8b588e368f1; \n" +" setp.gt.f64 %p13, %fd1, %fd2;\n" +" @!%p13 bra $Lt_3_24578;\n" +" ld.param.s32 %r103, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n" +" setp.le.s32 %p14, %r103, %r60;\n" +" @%p14 bra $Lt_3_20482;\n" +" .loc 16 235 0\n" +" mul.lo.u64 %rd41, %rd39, 4;\n" +" add.u64 %rd42, %rd7, %rd41;\n" +" ld.shared.s32 %r104, [%rd42+0];\n" +" st.global.s32 [%rd20+0], %r104;\n" +" .loc 16 236 0\n" +" cvt.s64.s32 %rd43, %r52;\n" +" mul.wide.s32 %rd44, %r52, 4;\n" +" add.u64 %rd20, %rd20, %rd44;\n" +"$Lt_3_20482:\n" +" .loc 16 238 0\n" +" add.s32 %r60, %r60, 1;\n" +"$Lt_3_24578:\n" +"$L_3_12802:\n" +" add.s32 %r101, %r101, 1;\n" +" add.s64 %rd39, %rd39, 1;\n" +" add.u64 %rd40, %rd40, 16;\n" +" setp.ne.s32 %p15, %r88, %r101;\n" +" @%p15 bra $Lt_3_20226;\n" +"$Lt_3_19714:\n" +"$Lt_3_19202:\n" +" .loc 16 242 0\n" +" bar.sync 0;\n" +" add.s32 %r83, %r83, 128;\n" +" setp.ne.s32 %p16, %r83, %r84;\n" +" @%p16 bra $Lt_3_18434;\n" +"$Lt_3_17922:\n" +" add.s32 %r74, %r74, 1;\n" +" add.u64 %rd29, %rd29, 
4;\n" +" setp.ne.s32 %p17, %r74, %r75;\n" +" @%p17 bra $Lt_3_17666;\n" +"$Lt_3_17154:\n" +" add.s32 %r62, %r62, 1;\n" +" setp.ne.s32 %p18, %r65, %r62;\n" +" @%p18 bra $Lt_3_16898;\n" +"$Lt_3_16386:\n" +" add.s32 %r56, %r56, 1;\n" +" setp.ne.s32 %p19, %r59, %r56;\n" +" @%p19 bra $Lt_3_16130;\n" +" bra.uni $Lt_3_15618;\n" +"$Lt_3_23298:\n" +" mov.s32 %r60, 0;\n" +"$Lt_3_15618:\n" +" @!%p4 bra $Lt_3_22274;\n" +" .loc 16 248 0\n" +" st.global.s32 [%rd19+0], %r60;\n" +"$Lt_3_22274:\n" +" .loc 16 172 0\n" +" add.s32 %r42, %r42, 1;\n" +" add.u32 %r34, %r34, %r15;\n" +" add.s32 %r37, %r37, %r15;\n" +" cvt.rn.f32.s32 %f41, %r42;\n" +" setp.lt.ftz.f32 %p20, %f41, %f4;\n" +" @%p20 bra $Lt_3_13826;\n" +"$Lt_3_13314:\n" +" .loc 16 250 0\n" +" exit;\n" +"$LDWend_calc_neigh_list_cell:\n" +" }\n" +" .entry kernel_special (\n" +" .param .u64 __cudaparm_kernel_special_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_special_host_nbor_list,\n" +" .param .u64 __cudaparm_kernel_special_host_numj,\n" +" .param .u64 __cudaparm_kernel_special_tag,\n" +" .param .u64 __cudaparm_kernel_special_nspecial,\n" +" .param .u64 __cudaparm_kernel_special_special,\n" +" .param .s32 __cudaparm_kernel_special_inum,\n" +" .param .s32 __cudaparm_kernel_special_nt,\n" +" .param .s32 __cudaparm_kernel_special_max_nbors)\n" +" {\n" +" .reg .u32 %r<34>;\n" +" .reg .u64 %rd<36>;\n" +" .reg .pred %p<11>;\n" +" .loc 16 256 0\n" +"$LDWbegin_kernel_special:\n" +" mov.u32 %r1, %ctaid.x;\n" +" mov.u32 %r2, %ntid.x;\n" +" mul.lo.u32 %r3, %r1, %r2;\n" +" mov.u32 %r4, %tid.x;\n" +" add.u32 %r5, %r4, %r3;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_special_nt];\n" +" setp.le.s32 %p1, %r6, %r5;\n" +" @%p1 bra $Lt_4_6146;\n" +" .loc 16 264 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];\n" +" mul.lo.s32 %r7, %r5, 3;\n" +" cvt.s64.s32 %rd2, %r7;\n" +" mul.wide.s32 %rd3, %r7, 4;\n" +" add.u64 %rd4, %rd1, %rd3;\n" +" ld.global.s32 %r8, [%rd4+0];\n" +" .loc 16 265 0\n" +" ld.global.s32 %r9, [%rd4+4];\n" +" .loc 16 266 0\n" +" ld.global.s32 %r10, [%rd4+8];\n" +" ld.param.s32 %r11, [__cudaparm_kernel_special_inum];\n" +" setp.le.s32 %p2, %r11, %r5;\n" +" @%p2 bra $Lt_4_6914;\n" +" .loc 16 270 0\n" +" mov.s32 %r12, %r11;\n" +" .loc 16 272 0\n" +" cvt.s64.s32 %rd5, %r11;\n" +" ld.param.u64 %rd6, [__cudaparm_kernel_special_dev_nbor];\n" +" cvt.s64.s32 %rd7, %r5;\n" +" add.u64 %rd8, %rd7, %rd5;\n" +" mul.lo.u64 %rd9, %rd8, 4;\n" +" add.u64 %rd10, %rd6, %rd9;\n" +" ld.global.s32 %r13, [%rd10+0];\n" +" .loc 16 273 0\n" +" mul.wide.s32 %rd11, %r11, 4;\n" +" add.u64 %rd12, %rd10, %rd11;\n" +" bra.uni $Lt_4_6658;\n" +"$Lt_4_6914:\n" +" .loc 16 276 0\n" +" sub.s32 %r14, %r5, %r11;\n" +" ld.param.u64 %rd13, [__cudaparm_kernel_special_host_nbor_list];\n" +" ld.param.s32 %r15, [__cudaparm_kernel_special_max_nbors];\n" +" mul.lo.s32 %r16, %r15, %r14;\n" +" cvt.s64.s32 %rd14, %r16;\n" +" mul.wide.s32 %rd15, %r16, 4;\n" +" add.u64 %rd12, %rd13, %rd15;\n" +" .loc 16 277 0\n" +" ld.param.u64 %rd16, [__cudaparm_kernel_special_host_numj];\n" +" cvt.s64.s32 %rd17, %r14;\n" +" mul.wide.s32 %rd18, %r14, 4;\n" +" add.u64 %rd19, %rd16, %rd18;\n" +" ld.global.s32 %r13, [%rd19+0];\n" +" mov.s32 %r12, 1;\n" +"$Lt_4_6658:\n" +" .loc 16 279 0\n" +" mul.lo.s32 %r17, %r13, %r12;\n" +" cvt.s64.s32 %rd20, %r17;\n" +" mul.wide.s32 %rd21, %r17, 4;\n" +" add.u64 %rd22, %rd12, %rd21;\n" +" setp.le.u64 %p3, %rd22, %rd12;\n" +" @%p3 bra $Lt_4_7170;\n" +" mov.s32 %r18, 0;\n" +" setp.gt.s32 %p4, %r10, %r18;\n" +" cvt.s64.s32 %rd23, %r12;\n" +" ld.param.u64 %rd24, 
[__cudaparm_kernel_special_tag];\n" +"$Lt_4_7682:\n" +" .loc 16 282 0\n" +" ld.global.s32 %r19, [%rd12+0];\n" +" .loc 16 283 0\n" +" cvt.s64.s32 %rd25, %r19;\n" +" mul.wide.s32 %rd26, %r19, 4;\n" +" add.u64 %rd27, %rd24, %rd26;\n" +" ld.global.s32 %r20, [%rd27+0];\n" +" @!%p4 bra $Lt_4_7938;\n" +" mov.s32 %r21, %r10;\n" +" cvt.s64.s32 %rd28, %r5;\n" +" cvt.s64.s32 %rd29, %r6;\n" +" mul.wide.s32 %rd30, %r6, 4;\n" +" ld.param.u64 %rd31, [__cudaparm_kernel_special_special];\n" +" mul.wide.s32 %rd32, %r5, 4;\n" +" add.u64 %rd33, %rd31, %rd32;\n" +" mov.s32 %r22, 0;\n" +" mov.s32 %r23, %r21;\n" +"$Lt_4_8450:\n" +" ld.global.s32 %r24, [%rd33+0];\n" +" setp.ne.s32 %p5, %r24, %r20;\n" +" @%p5 bra $Lt_4_8706;\n" +" .loc 16 293 0\n" +" setp.le.s32 %p6, %r8, %r22;\n" +" mov.s32 %r25, 3;\n" +" mov.s32 %r26, 2;\n" +" selp.s32 %r27, %r25, %r26, %p6;\n" +" mov.s32 %r28, 2;\n" +" mov.s32 %r29, 1;\n" +" selp.s32 %r30, %r28, %r29, %p6;\n" +" setp.le.s32 %p7, %r9, %r22;\n" +" selp.s32 %r31, %r27, %r30, %p7;\n" +" shl.b32 %r32, %r31, 30;\n" +" xor.b32 %r19, %r19, %r32;\n" +" .loc 16 294 0\n" +" st.global.s32 [%rd12+0], %r19;\n" +"$Lt_4_8706:\n" +" add.s32 %r22, %r22, 1;\n" +" add.u64 %rd33, %rd30, %rd33;\n" +" setp.ne.s32 %p8, %r10, %r22;\n" +" @%p8 bra $Lt_4_8450;\n" +"$Lt_4_7938:\n" +" .loc 16 281 0\n" +" mul.lo.u64 %rd34, %rd23, 4;\n" +" add.u64 %rd12, %rd12, %rd34;\n" +" setp.gt.u64 %p9, %rd22, %rd12;\n" +" @%p9 bra $Lt_4_7682;\n" +"$Lt_4_7170:\n" +"$Lt_4_6146:\n" +" .loc 16 300 0\n" +" exit;\n" +"$LDWend_kernel_special:\n" +" }\n" +; diff --git a/lib/gpu/pair_gpu_dev_kernel.ptx b/lib/gpu/pair_gpu_dev_kernel.ptx new file mode 100644 index 000000000..a1c023967 --- /dev/null +++ b/lib/gpu/pair_gpu_dev_kernel.ptx @@ -0,0 +1,134 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bba8_00000000-9_pair_gpu_dev_kernel.cpp3.i (/home/sjplimp/ccBI#.SuFQHy) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bba8_00000000-8_pair_gpu_dev_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "pair_gpu_dev_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 
"/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + + .entry kernel_zero ( + .param .u64 __cudaparm_kernel_zero_mem, + .param .s32 __cudaparm_kernel_zero_numel) + { + .reg .u32 %r<9>; + .reg .u64 %rd<6>; + .reg .pred %p<3>; + .loc 16 95 0 +$LDWbegin_kernel_zero: + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mul.lo.u32 %r3, %r1, %r2; + mov.u32 %r4, %tid.x; + add.u32 %r5, %r4, %r3; + ld.param.s32 %r6, [__cudaparm_kernel_zero_numel]; + setp.le.s32 %p1, %r6, %r5; + @%p1 bra $Lt_0_1026; + .loc 16 99 0 + mov.s32 %r7, 0; + ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem]; + cvt.s64.s32 %rd2, %r5; + mul.wide.s32 %rd3, %r5, 4; + add.u64 %rd4, %rd1, %rd3; + st.global.s32 [%rd4+0], %r7; +$Lt_0_1026: + .loc 16 100 0 + exit; +$LDWend_kernel_zero: + } // kernel_zero + + .entry kernel_info ( + .param .u64 __cudaparm_kernel_info_info) + { + .reg .u32 %r<16>; + .reg .u64 %rd<3>; + .loc 16 102 0 +$LDWbegin_kernel_info: + .loc 16 103 0 + ld.param.u64 %rd1, [__cudaparm_kernel_info_info]; + mov.s32 %r1, 200; + st.global.s32 [%rd1+0], %r1; + .loc 16 104 0 + mov.s32 %r2, 32; + st.global.s32 [%rd1+4], %r2; + .loc 16 105 0 + mov.s32 %r3, 32; + st.global.s32 [%rd1+8], %r3; + .loc 16 106 0 + mov.s32 %r4, 1; + st.global.s32 [%rd1+12], %r4; + .loc 16 107 0 + mov.s32 %r5, 8; + st.global.s32 [%rd1+16], %r5; + .loc 16 108 0 + mov.s32 %r6, 64; + st.global.s32 [%rd1+20], %r6; + .loc 16 109 0 + mov.s32 %r7, 128; + st.global.s32 [%rd1+24], %r7; + .loc 16 110 0 + mov.s32 %r8, 11; + st.global.s32 [%rd1+28], %r8; + .loc 16 111 0 + mov.s32 %r9, 8; + st.global.s32 [%rd1+32], %r9; + .loc 16 112 0 + mov.s32 %r10, 128; + st.global.s32 [%rd1+36], %r10; + .loc 16 113 0 + mov.s32 %r11, 128; + st.global.s32 [%rd1+40], %r11; + .loc 16 114 0 + mov.s32 %r12, 128; + st.global.s32 [%rd1+44], %r12; + .loc 16 115 0 + mov.s32 %r13, 128; + st.global.s32 [%rd1+48], %r13; + .loc 16 116 0 + mov.s32 %r14, 8; + st.global.s32 [%rd1+52], %r14; + .loc 16 117 0 + exit; +$LDWend_kernel_info: + } // kernel_info + diff --git a/lib/gpu/pair_gpu_dev_ptx.h b/lib/gpu/pair_gpu_dev_ptx.h new file mode 100644 index 000000000..8de31c943 --- /dev/null +++ b/lib/gpu/pair_gpu_dev_ptx.h @@ -0,0 +1,88 @@ +const char * pair_gpu_dev_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .entry kernel_zero (\n" +" .param .u64 __cudaparm_kernel_zero_mem,\n" +" .param .s32 __cudaparm_kernel_zero_numel)\n" +" {\n" +" .reg .u32 %r<9>;\n" +" .reg .u64 %rd<6>;\n" +" .reg .pred %p<3>;\n" +" .loc 16 95 0\n" +"$LDWbegin_kernel_zero:\n" +" mov.u32 %r1, %ctaid.x;\n" +" mov.u32 %r2, %ntid.x;\n" +" mul.lo.u32 %r3, %r1, %r2;\n" +" mov.u32 %r4, %tid.x;\n" +" add.u32 %r5, %r4, %r3;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_zero_numel];\n" +" setp.le.s32 %p1, %r6, %r5;\n" +" @%p1 bra $Lt_0_1026;\n" +" .loc 16 99 0\n" +" mov.s32 %r7, 0;\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];\n" +" cvt.s64.s32 %rd2, %r5;\n" +" mul.wide.s32 %rd3, %r5, 4;\n" +" add.u64 %rd4, %rd1, %rd3;\n" +" st.global.s32 [%rd4+0], %r7;\n" +"$Lt_0_1026:\n" +" .loc 16 100 0\n" +" exit;\n" +"$LDWend_kernel_zero:\n" +" }\n" +" .entry kernel_info (\n" +" .param .u64 
__cudaparm_kernel_info_info)\n" +" {\n" +" .reg .u32 %r<16>;\n" +" .reg .u64 %rd<3>;\n" +" .loc 16 102 0\n" +"$LDWbegin_kernel_info:\n" +" .loc 16 103 0\n" +" ld.param.u64 %rd1, [__cudaparm_kernel_info_info];\n" +" mov.s32 %r1, 200;\n" +" st.global.s32 [%rd1+0], %r1;\n" +" .loc 16 104 0\n" +" mov.s32 %r2, 32;\n" +" st.global.s32 [%rd1+4], %r2;\n" +" .loc 16 105 0\n" +" mov.s32 %r3, 32;\n" +" st.global.s32 [%rd1+8], %r3;\n" +" .loc 16 106 0\n" +" mov.s32 %r4, 1;\n" +" st.global.s32 [%rd1+12], %r4;\n" +" .loc 16 107 0\n" +" mov.s32 %r5, 8;\n" +" st.global.s32 [%rd1+16], %r5;\n" +" .loc 16 108 0\n" +" mov.s32 %r6, 64;\n" +" st.global.s32 [%rd1+20], %r6;\n" +" .loc 16 109 0\n" +" mov.s32 %r7, 128;\n" +" st.global.s32 [%rd1+24], %r7;\n" +" .loc 16 110 0\n" +" mov.s32 %r8, 11;\n" +" st.global.s32 [%rd1+28], %r8;\n" +" .loc 16 111 0\n" +" mov.s32 %r9, 8;\n" +" st.global.s32 [%rd1+32], %r9;\n" +" .loc 16 112 0\n" +" mov.s32 %r10, 128;\n" +" st.global.s32 [%rd1+36], %r10;\n" +" .loc 16 113 0\n" +" mov.s32 %r11, 128;\n" +" st.global.s32 [%rd1+40], %r11;\n" +" .loc 16 114 0\n" +" mov.s32 %r12, 128;\n" +" st.global.s32 [%rd1+44], %r12;\n" +" .loc 16 115 0\n" +" mov.s32 %r13, 128;\n" +" st.global.s32 [%rd1+48], %r13;\n" +" .loc 16 116 0\n" +" mov.s32 %r14, 8;\n" +" st.global.s32 [%rd1+52], %r14;\n" +" .loc 16 117 0\n" +" exit;\n" +"$LDWend_kernel_info:\n" +" }\n" +; diff --git a/lib/gpu/pair_gpu_nbor_kernel.ptx b/lib/gpu/pair_gpu_nbor_kernel.ptx new file mode 100644 index 000000000..4f595cacb --- /dev/null +++ b/lib/gpu/pair_gpu_nbor_kernel.ptx @@ -0,0 +1,118 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bb58_00000000-9_pair_gpu_nbor_kernel.cpp3.i (/home/sjplimp/ccBI#.bBFvWV) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bb58_00000000-8_pair_gpu_nbor_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "pair_gpu_nbor_kernel.cu" + .file 17 "/usr/local/cuda/include/common_functions.h" + .file 18 "/usr/local/cuda/include/math_functions.h" + .file 19 "/usr/local/cuda/include/math_constants.h" + .file 20 "/usr/local/cuda/include/device_functions.h" + .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 
"/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + + .entry kernel_unpack ( + .param .u64 __cudaparm_kernel_unpack_dev_nbor, + .param .u64 __cudaparm_kernel_unpack_dev_ij, + .param .s32 __cudaparm_kernel_unpack_inum) + { + .reg .u32 %r<11>; + .reg .u64 %rd<27>; + .reg .pred %p<5>; + .loc 16 29 0 +$LDWbegin_kernel_unpack: + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mul.lo.u32 %r3, %r1, %r2; + mov.u32 %r4, %tid.x; + add.u32 %r5, %r4, %r3; + ld.param.s32 %r6, [__cudaparm_kernel_unpack_inum]; + setp.le.s32 %p1, %r6, %r5; + @%p1 bra $Lt_0_2050; + .loc 16 35 0 + cvt.s64.s32 %rd1, %r6; + ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor]; + cvt.s64.s32 %rd3, %r5; + add.u64 %rd4, %rd3, %rd1; + mul.lo.u64 %rd5, %rd4, 4; + add.u64 %rd6, %rd2, %rd5; + ld.global.s32 %r7, [%rd6+0]; + .loc 16 36 0 + mul.wide.s32 %rd7, %r6, 4; + add.u64 %rd8, %rd6, %rd7; + mov.s64 %rd9, %rd8; + .loc 16 37 0 + ld.param.u64 %rd10, [__cudaparm_kernel_unpack_dev_ij]; + ld.global.s32 %r8, [%rd8+0]; + cvt.s64.s32 %rd11, %r8; + mul.wide.s32 %rd12, %r8, 4; + add.u64 %rd13, %rd10, %rd12; + .loc 16 38 0 + cvt.s64.s32 %rd14, %r7; + mul.wide.s32 %rd15, %r7, 4; + add.u64 %rd16, %rd15, %rd13; + setp.le.u64 %p2, %rd16, %rd13; + @%p2 bra $Lt_0_2562; + add.u64 %rd17, %rd15, 3; + shr.s64 %rd18, %rd17, 63; + mov.s64 %rd19, 3; + and.b64 %rd20, %rd18, %rd19; + add.s64 %rd21, %rd20, %rd17; + shr.s64 %rd22, %rd21, 2; + mov.s64 %rd23, 1; + max.s64 %rd24, %rd22, %rd23; + mov.s64 %rd25, %rd24; +$Lt_0_3074: + //<loop> Loop body line 38, nesting depth: 1, estimated iterations: unknown + .loc 16 41 0 + ld.global.s32 %r9, [%rd13+0]; + st.global.s32 [%rd9+0], %r9; + .loc 16 42 0 + add.u64 %rd9, %rd7, %rd9; + .loc 16 40 0 + add.u64 %rd13, %rd13, 4; + setp.gt.u64 %p3, %rd16, %rd13; + @%p3 bra $Lt_0_3074; +$Lt_0_2562: +$Lt_0_2050: + .loc 16 45 0 + exit; +$LDWend_kernel_unpack: + } // kernel_unpack + diff --git a/lib/gpu/pair_gpu_nbor_ptx.h b/lib/gpu/pair_gpu_nbor_ptx.h new file mode 100644 index 000000000..955c686d3 --- /dev/null +++ b/lib/gpu/pair_gpu_nbor_ptx.h @@ -0,0 +1,72 @@ +const char * pair_gpu_nbor_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .entry kernel_unpack (\n" +" .param .u64 __cudaparm_kernel_unpack_dev_nbor,\n" +" .param .u64 __cudaparm_kernel_unpack_dev_ij,\n" +" .param .s32 __cudaparm_kernel_unpack_inum)\n" +" {\n" +" .reg .u32 %r<11>;\n" +" .reg .u64 %rd<27>;\n" +" .reg .pred %p<5>;\n" +" .loc 16 29 0\n" +"$LDWbegin_kernel_unpack:\n" +" mov.u32 %r1, %ctaid.x;\n" +" mov.u32 %r2, %ntid.x;\n" +" mul.lo.u32 %r3, %r1, %r2;\n" +" mov.u32 %r4, %tid.x;\n" +" add.u32 %r5, %r4, %r3;\n" +" ld.param.s32 %r6, [__cudaparm_kernel_unpack_inum];\n" +" setp.le.s32 %p1, %r6, %r5;\n" +" @%p1 bra $Lt_0_2050;\n" +" .loc 16 35 0\n" +" cvt.s64.s32 %rd1, %r6;\n" +" ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor];\n" +" cvt.s64.s32 %rd3, %r5;\n" +" add.u64 %rd4, %rd3, %rd1;\n" +" mul.lo.u64 %rd5, %rd4, 4;\n" +" add.u64 %rd6, %rd2, %rd5;\n" +" ld.global.s32 %r7, [%rd6+0];\n" +" .loc 16 36 0\n" +" mul.wide.s32 %rd7, %r6, 4;\n" +" add.u64 %rd8, %rd6, %rd7;\n" +" mov.s64 %rd9, %rd8;\n" +" .loc 16 37 0\n" +" ld.param.u64 %rd10, [__cudaparm_kernel_unpack_dev_ij];\n" +" ld.global.s32 %r8, [%rd8+0];\n" +" cvt.s64.s32 %rd11, %r8;\n" +" mul.wide.s32 %rd12, %r8, 4;\n" +" add.u64 %rd13, 
%rd10, %rd12;\n" +" .loc 16 38 0\n" +" cvt.s64.s32 %rd14, %r7;\n" +" mul.wide.s32 %rd15, %r7, 4;\n" +" add.u64 %rd16, %rd15, %rd13;\n" +" setp.le.u64 %p2, %rd16, %rd13;\n" +" @%p2 bra $Lt_0_2562;\n" +" add.u64 %rd17, %rd15, 3;\n" +" shr.s64 %rd18, %rd17, 63;\n" +" mov.s64 %rd19, 3;\n" +" and.b64 %rd20, %rd18, %rd19;\n" +" add.s64 %rd21, %rd20, %rd17;\n" +" shr.s64 %rd22, %rd21, 2;\n" +" mov.s64 %rd23, 1;\n" +" max.s64 %rd24, %rd22, %rd23;\n" +" mov.s64 %rd25, %rd24;\n" +"$Lt_0_3074:\n" +" .loc 16 41 0\n" +" ld.global.s32 %r9, [%rd13+0];\n" +" st.global.s32 [%rd9+0], %r9;\n" +" .loc 16 42 0\n" +" add.u64 %rd9, %rd7, %rd9;\n" +" .loc 16 40 0\n" +" add.u64 %rd13, %rd13, 4;\n" +" setp.gt.u64 %p3, %rd16, %rd13;\n" +" @%p3 bra $Lt_0_3074;\n" +"$Lt_0_2562:\n" +"$Lt_0_2050:\n" +" .loc 16 45 0\n" +" exit;\n" +"$LDWend_kernel_unpack:\n" +" }\n" +; diff --git a/lib/gpu/pppm_d_gpu_kernel.ptx b/lib/gpu/pppm_d_gpu_kernel.ptx new file mode 100644 index 000000000..488a2c580 --- /dev/null +++ b/lib/gpu/pppm_d_gpu_kernel.ptx @@ -0,0 +1,900 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bc69_00000000-9_pppm_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.fFsh3D) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bc69_00000000-8_pppm_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 17 "pppm_gpu_kernel.cu" + .file 18 "/usr/local/cuda/include/common_functions.h" + .file 19 "/usr/local/cuda/include/math_functions.h" + .file 20 "/usr/local/cuda/include/math_constants.h" + .file 21 "/usr/local/cuda/include/device_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry particle_map ( + .param .u64 __cudaparm_particle_map_x_, + .param .u64 __cudaparm_particle_map_q_, + .param .f64 __cudaparm_particle_map_delvolinv, + .param .s32 __cudaparm_particle_map_nlocal, + .param .u64 
__cudaparm_particle_map_counts, + .param .u64 __cudaparm_particle_map_ans, + .param .f64 __cudaparm_particle_map_b_lo_x, + .param .f64 __cudaparm_particle_map_b_lo_y, + .param .f64 __cudaparm_particle_map_b_lo_z, + .param .f64 __cudaparm_particle_map_delxinv, + .param .f64 __cudaparm_particle_map_delyinv, + .param .f64 __cudaparm_particle_map_delzinv, + .param .s32 __cudaparm_particle_map_nlocal_x, + .param .s32 __cudaparm_particle_map_nlocal_y, + .param .s32 __cudaparm_particle_map_nlocal_z, + .param .s32 __cudaparm_particle_map_atom_stride, + .param .s32 __cudaparm_particle_map_max_atoms, + .param .u64 __cudaparm_particle_map_error) + { + .reg .u32 %r<50>; + .reg .u64 %rd<12>; + .reg .f32 %f<14>; + .reg .f64 %fd<36>; + .reg .pred %p<11>; + .loc 17 113 0 +$LDWbegin_particle_map: + mov.u32 %r1, %ntid.x; + mov.u32 %r2, %ctaid.x; + mul.lo.u32 %r3, %r2, %r1; + mov.u32 %r4, %nctaid.x; + mul.lo.u32 %r5, %r4, %r1; + mov.u32 %r6, %tid.x; + add.u32 %r7, %r6, %r3; + sub.s32 %r8, %r5, 1; + mul.lo.s32 %r9, %r7, 64; + div.s32 %r10, %r9, %r5; + mul.lo.s32 %r11, %r8, %r10; + sub.s32 %r12, %r9, %r11; + ld.param.s32 %r13, [__cudaparm_particle_map_nlocal]; + setp.le.s32 %p1, %r13, %r12; + @%p1 bra $Lt_0_7426; + .loc 17 125 0 + mov.u32 %r14, %r12; + mov.s32 %r15, 0; + mov.u32 %r16, %r15; + mov.s32 %r17, 0; + mov.u32 %r18, %r17; + mov.s32 %r19, 0; + mov.u32 %r20, %r19; + tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}]; + mov.f32 %f5, %f1; + mov.f32 %f6, %f2; + mov.f32 %f7, %f3; + .loc 17 127 0 + mov.u32 %r21, %r12; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}]; + mov.f32 %f12, %f8; + cvt.ftz.f64.f32 %fd1, %f12; + ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv]; + mul.f64 %fd3, %fd1, %fd2; + mov.f64 %fd4, 0d0000000000000000; // 0 + setp.neu.f64 %p2, %fd3, %fd4; + @!%p2 bra $Lt_0_7426; + .loc 17 130 0 + ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv]; + cvt.ftz.f64.f32 %fd6, %f5; + ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x]; + sub.f64 %fd8, %fd6, %fd7; + mul.f64 %fd9, %fd5, %fd8; + mov.f64 %fd10, 0d0000000000000000; // 0 + setp.lt.f64 %p3, %fd9, %fd10; + @%p3 bra $Lt_0_8706; + ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv]; + cvt.ftz.f64.f32 %fd12, %f6; + ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y]; + sub.f64 %fd14, %fd12, %fd13; + mul.f64 %fd15, %fd11, %fd14; + mov.f64 %fd16, 0d0000000000000000; // 0 + setp.lt.f64 %p4, %fd15, %fd16; + @%p4 bra $Lt_0_8706; + ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv]; + cvt.ftz.f64.f32 %fd18, %f7; + ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z]; + sub.f64 %fd20, %fd18, %fd19; + mul.f64 %fd21, %fd17, %fd20; + mov.f64 %fd22, 0d0000000000000000; // 0 + setp.lt.f64 %p5, %fd21, %fd22; + @%p5 bra $Lt_0_8706; + cvt.rzi.s32.f64 %r28, %fd9; + ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x]; + setp.ge.s32 %p6, %r28, %r29; + @%p6 bra $Lt_0_8706; + cvt.rzi.s32.f64 %r30, %fd15; + ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y]; + setp.ge.s32 %p7, %r30, %r31; + @%p7 bra $Lt_0_8706; + cvt.rzi.s32.f64 %r32, %fd21; + ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z]; + setp.gt.s32 %p8, %r33, %r32; + @%p8 bra $L_0_4866; +$Lt_0_8706: +$L_0_5122: + .loc 17 139 0 + mov.s32 %r34, 1; + ld.param.u64 %rd1, [__cudaparm_particle_map_error]; + st.global.s32 [%rd1+0], %r34; + bra.uni $Lt_0_7426; +$L_0_4866: + .loc 17 146 0 + mul.lo.s32 %r35, %r32, %r31; + add.s32 %r36, 
%r30, %r35; + mul.lo.s32 %r37, %r36, %r29; + add.s32 %r38, %r28, %r37; + ld.param.u64 %rd2, [__cudaparm_particle_map_counts]; + cvt.s64.s32 %rd3, %r38; + mul.wide.s32 %rd4, %r38, 4; + add.u64 %rd5, %rd2, %rd4; + mov.s32 %r39, 1; + atom.global.add.s32 %r40, [%rd5], %r39; + mov.s32 %r41, %r40; + ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms]; + setp.gt.s32 %p9, %r42, %r41; + @%p9 bra $Lt_0_7682; + .loc 17 148 0 + mov.s32 %r43, 2; + ld.param.u64 %rd6, [__cudaparm_particle_map_error]; + st.global.s32 [%rd6+0], %r43; + .loc 16 118 0 + mov.s32 %r44, -1; + atom.global.add.s32 %r45, [%rd5], %r44; + bra.uni $Lt_0_7426; +$Lt_0_7682: + .loc 17 151 0 + ld.param.u64 %rd7, [__cudaparm_particle_map_ans]; + ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride]; + mul.lo.s32 %r47, %r46, %r41; + add.s32 %r48, %r38, %r47; + cvt.s64.s32 %rd8, %r48; + mul.wide.s32 %rd9, %r48, 32; + add.u64 %rd10, %rd7, %rd9; + cvt.rn.f64.s32 %fd23, %r28; + mov.f64 %fd24, 0d3fe0000000000000; // 0.5 + add.f64 %fd25, %fd23, %fd24; + sub.f64 %fd26, %fd25, %fd9; + cvt.rn.f64.s32 %fd27, %r30; + mov.f64 %fd28, 0d3fe0000000000000; // 0.5 + add.f64 %fd29, %fd27, %fd28; + sub.f64 %fd30, %fd29, %fd15; + st.global.v2.f64 [%rd10+0], {%fd26,%fd30}; + cvt.rn.f64.s32 %fd31, %r32; + mov.f64 %fd32, 0d3fe0000000000000; // 0.5 + add.f64 %fd33, %fd31, %fd32; + sub.f64 %fd34, %fd33, %fd21; + st.global.v2.f64 [%rd10+16], {%fd34,%fd3}; +$Lt_0_7426: +$L_0_4610: +$Lt_0_6914: +$Lt_0_6402: + .loc 17 155 0 + exit; +$LDWend_particle_map: + } // particle_map + + .entry make_rho ( + .param .u64 __cudaparm_make_rho_counts, + .param .u64 __cudaparm_make_rho_atoms, + .param .u64 __cudaparm_make_rho_brick, + .param .u64 __cudaparm_make_rho__rho_coeff, + .param .s32 __cudaparm_make_rho_atom_stride, + .param .s32 __cudaparm_make_rho_npts_x, + .param .s32 __cudaparm_make_rho_npts_y, + .param .s32 __cudaparm_make_rho_npts_z, + .param .s32 __cudaparm_make_rho_nlocal_x, + .param .s32 __cudaparm_make_rho_nlocal_y, + .param .s32 __cudaparm_make_rho_nlocal_z, + .param .s32 __cudaparm_make_rho_order_m_1, + .param .s32 __cudaparm_make_rho_order, + .param .s32 __cudaparm_make_rho_order2) + { + .reg .u32 %r<119>; + .reg .u64 %rd<57>; + .reg .f64 %fd<26>; + .reg .pred %p<27>; + .shared .align 8 .b8 __cuda___cuda_local_var_32531_34_non_const_rho_coeff200[512]; + .shared .align 8 .b8 __cuda___cuda_local_var_32532_34_non_const_front712[640]; + .shared .align 8 .b8 __cuda___cuda_local_var_32533_34_non_const_ans1352[4096]; + .loc 17 164 0 +$LDWbegin_make_rho: + ld.param.s32 %r1, [__cudaparm_make_rho_order2]; + ld.param.s32 %r2, [__cudaparm_make_rho_order]; + add.s32 %r3, %r1, %r2; + cvt.s32.u32 %r4, %tid.x; + setp.le.s32 %p1, %r3, %r4; + @%p1 bra $Lt_1_16898; + .loc 17 171 0 + mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200; + cvt.s64.s32 %rd2, %r4; + mul.wide.s32 %rd3, %r4, 8; + ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f64 %fd1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f64 [%rd6+0], %fd1; +$Lt_1_16898: + mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200; + shr.s32 %r5, %r4, 31; + mov.s32 %r6, 31; + and.b32 %r7, %r5, %r6; + add.s32 %r8, %r7, %r4; + shr.s32 %r9, %r8, 5; + mul.lo.s32 %r10, %r9, 32; + sub.s32 %r11, %r4, %r10; + setp.lt.s32 %p2, %r11, %r2; + @!%p2 bra $Lt_1_17410; + .loc 17 177 0 + mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712; + mov.f64 %fd2, 0d0000000000000000; // 0 + cvt.s64.s32 %rd8, %r11; + shr.s32 %r12, %r4, 31; + mov.s32 %r13, 
31; + and.b32 %r14, %r12, %r13; + add.s32 %r15, %r14, %r4; + shr.s32 %r16, %r15, 5; + cvt.s64.s32 %rd9, %r16; + mul.wide.s32 %rd10, %r16, 40; + add.u64 %rd11, %rd8, %rd10; + mul.lo.u64 %rd12, %rd11, 8; + add.u64 %rd13, %rd7, %rd12; + st.shared.f64 [%rd13+256], %fd2; +$Lt_1_17410: + mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712; + .loc 17 179 0 + bar.sync 0; + ld.param.s32 %r17, [__cudaparm_make_rho_npts_x]; + shr.s32 %r18, %r17, 31; + mov.s32 %r19, 31; + and.b32 %r20, %r18, %r19; + add.s32 %r21, %r20, %r17; + shr.s32 %r22, %r21, 5; + add.s32 %r23, %r22, 1; + mov.u32 %r24, 0; + setp.le.s32 %p3, %r23, %r24; + @%p3 bra $Lt_1_17922; + shr.s32 %r25, %r4, 31; + mov.s32 %r26, 31; + and.b32 %r27, %r25, %r26; + add.s32 %r28, %r27, %r4; + shr.s32 %r29, %r28, 5; + add.s32 %r30, %r11, 32; + ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y]; + ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x]; + mul.lo.s32 %r33, %r31, %r32; + mov.u32 %r34, %ctaid.x; + mul.lo.u32 %r35, %r34, 2; + add.u32 %r36, %r29, %r35; + ld.param.s32 %r37, [__cudaparm_make_rho_npts_y]; + div.s32 %r38, %r36, %r37; + ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1]; + setp.lt.s32 %p4, %r38, %r39; + sub.s32 %r40, %r39, %r38; + mov.s32 %r41, 0; + selp.s32 %r42, %r40, %r41, %p4; + ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z]; + setp.ge.s32 %p5, %r38, %r43; + sub.s32 %r44, %r43, %r38; + add.s32 %r45, %r44, %r2; + sub.s32 %r46, %r45, 1; + selp.s32 %r47, %r46, %r2, %p5; + rem.s32 %r48, %r36, %r37; + setp.lt.s32 %p6, %r48, %r39; + sub.s32 %r49, %r39, %r48; + mov.s32 %r50, 0; + selp.s32 %r51, %r49, %r50, %p6; + setp.ge.s32 %p7, %r48, %r31; + sub.s32 %r52, %r31, %r48; + add.s32 %r53, %r52, %r2; + sub.s32 %r54, %r53, 1; + selp.s32 %r55, %r54, %r2, %p7; + mov.s32 %r56, %r23; + mov.s32 %r57, 0; + setp.gt.s32 %p8, %r2, %r57; + mov.s32 %r58, 0; + cvt.s64.s32 %rd14, %r11; + cvt.s64.s32 %rd15, %r29; + mul.lo.s32 %r59, %r23, 32; + mul.wide.s32 %rd16, %r29, 40; + add.u64 %rd17, %rd14, %rd16; + ld.param.s32 %r60, [__cudaparm_make_rho_npts_z]; + setp.gt.s32 %p9, %r60, %r38; + mul.lo.u64 %rd18, %rd17, 8; + selp.s32 %r61, 1, 0, %p9; + add.u64 %rd19, %rd18, %rd7; + mov.u64 %rd20, __cuda___cuda_local_var_32533_34_non_const_ans1352; + mov.s32 %r62, %r56; +$Lt_1_18434: + //<loop> Loop body line 179, nesting depth: 1, estimated iterations: unknown + @!%p8 bra $Lt_1_18690; + mov.s32 %r63, %r2; + cvt.s64.s32 %rd21, %r4; + mul.wide.s32 %rd22, %r4, 8; + add.u64 %rd23, %rd20, %rd22; + mov.s32 %r64, 0; + mov.s32 %r65, %r63; +$Lt_1_19202: + //<loop> Loop body line 179, nesting depth: 2, estimated iterations: unknown + .loc 17 203 0 + mov.f64 %fd3, 0d0000000000000000; // 0 + st.shared.f64 [%rd23+0], %fd3; + add.s32 %r64, %r64, 1; + add.u64 %rd23, %rd23, 512; + setp.ne.s32 %p10, %r64, %r2; + @%p10 bra $Lt_1_19202; +$Lt_1_18690: + add.s32 %r66, %r11, %r58; + set.lt.u32.s32 %r67, %r66, %r32; + neg.s32 %r68, %r67; + and.b32 %r69, %r61, %r68; + mov.u32 %r70, 0; + setp.eq.s32 %p11, %r69, %r70; + @%p11 bra $Lt_1_20226; + .loc 17 206 0 + mov.s32 %r71, %r42; + setp.ge.s32 %p12, %r42, %r47; + @%p12 bra $Lt_1_20226; + sub.s32 %r72, %r47, %r42; + setp.lt.s32 %p13, %r51, %r55; + mov.s32 %r73, %r72; +$Lt_1_20738: + //<loop> Loop body line 206, nesting depth: 2, estimated iterations: unknown + .loc 17 208 0 + mov.s32 %r74, %r51; + @!%p13 bra $Lt_1_20994; + sub.s32 %r75, %r55, %r51; + sub.s32 %r76, %r71, %r42; + add.s32 %r77, %r38, %r42; + add.s32 %r78, %r48, %r51; + sub.s32 %r79, %r77, %r39; + sub.s32 %r80, %r78, %r39; + add.s32 %r81, %r76, %r79; + 
mul.lo.s32 %r82, %r33, %r81; + ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride]; + ld.param.u64 %rd24, [__cudaparm_make_rho_counts]; + mov.s32 %r84, %r75; +$Lt_1_21506: + //<loop> Loop body line 208, nesting depth: 3, estimated iterations: unknown + .loc 17 210 0 + sub.s32 %r85, %r74, %r51; + add.s32 %r86, %r85, %r80; + mul.lo.s32 %r87, %r86, %r32; + add.s32 %r88, %r82, %r87; + add.s32 %r89, %r66, %r88; + cvt.s64.s32 %rd25, %r89; + mul.wide.s32 %rd26, %r89, 4; + add.u64 %rd27, %rd24, %rd26; + ld.global.s32 %r90, [%rd27+0]; + mul.lo.s32 %r91, %r90, %r83; + .loc 17 211 0 + mov.s32 %r92, %r89; + setp.ge.s32 %p14, %r89, %r91; + @%p14 bra $Lt_1_21762; + sub.s32 %r93, %r3, 1; + cvt.s64.s32 %rd28, %r83; + mul.wide.s32 %rd29, %r83, 32; + mov.s32 %r94, -1; + setp.gt.s32 %p15, %r93, %r94; + ld.param.u64 %rd30, [__cudaparm_make_rho_atoms]; + mul.lo.u64 %rd31, %rd25, 32; + add.u64 %rd32, %rd30, %rd31; +$Lt_1_22274: + //<loop> Loop body line 211, nesting depth: 4, estimated iterations: unknown + .loc 17 212 0 + ld.global.f64 %fd4, [%rd32+0]; + @!%p15 bra $Lt_1_29954; + sub.s32 %r95, %r93, %r74; + mov.s32 %r96, -1; + sub.s32 %r97, %r96, %r74; + cvt.s64.s32 %rd33, %r2; + mul.wide.s32 %rd34, %r2, 8; + ld.global.f64 %fd5, [%rd32+8]; + ld.global.f64 %fd6, [%rd32+16]; + cvt.s64.s32 %rd35, %r95; + mul.wide.s32 %rd36, %r95, 8; + add.u64 %rd37, %rd1, %rd36; + sub.s32 %r98, %r93, %r71; + cvt.s64.s32 %rd38, %r98; + mul.wide.s32 %rd39, %r98, 8; + add.u64 %rd40, %rd1, %rd39; + mov.f64 %fd7, 0d0000000000000000; // 0 + mov.f64 %fd8, 0d0000000000000000; // 0 +$Lt_1_23042: + //<loop> Loop body line 212, nesting depth: 5, estimated iterations: unknown + .loc 17 217 0 + ld.shared.f64 %fd9, [%rd37+0]; + mad.rn.f64 %fd8, %fd8, %fd5, %fd9; + .loc 17 218 0 + ld.shared.f64 %fd10, [%rd40+0]; + mad.rn.f64 %fd7, %fd7, %fd6, %fd10; + sub.u64 %rd40, %rd40, %rd34; + sub.s32 %r95, %r95, %r2; + sub.u64 %rd37, %rd37, %rd34; + setp.gt.s32 %p16, %r95, %r97; + @%p16 bra $Lt_1_23042; + bra.uni $Lt_1_22530; +$Lt_1_29954: + mov.f64 %fd7, 0d0000000000000000; // 0 + mov.f64 %fd8, 0d0000000000000000; // 0 +$Lt_1_22530: + .loc 17 220 0 + ld.global.f64 %fd11, [%rd32+24]; + mul.f64 %fd12, %fd7, %fd8; + mul.f64 %fd13, %fd11, %fd12; + @!%p8 bra $Lt_1_23554; + mov.s32 %r99, %r2; + cvt.s64.s32 %rd41, %r4; + mul.wide.s32 %rd42, %r4, 8; + add.u64 %rd43, %rd20, %rd42; + mov.s32 %r100, 0; + mov.s32 %r101, %r99; +$Lt_1_24066: + //<loop> Loop body line 220, nesting depth: 5, estimated iterations: unknown + .loc 17 224 0 + add.s32 %r102, %r100, %r1; + mov.s32 %r103, %r102; + setp.lt.s32 %p17, %r102, %r100; + @%p17 bra $Lt_1_30466; + cvt.s64.s32 %rd44, %r2; + mul.wide.s32 %rd34, %r2, 8; + cvt.s64.s32 %rd45, %r102; + mul.wide.s32 %rd46, %r102, 8; + add.u64 %rd47, %rd1, %rd46; + mov.f64 %fd14, 0d0000000000000000; // 0 +$Lt_1_24834: + //<loop> Loop body line 224, nesting depth: 6, estimated iterations: unknown + .loc 17 225 0 + ld.shared.f64 %fd15, [%rd47+0]; + mad.rn.f64 %fd14, %fd4, %fd14, %fd15; + sub.s32 %r103, %r103, %r2; + sub.u64 %rd47, %rd47, %rd34; + setp.ge.s32 %p18, %r103, %r100; + @%p18 bra $Lt_1_24834; + bra.uni $Lt_1_24322; +$Lt_1_30466: + mov.f64 %fd14, 0d0000000000000000; // 0 +$Lt_1_24322: + .loc 17 226 0 + ld.shared.f64 %fd16, [%rd43+0]; + mad.rn.f64 %fd17, %fd14, %fd13, %fd16; + st.shared.f64 [%rd43+0], %fd17; + add.s32 %r100, %r100, 1; + add.u64 %rd43, %rd43, 512; + setp.ne.s32 %p19, %r100, %r2; + @%p19 bra $Lt_1_24066; +$Lt_1_23554: + add.s32 %r92, %r92, %r83; + add.u64 %rd32, %rd29, %rd32; + setp.gt.s32 %p20, %r91, %r92; + @%p20 bra 
$Lt_1_22274; +$Lt_1_21762: + add.s32 %r74, %r74, 1; + setp.ne.s32 %p21, %r55, %r74; + @%p21 bra $Lt_1_21506; +$Lt_1_20994: + add.s32 %r71, %r71, 1; + setp.ne.s32 %p22, %r47, %r71; + @%p22 bra $Lt_1_20738; +$Lt_1_20226: +$Lt_1_19714: + .loc 17 235 0 + bar.sync 0; + @!%p2 bra $Lt_1_26626; + .loc 17 237 0 + ld.shared.f64 %fd18, [%rd19+256]; + st.shared.f64 [%rd19+0], %fd18; + .loc 17 238 0 + mov.f64 %fd19, 0d0000000000000000; // 0 + st.shared.f64 [%rd19+256], %fd19; + bra.uni $Lt_1_26370; +$Lt_1_26626: + .loc 17 240 0 + mov.f64 %fd20, 0d0000000000000000; // 0 + st.shared.f64 [%rd19+0], %fd20; +$Lt_1_26370: + @!%p8 bra $Lt_1_26882; + mov.s32 %r104, %r2; + cvt.s64.s32 %rd48, %r4; + mov.s32 %r105, %r11; + add.s32 %r106, %r11, %r2; + mul.wide.s32 %rd49, %r4, 8; + add.u64 %rd50, %rd20, %rd49; + mov.s64 %rd51, %rd19; + mov.s32 %r107, %r104; +$Lt_1_27394: + //<loop> Loop body line 240, nesting depth: 2, estimated iterations: unknown + .loc 17 243 0 + ld.shared.f64 %fd21, [%rd50+0]; + ld.shared.f64 %fd22, [%rd51+0]; + add.f64 %fd23, %fd21, %fd22; + st.shared.f64 [%rd51+0], %fd23; + .loc 17 244 0 + bar.sync 0; + add.s32 %r105, %r105, 1; + add.u64 %rd51, %rd51, 8; + add.u64 %rd50, %rd50, 512; + setp.ne.s32 %p23, %r105, %r106; + @%p23 bra $Lt_1_27394; +$Lt_1_26882: + set.lt.u32.s32 %r108, %r66, %r17; + neg.s32 %r109, %r108; + and.b32 %r110, %r61, %r109; + mov.u32 %r111, 0; + setp.eq.s32 %p24, %r110, %r111; + @%p24 bra $Lt_1_27906; + .loc 17 248 0 + ld.shared.f64 %fd24, [%rd19+0]; + ld.param.u64 %rd52, [__cudaparm_make_rho_brick]; + add.s32 %r112, %r11, %r58; + mul.lo.s32 %r113, %r37, %r17; + mul.lo.s32 %r114, %r38, %r113; + mul.lo.s32 %r115, %r48, %r17; + add.s32 %r116, %r114, %r115; + add.s32 %r117, %r112, %r116; + cvt.s64.s32 %rd53, %r117; + mul.wide.s32 %rd54, %r117, 8; + add.u64 %rd55, %rd52, %rd54; + st.global.f64 [%rd55+0], %fd24; +$Lt_1_27906: + add.s32 %r58, %r58, 32; + setp.ne.s32 %p25, %r58, %r59; + @%p25 bra $Lt_1_18434; +$Lt_1_17922: + .loc 17 252 0 + exit; +$LDWend_make_rho: + } // make_rho + + .entry interp ( + .param .u64 __cudaparm_interp_x_, + .param .u64 __cudaparm_interp_q_, + .param .s32 __cudaparm_interp_nlocal, + .param .u64 __cudaparm_interp_brick, + .param .u64 __cudaparm_interp__rho_coeff, + .param .s32 __cudaparm_interp_npts_x, + .param .s32 __cudaparm_interp_npts_yx, + .param .f64 __cudaparm_interp_b_lo_x, + .param .f64 __cudaparm_interp_b_lo_y, + .param .f64 __cudaparm_interp_b_lo_z, + .param .f64 __cudaparm_interp_delxinv, + .param .f64 __cudaparm_interp_delyinv, + .param .f64 __cudaparm_interp_delzinv, + .param .s32 __cudaparm_interp_order, + .param .s32 __cudaparm_interp_order2, + .param .f64 __cudaparm_interp_qqrd2e_scale, + .param .u64 __cudaparm_interp_ans) + { + .reg .u32 %r<56>; + .reg .u64 %rd<37>; + .reg .f32 %f<19>; + .reg .f64 %fd<63>; + .reg .pred %p<14>; + .shared .align 8 .b8 __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568[512]; + .shared .align 8 .b8 __cuda___cuda_local_var_32630_34_non_const_rho1d_06080[4096]; + .shared .align 8 .b8 __cuda___cuda_local_var_32631_34_non_const_rho1d_110176[4096]; + // __cuda_local_var_32647_12_non_const_ek = 16 + .loc 17 262 0 +$LDWbegin_interp: + ld.param.s32 %r1, [__cudaparm_interp_order2]; + ld.param.s32 %r2, [__cudaparm_interp_order]; + add.s32 %r3, %r1, %r2; + cvt.s32.u32 %r4, %tid.x; + setp.le.s32 %p1, %r3, %r4; + @%p1 bra $Lt_2_8706; + .loc 17 269 0 + mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568; + cvt.s64.s32 %rd2, %r4; + mul.wide.s32 %rd3, %r4, 8; + ld.param.u64 %rd4, 
[__cudaparm_interp__rho_coeff]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f64 %fd1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f64 [%rd6+0], %fd1; +$Lt_2_8706: + mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568; + .loc 17 270 0 + bar.sync 0; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %ntid.x; + mul.lo.u32 %r7, %r5, %r6; + add.u32 %r8, %r4, %r7; + ld.param.s32 %r9, [__cudaparm_interp_nlocal]; + setp.le.s32 %p2, %r9, %r8; + @%p2 bra $Lt_2_9218; + .loc 17 278 0 + mov.u32 %r10, %r8; + mov.s32 %r11, 0; + mov.u32 %r12, %r11; + mov.s32 %r13, 0; + mov.u32 %r14, %r13; + mov.s32 %r15, 0; + mov.u32 %r16, %r15; + tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}]; + mov.f32 %f5, %f1; + mov.f32 %f6, %f2; + mov.f32 %f7, %f3; + .loc 17 279 0 + mov.u32 %r17, %r8; + mov.s32 %r18, 0; + mov.u32 %r19, %r18; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}]; + mov.f32 %f12, %f8; + cvt.ftz.f64.f32 %fd2, %f12; + ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale]; + mul.f64 %fd4, %fd2, %fd3; + mov.f64 %fd5, 0d0000000000000000; // 0 + setp.neu.f64 %p3, %fd4, %fd5; + @!%p3 bra $Lt_2_9986; + mov.s32 %r24, 0; + setp.gt.s32 %p4, %r2, %r24; + ld.param.f64 %fd6, [__cudaparm_interp_delxinv]; + cvt.ftz.f64.f32 %fd7, %f5; + ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x]; + sub.f64 %fd9, %fd7, %fd8; + mul.f64 %fd10, %fd6, %fd9; + @!%p4 bra $Lt_2_16386; + mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080; + mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176; + cvt.rzi.s32.f64 %r25, %fd10; + cvt.rn.f64.s32 %fd11, %r25; + mov.f64 %fd12, 0d3fe0000000000000; // 0.5 + add.f64 %fd13, %fd11, %fd12; + sub.f64 %fd14, %fd13, %fd10; + ld.param.f64 %fd15, [__cudaparm_interp_delyinv]; + cvt.ftz.f64.f32 %fd16, %f6; + ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y]; + sub.f64 %fd18, %fd16, %fd17; + mul.f64 %fd19, %fd15, %fd18; + cvt.rzi.s32.f64 %r26, %fd19; + cvt.rn.f64.s32 %fd20, %r26; + mov.f64 %fd21, 0d3fe0000000000000; // 0.5 + add.f64 %fd22, %fd20, %fd21; + sub.f64 %fd23, %fd22, %fd19; + mov.s32 %r27, %r2; + cvt.s64.s32 %rd9, %r4; + mov.s32 %r28, %r1; + mul.wide.s32 %rd3, %r4, 8; + add.u64 %rd10, %rd3, %rd7; + add.u64 %rd11, %rd3, %rd8; + mov.s32 %r29, 0; + mov.s32 %r30, %r27; +$Lt_2_10754: + //<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown + .loc 17 298 0 + mov.f64 %fd24, 0d0000000000000000; // 0 + mov.f64 %fd25, 0d0000000000000000; // 0 + st.shared.f64 [%rd10+0], %fd25; + .loc 17 299 0 + mov.f64 %fd26, 0d0000000000000000; // 0 + mov.f64 %fd27, 0d0000000000000000; // 0 + st.shared.f64 [%rd11+0], %fd27; + .loc 17 300 0 + mov.s32 %r31, %r28; + setp.lt.s32 %p5, %r28, %r29; + @%p5 bra $Lt_2_11010; + cvt.s64.s32 %rd12, %r2; + mul.wide.s32 %rd13, %r2, 8; + cvt.s64.s32 %rd14, %r28; + mul.wide.s32 %rd15, %r28, 8; + add.u64 %rd16, %rd1, %rd15; +$Lt_2_11522: + //<loop> Loop body line 300, nesting depth: 2, estimated iterations: unknown + .loc 17 301 0 + ld.shared.f64 %fd28, [%rd16+0]; + mad.rn.f64 %fd24, %fd24, %fd14, %fd28; + st.shared.f64 [%rd10+0], %fd24; + .loc 17 302 0 + mad.rn.f64 %fd26, %fd26, %fd23, %fd28; + st.shared.f64 [%rd11+0], %fd26; + sub.s32 %r31, %r31, %r2; + sub.u64 %rd16, %rd16, %rd13; + setp.ge.s32 %p6, %r31, %r29; + @%p6 bra $Lt_2_11522; +$Lt_2_11010: + add.s32 %r29, %r29, 1; + add.s32 %r28, %r28, 1; + add.u64 %rd11, %rd11, 512; + add.u64 %rd10, %rd10, 512; + setp.ne.s32 %p7, %r28, %r3; + @%p7 bra $Lt_2_10754; + 
bra.uni $Lt_2_10242; +$Lt_2_16386: + cvt.rzi.s32.f64 %r25, %fd10; + mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176; + mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080; +$Lt_2_10242: + .loc 17 306 0 + ld.param.f64 %fd29, [__cudaparm_interp_delzinv]; + cvt.ftz.f64.f32 %fd30, %f7; + ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z]; + sub.f64 %fd32, %fd30, %fd31; + mul.f64 %fd33, %fd29, %fd32; + cvt.rzi.s32.f64 %r32, %fd33; + ld.param.s32 %r33, [__cudaparm_interp_npts_yx]; + mul.lo.s32 %r34, %r32, %r33; + add.s32 %r35, %r25, %r34; + @!%p4 bra $Lt_2_16898; + cvt.rn.f64.s32 %fd34, %r32; + mov.f64 %fd35, 0d3fe0000000000000; // 0.5 + add.f64 %fd36, %fd34, %fd35; + sub.f64 %fd37, %fd36, %fd33; + mov.s32 %r36, %r2; + cvt.ftz.f64.f32 %fd38, %f6; + cvt.s64.s32 %rd17, %r4; + ld.param.f64 %fd39, [__cudaparm_interp_delyinv]; + ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y]; + sub.f64 %fd41, %fd38, %fd40; + mul.f64 %fd42, %fd39, %fd41; + cvt.rzi.s32.f64 %r37, %fd42; + mul.wide.s32 %rd3, %r4, 8; + ld.param.s32 %r38, [__cudaparm_interp_npts_x]; + mul.lo.s32 %r39, %r37, %r38; + add.u64 %rd18, %rd3, %rd7; + add.u64 %rd19, %rd3, %rd8; + cvt.s64.s32 %rd20, %r38; + mul.wide.s32 %rd21, %r38, 32; + add.s32 %r40, %r39, %r35; + mov.s32 %r41, %r40; + ld.param.u64 %rd22, [__cudaparm_interp_brick]; + mov.s32 %r42, 0; + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, 0f00000000; // 0 + mov.s32 %r43, %r36; +$Lt_2_12802: + //<loop> Loop body line 306, nesting depth: 1, estimated iterations: unknown + .loc 17 309 0 + add.s32 %r44, %r42, %r1; + mov.s32 %r45, %r44; + setp.lt.s32 %p8, %r44, %r42; + @%p8 bra $Lt_2_17154; + cvt.s64.s32 %rd23, %r2; + mul.wide.s32 %rd13, %r2, 8; + cvt.s64.s32 %rd24, %r44; + mul.wide.s32 %rd25, %r44, 8; + add.u64 %rd26, %rd1, %rd25; + mov.f64 %fd43, 0d0000000000000000; // 0 +$Lt_2_13570: + //<loop> Loop body line 309, nesting depth: 2, estimated iterations: unknown + .loc 17 310 0 + ld.shared.f64 %fd44, [%rd26+0]; + mad.rn.f64 %fd43, %fd37, %fd43, %fd44; + sub.s32 %r45, %r45, %r2; + sub.u64 %rd26, %rd26, %rd13; + setp.ge.s32 %p9, %r45, %r42; + @%p9 bra $Lt_2_13570; + bra.uni $Lt_2_13058; +$Lt_2_17154: + mov.f64 %fd43, 0d0000000000000000; // 0 +$Lt_2_13058: + .loc 17 312 0 + mov.s32 %r46, %r41; + mov.s32 %r47, %r2; + mov.s32 %r48, %r46; + mul.f64 %fd45, %fd4, %fd43; + mov.s64 %rd27, %rd19; + cvt.s64.s32 %rd28, %r46; + mul.wide.s32 %rd29, %r46, 32; + mov.s32 %r49, 0; + mov.s32 %r50, %r47; +$Lt_2_14594: + //<loop> Loop body line 312, nesting depth: 2, estimated iterations: unknown + mov.s32 %r51, %r2; + mov.s32 %r52, %r48; + add.s32 %r53, %r48, %r2; + mov.s64 %rd30, %rd18; + ld.shared.f64 %fd46, [%rd27+0]; + add.u64 %rd31, %rd29, %rd22; + mul.f64 %fd47, %fd45, %fd46; + mov.s32 %r54, %r51; +$Lt_2_15362: + //<loop> Loop body line 312, nesting depth: 3, estimated iterations: unknown + .loc 17 316 0 + ld.shared.f64 %fd48, [%rd30+0]; + mul.f64 %fd49, %fd48, %fd47; + .loc 17 318 0 + cvt.ftz.f64.f32 %fd50, %f15; + ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0]; + mul.f64 %fd53, %fd49, %fd51; + sub.f64 %fd54, %fd50, %fd53; + cvt.rn.ftz.f32.f64 %f15, %fd54; + .loc 17 319 0 + cvt.ftz.f64.f32 %fd55, %f14; + mul.f64 %fd56, %fd49, %fd52; + sub.f64 %fd57, %fd55, %fd56; + cvt.rn.ftz.f32.f64 %f14, %fd57; + .loc 17 320 0 + cvt.ftz.f64.f32 %fd58, %f13; + ld.global.f64 %fd59, [%rd31+16]; + mul.f64 %fd60, %fd49, %fd59; + sub.f64 %fd61, %fd58, %fd60; + cvt.rn.ftz.f32.f64 %f13, %fd61; + add.s32 %r52, %r52, 1; + add.u64 %rd31, %rd31, 32; + add.u64 %rd30, 
%rd30, 512; + setp.ne.s32 %p10, %r52, %r53; + @%p10 bra $Lt_2_15362; + add.s32 %r49, %r49, 1; + add.s32 %r48, %r48, %r38; + add.u64 %rd29, %rd29, %rd21; + add.u64 %rd27, %rd27, 512; + setp.ne.s32 %p11, %r49, %r2; + @%p11 bra $Lt_2_14594; + add.s32 %r42, %r42, 1; + add.s32 %r41, %r46, %r33; + setp.ne.s32 %p12, %r42, %r2; + @%p12 bra $Lt_2_12802; + bra.uni $Lt_2_9730; +$Lt_2_16898: + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, 0f00000000; // 0 + bra.uni $Lt_2_9730; +$Lt_2_9986: + mov.f32 %f13, 0f00000000; // 0 + mov.f32 %f14, 0f00000000; // 0 + mov.f32 %f15, 0f00000000; // 0 +$Lt_2_9730: + .loc 17 327 0 + ld.param.u64 %rd32, [__cudaparm_interp_ans]; + cvt.s64.s32 %rd33, %r8; + mul.wide.s32 %rd34, %r8, 16; + add.u64 %rd35, %rd32, %rd34; + mov.f32 %f16, %f17; + st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16}; +$Lt_2_9218: + .loc 17 329 0 + exit; +$LDWend_interp: + } // interp + diff --git a/lib/gpu/pppm_d_gpu_ptx.h b/lib/gpu/pppm_d_gpu_ptx.h new file mode 100644 index 000000000..6fac17690 --- /dev/null +++ b/lib/gpu/pppm_d_gpu_ptx.h @@ -0,0 +1,837 @@ +const char * pppm_d_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry particle_map (\n" +" .param .u64 __cudaparm_particle_map_x_,\n" +" .param .u64 __cudaparm_particle_map_q_,\n" +" .param .f64 __cudaparm_particle_map_delvolinv,\n" +" .param .s32 __cudaparm_particle_map_nlocal,\n" +" .param .u64 __cudaparm_particle_map_counts,\n" +" .param .u64 __cudaparm_particle_map_ans,\n" +" .param .f64 __cudaparm_particle_map_b_lo_x,\n" +" .param .f64 __cudaparm_particle_map_b_lo_y,\n" +" .param .f64 __cudaparm_particle_map_b_lo_z,\n" +" .param .f64 __cudaparm_particle_map_delxinv,\n" +" .param .f64 __cudaparm_particle_map_delyinv,\n" +" .param .f64 __cudaparm_particle_map_delzinv,\n" +" .param .s32 __cudaparm_particle_map_nlocal_x,\n" +" .param .s32 __cudaparm_particle_map_nlocal_y,\n" +" .param .s32 __cudaparm_particle_map_nlocal_z,\n" +" .param .s32 __cudaparm_particle_map_atom_stride,\n" +" .param .s32 __cudaparm_particle_map_max_atoms,\n" +" .param .u64 __cudaparm_particle_map_error)\n" +" {\n" +" .reg .u32 %r<50>;\n" +" .reg .u64 %rd<12>;\n" +" .reg .f32 %f<14>;\n" +" .reg .f64 %fd<36>;\n" +" .reg .pred %p<11>;\n" +" .loc 17 113 0\n" +"$LDWbegin_particle_map:\n" +" mov.u32 %r1, %ntid.x;\n" +" mov.u32 %r2, %ctaid.x;\n" +" mul.lo.u32 %r3, %r2, %r1;\n" +" mov.u32 %r4, %nctaid.x;\n" +" mul.lo.u32 %r5, %r4, %r1;\n" +" mov.u32 %r6, %tid.x;\n" +" add.u32 %r7, %r6, %r3;\n" +" sub.s32 %r8, %r5, 1;\n" +" mul.lo.s32 %r9, %r7, 64;\n" +" div.s32 %r10, %r9, %r5;\n" +" mul.lo.s32 %r11, %r8, %r10;\n" +" sub.s32 %r12, %r9, %r11;\n" +" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n" +" setp.le.s32 %p1, %r13, %r12;\n" +" @%p1 bra $Lt_0_7426;\n" +" .loc 17 125 0\n" +" mov.u32 %r14, %r12;\n" +" mov.s32 %r15, 0;\n" +" mov.u32 %r16, %r15;\n" +" mov.s32 %r17, 0;\n" +" mov.u32 %r18, %r17;\n" +" mov.s32 %r19, 0;\n" +" mov.u32 %r20, %r19;\n" +" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n" +" mov.f32 %f5, %f1;\n" +" mov.f32 %f6, %f2;\n" +" mov.f32 %f7, %f3;\n" +" .loc 17 127 0\n" +" mov.u32 %r21, %r12;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n" +" mov.f32 %f12, %f8;\n" +" cvt.ftz.f64.f32 %fd1, %f12;\n" +" ld.param.f64 %fd2, 
[__cudaparm_particle_map_delvolinv];\n" +" mul.f64 %fd3, %fd1, %fd2;\n" +" mov.f64 %fd4, 0d0000000000000000; \n" +" setp.neu.f64 %p2, %fd3, %fd4;\n" +" @!%p2 bra $Lt_0_7426;\n" +" .loc 17 130 0\n" +" ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];\n" +" cvt.ftz.f64.f32 %fd6, %f5;\n" +" ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];\n" +" sub.f64 %fd8, %fd6, %fd7;\n" +" mul.f64 %fd9, %fd5, %fd8;\n" +" mov.f64 %fd10, 0d0000000000000000; \n" +" setp.lt.f64 %p3, %fd9, %fd10;\n" +" @%p3 bra $Lt_0_8706;\n" +" ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];\n" +" cvt.ftz.f64.f32 %fd12, %f6;\n" +" ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];\n" +" sub.f64 %fd14, %fd12, %fd13;\n" +" mul.f64 %fd15, %fd11, %fd14;\n" +" mov.f64 %fd16, 0d0000000000000000; \n" +" setp.lt.f64 %p4, %fd15, %fd16;\n" +" @%p4 bra $Lt_0_8706;\n" +" ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];\n" +" cvt.ftz.f64.f32 %fd18, %f7;\n" +" ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];\n" +" sub.f64 %fd20, %fd18, %fd19;\n" +" mul.f64 %fd21, %fd17, %fd20;\n" +" mov.f64 %fd22, 0d0000000000000000; \n" +" setp.lt.f64 %p5, %fd21, %fd22;\n" +" @%p5 bra $Lt_0_8706;\n" +" cvt.rzi.s32.f64 %r28, %fd9;\n" +" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n" +" setp.ge.s32 %p6, %r28, %r29;\n" +" @%p6 bra $Lt_0_8706;\n" +" cvt.rzi.s32.f64 %r30, %fd15;\n" +" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n" +" setp.ge.s32 %p7, %r30, %r31;\n" +" @%p7 bra $Lt_0_8706;\n" +" cvt.rzi.s32.f64 %r32, %fd21;\n" +" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n" +" setp.gt.s32 %p8, %r33, %r32;\n" +" @%p8 bra $L_0_4866;\n" +"$Lt_0_8706:\n" +"$L_0_5122:\n" +" .loc 17 139 0\n" +" mov.s32 %r34, 1;\n" +" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n" +" st.global.s32 [%rd1+0], %r34;\n" +" bra.uni $Lt_0_7426;\n" +"$L_0_4866:\n" +" .loc 17 146 0\n" +" mul.lo.s32 %r35, %r32, %r31;\n" +" add.s32 %r36, %r30, %r35;\n" +" mul.lo.s32 %r37, %r36, %r29;\n" +" add.s32 %r38, %r28, %r37;\n" +" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n" +" cvt.s64.s32 %rd3, %r38;\n" +" mul.wide.s32 %rd4, %r38, 4;\n" +" add.u64 %rd5, %rd2, %rd4;\n" +" mov.s32 %r39, 1;\n" +" atom.global.add.s32 %r40, [%rd5], %r39;\n" +" mov.s32 %r41, %r40;\n" +" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n" +" setp.gt.s32 %p9, %r42, %r41;\n" +" @%p9 bra $Lt_0_7682;\n" +" .loc 17 148 0\n" +" mov.s32 %r43, 2;\n" +" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n" +" st.global.s32 [%rd6+0], %r43;\n" +" .loc 16 118 0\n" +" mov.s32 %r44, -1;\n" +" atom.global.add.s32 %r45, [%rd5], %r44;\n" +" bra.uni $Lt_0_7426;\n" +"$Lt_0_7682:\n" +" .loc 17 151 0\n" +" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n" +" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n" +" mul.lo.s32 %r47, %r46, %r41;\n" +" add.s32 %r48, %r38, %r47;\n" +" cvt.s64.s32 %rd8, %r48;\n" +" mul.wide.s32 %rd9, %r48, 32;\n" +" add.u64 %rd10, %rd7, %rd9;\n" +" cvt.rn.f64.s32 %fd23, %r28;\n" +" mov.f64 %fd24, 0d3fe0000000000000; \n" +" add.f64 %fd25, %fd23, %fd24;\n" +" sub.f64 %fd26, %fd25, %fd9;\n" +" cvt.rn.f64.s32 %fd27, %r30;\n" +" mov.f64 %fd28, 0d3fe0000000000000; \n" +" add.f64 %fd29, %fd27, %fd28;\n" +" sub.f64 %fd30, %fd29, %fd15;\n" +" st.global.v2.f64 [%rd10+0], {%fd26,%fd30};\n" +" cvt.rn.f64.s32 %fd31, %r32;\n" +" mov.f64 %fd32, 0d3fe0000000000000; \n" +" add.f64 %fd33, %fd31, %fd32;\n" +" sub.f64 %fd34, %fd33, %fd21;\n" +" st.global.v2.f64 [%rd10+16], {%fd34,%fd3};\n" +"$Lt_0_7426:\n" +"$L_0_4610:\n" +"$Lt_0_6914:\n" 
+"$Lt_0_6402:\n" +" .loc 17 155 0\n" +" exit;\n" +"$LDWend_particle_map:\n" +" }\n" +" .entry make_rho (\n" +" .param .u64 __cudaparm_make_rho_counts,\n" +" .param .u64 __cudaparm_make_rho_atoms,\n" +" .param .u64 __cudaparm_make_rho_brick,\n" +" .param .u64 __cudaparm_make_rho__rho_coeff,\n" +" .param .s32 __cudaparm_make_rho_atom_stride,\n" +" .param .s32 __cudaparm_make_rho_npts_x,\n" +" .param .s32 __cudaparm_make_rho_npts_y,\n" +" .param .s32 __cudaparm_make_rho_npts_z,\n" +" .param .s32 __cudaparm_make_rho_nlocal_x,\n" +" .param .s32 __cudaparm_make_rho_nlocal_y,\n" +" .param .s32 __cudaparm_make_rho_nlocal_z,\n" +" .param .s32 __cudaparm_make_rho_order_m_1,\n" +" .param .s32 __cudaparm_make_rho_order,\n" +" .param .s32 __cudaparm_make_rho_order2)\n" +" {\n" +" .reg .u32 %r<119>;\n" +" .reg .u64 %rd<57>;\n" +" .reg .f64 %fd<26>;\n" +" .reg .pred %p<27>;\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32531_34_non_const_rho_coeff200[512];\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32532_34_non_const_front712[640];\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32533_34_non_const_ans1352[4096];\n" +" .loc 17 164 0\n" +"$LDWbegin_make_rho:\n" +" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n" +" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n" +" add.s32 %r3, %r1, %r2;\n" +" cvt.s32.u32 %r4, %tid.x;\n" +" setp.le.s32 %p1, %r3, %r4;\n" +" @%p1 bra $Lt_1_16898;\n" +" .loc 17 171 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200;\n" +" cvt.s64.s32 %rd2, %r4;\n" +" mul.wide.s32 %rd3, %r4, 8;\n" +" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f64 %fd1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f64 [%rd6+0], %fd1;\n" +"$Lt_1_16898:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200;\n" +" shr.s32 %r5, %r4, 31;\n" +" mov.s32 %r6, 31;\n" +" and.b32 %r7, %r5, %r6;\n" +" add.s32 %r8, %r7, %r4;\n" +" shr.s32 %r9, %r8, 5;\n" +" mul.lo.s32 %r10, %r9, 32;\n" +" sub.s32 %r11, %r4, %r10;\n" +" setp.lt.s32 %p2, %r11, %r2;\n" +" @!%p2 bra $Lt_1_17410;\n" +" .loc 17 177 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712;\n" +" mov.f64 %fd2, 0d0000000000000000; \n" +" cvt.s64.s32 %rd8, %r11;\n" +" shr.s32 %r12, %r4, 31;\n" +" mov.s32 %r13, 31;\n" +" and.b32 %r14, %r12, %r13;\n" +" add.s32 %r15, %r14, %r4;\n" +" shr.s32 %r16, %r15, 5;\n" +" cvt.s64.s32 %rd9, %r16;\n" +" mul.wide.s32 %rd10, %r16, 40;\n" +" add.u64 %rd11, %rd8, %rd10;\n" +" mul.lo.u64 %rd12, %rd11, 8;\n" +" add.u64 %rd13, %rd7, %rd12;\n" +" st.shared.f64 [%rd13+256], %fd2;\n" +"$Lt_1_17410:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712;\n" +" .loc 17 179 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n" +" shr.s32 %r18, %r17, 31;\n" +" mov.s32 %r19, 31;\n" +" and.b32 %r20, %r18, %r19;\n" +" add.s32 %r21, %r20, %r17;\n" +" shr.s32 %r22, %r21, 5;\n" +" add.s32 %r23, %r22, 1;\n" +" mov.u32 %r24, 0;\n" +" setp.le.s32 %p3, %r23, %r24;\n" +" @%p3 bra $Lt_1_17922;\n" +" shr.s32 %r25, %r4, 31;\n" +" mov.s32 %r26, 31;\n" +" and.b32 %r27, %r25, %r26;\n" +" add.s32 %r28, %r27, %r4;\n" +" shr.s32 %r29, %r28, 5;\n" +" add.s32 %r30, %r11, 32;\n" +" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n" +" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n" +" mul.lo.s32 %r33, %r31, %r32;\n" +" mov.u32 %r34, %ctaid.x;\n" +" mul.lo.u32 %r35, %r34, 2;\n" +" add.u32 %r36, %r29, %r35;\n" +" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n" 
+" div.s32 %r38, %r36, %r37;\n" +" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n" +" setp.lt.s32 %p4, %r38, %r39;\n" +" sub.s32 %r40, %r39, %r38;\n" +" mov.s32 %r41, 0;\n" +" selp.s32 %r42, %r40, %r41, %p4;\n" +" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n" +" setp.ge.s32 %p5, %r38, %r43;\n" +" sub.s32 %r44, %r43, %r38;\n" +" add.s32 %r45, %r44, %r2;\n" +" sub.s32 %r46, %r45, 1;\n" +" selp.s32 %r47, %r46, %r2, %p5;\n" +" rem.s32 %r48, %r36, %r37;\n" +" setp.lt.s32 %p6, %r48, %r39;\n" +" sub.s32 %r49, %r39, %r48;\n" +" mov.s32 %r50, 0;\n" +" selp.s32 %r51, %r49, %r50, %p6;\n" +" setp.ge.s32 %p7, %r48, %r31;\n" +" sub.s32 %r52, %r31, %r48;\n" +" add.s32 %r53, %r52, %r2;\n" +" sub.s32 %r54, %r53, 1;\n" +" selp.s32 %r55, %r54, %r2, %p7;\n" +" mov.s32 %r56, %r23;\n" +" mov.s32 %r57, 0;\n" +" setp.gt.s32 %p8, %r2, %r57;\n" +" mov.s32 %r58, 0;\n" +" cvt.s64.s32 %rd14, %r11;\n" +" cvt.s64.s32 %rd15, %r29;\n" +" mul.lo.s32 %r59, %r23, 32;\n" +" mul.wide.s32 %rd16, %r29, 40;\n" +" add.u64 %rd17, %rd14, %rd16;\n" +" ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];\n" +" setp.gt.s32 %p9, %r60, %r38;\n" +" mul.lo.u64 %rd18, %rd17, 8;\n" +" selp.s32 %r61, 1, 0, %p9;\n" +" add.u64 %rd19, %rd18, %rd7;\n" +" mov.u64 %rd20, __cuda___cuda_local_var_32533_34_non_const_ans1352;\n" +" mov.s32 %r62, %r56;\n" +"$Lt_1_18434:\n" +" @!%p8 bra $Lt_1_18690;\n" +" mov.s32 %r63, %r2;\n" +" cvt.s64.s32 %rd21, %r4;\n" +" mul.wide.s32 %rd22, %r4, 8;\n" +" add.u64 %rd23, %rd20, %rd22;\n" +" mov.s32 %r64, 0;\n" +" mov.s32 %r65, %r63;\n" +"$Lt_1_19202:\n" +" .loc 17 203 0\n" +" mov.f64 %fd3, 0d0000000000000000; \n" +" st.shared.f64 [%rd23+0], %fd3;\n" +" add.s32 %r64, %r64, 1;\n" +" add.u64 %rd23, %rd23, 512;\n" +" setp.ne.s32 %p10, %r64, %r2;\n" +" @%p10 bra $Lt_1_19202;\n" +"$Lt_1_18690:\n" +" add.s32 %r66, %r11, %r58;\n" +" set.lt.u32.s32 %r67, %r66, %r32;\n" +" neg.s32 %r68, %r67;\n" +" and.b32 %r69, %r61, %r68;\n" +" mov.u32 %r70, 0;\n" +" setp.eq.s32 %p11, %r69, %r70;\n" +" @%p11 bra $Lt_1_20226;\n" +" .loc 17 206 0\n" +" mov.s32 %r71, %r42;\n" +" setp.ge.s32 %p12, %r42, %r47;\n" +" @%p12 bra $Lt_1_20226;\n" +" sub.s32 %r72, %r47, %r42;\n" +" setp.lt.s32 %p13, %r51, %r55;\n" +" mov.s32 %r73, %r72;\n" +"$Lt_1_20738:\n" +" .loc 17 208 0\n" +" mov.s32 %r74, %r51;\n" +" @!%p13 bra $Lt_1_20994;\n" +" sub.s32 %r75, %r55, %r51;\n" +" sub.s32 %r76, %r71, %r42;\n" +" add.s32 %r77, %r38, %r42;\n" +" add.s32 %r78, %r48, %r51;\n" +" sub.s32 %r79, %r77, %r39;\n" +" sub.s32 %r80, %r78, %r39;\n" +" add.s32 %r81, %r76, %r79;\n" +" mul.lo.s32 %r82, %r33, %r81;\n" +" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n" +" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n" +" mov.s32 %r84, %r75;\n" +"$Lt_1_21506:\n" +" .loc 17 210 0\n" +" sub.s32 %r85, %r74, %r51;\n" +" add.s32 %r86, %r85, %r80;\n" +" mul.lo.s32 %r87, %r86, %r32;\n" +" add.s32 %r88, %r82, %r87;\n" +" add.s32 %r89, %r66, %r88;\n" +" cvt.s64.s32 %rd25, %r89;\n" +" mul.wide.s32 %rd26, %r89, 4;\n" +" add.u64 %rd27, %rd24, %rd26;\n" +" ld.global.s32 %r90, [%rd27+0];\n" +" mul.lo.s32 %r91, %r90, %r83;\n" +" .loc 17 211 0\n" +" mov.s32 %r92, %r89;\n" +" setp.ge.s32 %p14, %r89, %r91;\n" +" @%p14 bra $Lt_1_21762;\n" +" sub.s32 %r93, %r3, 1;\n" +" cvt.s64.s32 %rd28, %r83;\n" +" mul.wide.s32 %rd29, %r83, 32;\n" +" mov.s32 %r94, -1;\n" +" setp.gt.s32 %p15, %r93, %r94;\n" +" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n" +" mul.lo.u64 %rd31, %rd25, 32;\n" +" add.u64 %rd32, %rd30, %rd31;\n" +"$Lt_1_22274:\n" +" .loc 17 212 0\n" +" ld.global.f64 %fd4, 
[%rd32+0];\n" +" @!%p15 bra $Lt_1_29954;\n" +" sub.s32 %r95, %r93, %r74;\n" +" mov.s32 %r96, -1;\n" +" sub.s32 %r97, %r96, %r74;\n" +" cvt.s64.s32 %rd33, %r2;\n" +" mul.wide.s32 %rd34, %r2, 8;\n" +" ld.global.f64 %fd5, [%rd32+8];\n" +" ld.global.f64 %fd6, [%rd32+16];\n" +" cvt.s64.s32 %rd35, %r95;\n" +" mul.wide.s32 %rd36, %r95, 8;\n" +" add.u64 %rd37, %rd1, %rd36;\n" +" sub.s32 %r98, %r93, %r71;\n" +" cvt.s64.s32 %rd38, %r98;\n" +" mul.wide.s32 %rd39, %r98, 8;\n" +" add.u64 %rd40, %rd1, %rd39;\n" +" mov.f64 %fd7, 0d0000000000000000; \n" +" mov.f64 %fd8, 0d0000000000000000; \n" +"$Lt_1_23042:\n" +" .loc 17 217 0\n" +" ld.shared.f64 %fd9, [%rd37+0];\n" +" mad.rn.f64 %fd8, %fd8, %fd5, %fd9;\n" +" .loc 17 218 0\n" +" ld.shared.f64 %fd10, [%rd40+0];\n" +" mad.rn.f64 %fd7, %fd7, %fd6, %fd10;\n" +" sub.u64 %rd40, %rd40, %rd34;\n" +" sub.s32 %r95, %r95, %r2;\n" +" sub.u64 %rd37, %rd37, %rd34;\n" +" setp.gt.s32 %p16, %r95, %r97;\n" +" @%p16 bra $Lt_1_23042;\n" +" bra.uni $Lt_1_22530;\n" +"$Lt_1_29954:\n" +" mov.f64 %fd7, 0d0000000000000000; \n" +" mov.f64 %fd8, 0d0000000000000000; \n" +"$Lt_1_22530:\n" +" .loc 17 220 0\n" +" ld.global.f64 %fd11, [%rd32+24];\n" +" mul.f64 %fd12, %fd7, %fd8;\n" +" mul.f64 %fd13, %fd11, %fd12;\n" +" @!%p8 bra $Lt_1_23554;\n" +" mov.s32 %r99, %r2;\n" +" cvt.s64.s32 %rd41, %r4;\n" +" mul.wide.s32 %rd42, %r4, 8;\n" +" add.u64 %rd43, %rd20, %rd42;\n" +" mov.s32 %r100, 0;\n" +" mov.s32 %r101, %r99;\n" +"$Lt_1_24066:\n" +" .loc 17 224 0\n" +" add.s32 %r102, %r100, %r1;\n" +" mov.s32 %r103, %r102;\n" +" setp.lt.s32 %p17, %r102, %r100;\n" +" @%p17 bra $Lt_1_30466;\n" +" cvt.s64.s32 %rd44, %r2;\n" +" mul.wide.s32 %rd34, %r2, 8;\n" +" cvt.s64.s32 %rd45, %r102;\n" +" mul.wide.s32 %rd46, %r102, 8;\n" +" add.u64 %rd47, %rd1, %rd46;\n" +" mov.f64 %fd14, 0d0000000000000000; \n" +"$Lt_1_24834:\n" +" .loc 17 225 0\n" +" ld.shared.f64 %fd15, [%rd47+0];\n" +" mad.rn.f64 %fd14, %fd4, %fd14, %fd15;\n" +" sub.s32 %r103, %r103, %r2;\n" +" sub.u64 %rd47, %rd47, %rd34;\n" +" setp.ge.s32 %p18, %r103, %r100;\n" +" @%p18 bra $Lt_1_24834;\n" +" bra.uni $Lt_1_24322;\n" +"$Lt_1_30466:\n" +" mov.f64 %fd14, 0d0000000000000000; \n" +"$Lt_1_24322:\n" +" .loc 17 226 0\n" +" ld.shared.f64 %fd16, [%rd43+0];\n" +" mad.rn.f64 %fd17, %fd14, %fd13, %fd16;\n" +" st.shared.f64 [%rd43+0], %fd17;\n" +" add.s32 %r100, %r100, 1;\n" +" add.u64 %rd43, %rd43, 512;\n" +" setp.ne.s32 %p19, %r100, %r2;\n" +" @%p19 bra $Lt_1_24066;\n" +"$Lt_1_23554:\n" +" add.s32 %r92, %r92, %r83;\n" +" add.u64 %rd32, %rd29, %rd32;\n" +" setp.gt.s32 %p20, %r91, %r92;\n" +" @%p20 bra $Lt_1_22274;\n" +"$Lt_1_21762:\n" +" add.s32 %r74, %r74, 1;\n" +" setp.ne.s32 %p21, %r55, %r74;\n" +" @%p21 bra $Lt_1_21506;\n" +"$Lt_1_20994:\n" +" add.s32 %r71, %r71, 1;\n" +" setp.ne.s32 %p22, %r47, %r71;\n" +" @%p22 bra $Lt_1_20738;\n" +"$Lt_1_20226:\n" +"$Lt_1_19714:\n" +" .loc 17 235 0\n" +" bar.sync 0;\n" +" @!%p2 bra $Lt_1_26626;\n" +" .loc 17 237 0\n" +" ld.shared.f64 %fd18, [%rd19+256];\n" +" st.shared.f64 [%rd19+0], %fd18;\n" +" .loc 17 238 0\n" +" mov.f64 %fd19, 0d0000000000000000; \n" +" st.shared.f64 [%rd19+256], %fd19;\n" +" bra.uni $Lt_1_26370;\n" +"$Lt_1_26626:\n" +" .loc 17 240 0\n" +" mov.f64 %fd20, 0d0000000000000000; \n" +" st.shared.f64 [%rd19+0], %fd20;\n" +"$Lt_1_26370:\n" +" @!%p8 bra $Lt_1_26882;\n" +" mov.s32 %r104, %r2;\n" +" cvt.s64.s32 %rd48, %r4;\n" +" mov.s32 %r105, %r11;\n" +" add.s32 %r106, %r11, %r2;\n" +" mul.wide.s32 %rd49, %r4, 8;\n" +" add.u64 %rd50, %rd20, %rd49;\n" +" mov.s64 %rd51, %rd19;\n" +" mov.s32 %r107, 
%r104;\n" +"$Lt_1_27394:\n" +" .loc 17 243 0\n" +" ld.shared.f64 %fd21, [%rd50+0];\n" +" ld.shared.f64 %fd22, [%rd51+0];\n" +" add.f64 %fd23, %fd21, %fd22;\n" +" st.shared.f64 [%rd51+0], %fd23;\n" +" .loc 17 244 0\n" +" bar.sync 0;\n" +" add.s32 %r105, %r105, 1;\n" +" add.u64 %rd51, %rd51, 8;\n" +" add.u64 %rd50, %rd50, 512;\n" +" setp.ne.s32 %p23, %r105, %r106;\n" +" @%p23 bra $Lt_1_27394;\n" +"$Lt_1_26882:\n" +" set.lt.u32.s32 %r108, %r66, %r17;\n" +" neg.s32 %r109, %r108;\n" +" and.b32 %r110, %r61, %r109;\n" +" mov.u32 %r111, 0;\n" +" setp.eq.s32 %p24, %r110, %r111;\n" +" @%p24 bra $Lt_1_27906;\n" +" .loc 17 248 0\n" +" ld.shared.f64 %fd24, [%rd19+0];\n" +" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n" +" add.s32 %r112, %r11, %r58;\n" +" mul.lo.s32 %r113, %r37, %r17;\n" +" mul.lo.s32 %r114, %r38, %r113;\n" +" mul.lo.s32 %r115, %r48, %r17;\n" +" add.s32 %r116, %r114, %r115;\n" +" add.s32 %r117, %r112, %r116;\n" +" cvt.s64.s32 %rd53, %r117;\n" +" mul.wide.s32 %rd54, %r117, 8;\n" +" add.u64 %rd55, %rd52, %rd54;\n" +" st.global.f64 [%rd55+0], %fd24;\n" +"$Lt_1_27906:\n" +" add.s32 %r58, %r58, 32;\n" +" setp.ne.s32 %p25, %r58, %r59;\n" +" @%p25 bra $Lt_1_18434;\n" +"$Lt_1_17922:\n" +" .loc 17 252 0\n" +" exit;\n" +"$LDWend_make_rho:\n" +" }\n" +" .entry interp (\n" +" .param .u64 __cudaparm_interp_x_,\n" +" .param .u64 __cudaparm_interp_q_,\n" +" .param .s32 __cudaparm_interp_nlocal,\n" +" .param .u64 __cudaparm_interp_brick,\n" +" .param .u64 __cudaparm_interp__rho_coeff,\n" +" .param .s32 __cudaparm_interp_npts_x,\n" +" .param .s32 __cudaparm_interp_npts_yx,\n" +" .param .f64 __cudaparm_interp_b_lo_x,\n" +" .param .f64 __cudaparm_interp_b_lo_y,\n" +" .param .f64 __cudaparm_interp_b_lo_z,\n" +" .param .f64 __cudaparm_interp_delxinv,\n" +" .param .f64 __cudaparm_interp_delyinv,\n" +" .param .f64 __cudaparm_interp_delzinv,\n" +" .param .s32 __cudaparm_interp_order,\n" +" .param .s32 __cudaparm_interp_order2,\n" +" .param .f64 __cudaparm_interp_qqrd2e_scale,\n" +" .param .u64 __cudaparm_interp_ans)\n" +" {\n" +" .reg .u32 %r<56>;\n" +" .reg .u64 %rd<37>;\n" +" .reg .f32 %f<19>;\n" +" .reg .f64 %fd<63>;\n" +" .reg .pred %p<14>;\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568[512];\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32630_34_non_const_rho1d_06080[4096];\n" +" .shared .align 8 .b8 __cuda___cuda_local_var_32631_34_non_const_rho1d_110176[4096];\n" +" .loc 17 262 0\n" +"$LDWbegin_interp:\n" +" ld.param.s32 %r1, [__cudaparm_interp_order2];\n" +" ld.param.s32 %r2, [__cudaparm_interp_order];\n" +" add.s32 %r3, %r1, %r2;\n" +" cvt.s32.u32 %r4, %tid.x;\n" +" setp.le.s32 %p1, %r3, %r4;\n" +" @%p1 bra $Lt_2_8706;\n" +" .loc 17 269 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568;\n" +" cvt.s64.s32 %rd2, %r4;\n" +" mul.wide.s32 %rd3, %r4, 8;\n" +" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f64 %fd1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f64 [%rd6+0], %fd1;\n" +"$Lt_2_8706:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568;\n" +" .loc 17 270 0\n" +" bar.sync 0;\n" +" mov.u32 %r5, %ctaid.x;\n" +" mov.u32 %r6, %ntid.x;\n" +" mul.lo.u32 %r7, %r5, %r6;\n" +" add.u32 %r8, %r4, %r7;\n" +" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n" +" setp.le.s32 %p2, %r9, %r8;\n" +" @%p2 bra $Lt_2_9218;\n" +" .loc 17 278 0\n" +" mov.u32 %r10, %r8;\n" +" mov.s32 %r11, 0;\n" +" mov.u32 %r12, %r11;\n" +" mov.s32 %r13, 0;\n" +" mov.u32 %r14, 
%r13;\n" +" mov.s32 %r15, 0;\n" +" mov.u32 %r16, %r15;\n" +" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];\n" +" mov.f32 %f5, %f1;\n" +" mov.f32 %f6, %f2;\n" +" mov.f32 %f7, %f3;\n" +" .loc 17 279 0\n" +" mov.u32 %r17, %r8;\n" +" mov.s32 %r18, 0;\n" +" mov.u32 %r19, %r18;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];\n" +" mov.f32 %f12, %f8;\n" +" cvt.ftz.f64.f32 %fd2, %f12;\n" +" ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];\n" +" mul.f64 %fd4, %fd2, %fd3;\n" +" mov.f64 %fd5, 0d0000000000000000; \n" +" setp.neu.f64 %p3, %fd4, %fd5;\n" +" @!%p3 bra $Lt_2_9986;\n" +" mov.s32 %r24, 0;\n" +" setp.gt.s32 %p4, %r2, %r24;\n" +" ld.param.f64 %fd6, [__cudaparm_interp_delxinv];\n" +" cvt.ftz.f64.f32 %fd7, %f5;\n" +" ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];\n" +" sub.f64 %fd9, %fd7, %fd8;\n" +" mul.f64 %fd10, %fd6, %fd9;\n" +" @!%p4 bra $Lt_2_16386;\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176;\n" +" cvt.rzi.s32.f64 %r25, %fd10;\n" +" cvt.rn.f64.s32 %fd11, %r25;\n" +" mov.f64 %fd12, 0d3fe0000000000000; \n" +" add.f64 %fd13, %fd11, %fd12;\n" +" sub.f64 %fd14, %fd13, %fd10;\n" +" ld.param.f64 %fd15, [__cudaparm_interp_delyinv];\n" +" cvt.ftz.f64.f32 %fd16, %f6;\n" +" ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];\n" +" sub.f64 %fd18, %fd16, %fd17;\n" +" mul.f64 %fd19, %fd15, %fd18;\n" +" cvt.rzi.s32.f64 %r26, %fd19;\n" +" cvt.rn.f64.s32 %fd20, %r26;\n" +" mov.f64 %fd21, 0d3fe0000000000000; \n" +" add.f64 %fd22, %fd20, %fd21;\n" +" sub.f64 %fd23, %fd22, %fd19;\n" +" mov.s32 %r27, %r2;\n" +" cvt.s64.s32 %rd9, %r4;\n" +" mov.s32 %r28, %r1;\n" +" mul.wide.s32 %rd3, %r4, 8;\n" +" add.u64 %rd10, %rd3, %rd7;\n" +" add.u64 %rd11, %rd3, %rd8;\n" +" mov.s32 %r29, 0;\n" +" mov.s32 %r30, %r27;\n" +"$Lt_2_10754:\n" +" .loc 17 298 0\n" +" mov.f64 %fd24, 0d0000000000000000; \n" +" mov.f64 %fd25, 0d0000000000000000; \n" +" st.shared.f64 [%rd10+0], %fd25;\n" +" .loc 17 299 0\n" +" mov.f64 %fd26, 0d0000000000000000; \n" +" mov.f64 %fd27, 0d0000000000000000; \n" +" st.shared.f64 [%rd11+0], %fd27;\n" +" .loc 17 300 0\n" +" mov.s32 %r31, %r28;\n" +" setp.lt.s32 %p5, %r28, %r29;\n" +" @%p5 bra $Lt_2_11010;\n" +" cvt.s64.s32 %rd12, %r2;\n" +" mul.wide.s32 %rd13, %r2, 8;\n" +" cvt.s64.s32 %rd14, %r28;\n" +" mul.wide.s32 %rd15, %r28, 8;\n" +" add.u64 %rd16, %rd1, %rd15;\n" +"$Lt_2_11522:\n" +" .loc 17 301 0\n" +" ld.shared.f64 %fd28, [%rd16+0];\n" +" mad.rn.f64 %fd24, %fd24, %fd14, %fd28;\n" +" st.shared.f64 [%rd10+0], %fd24;\n" +" .loc 17 302 0\n" +" mad.rn.f64 %fd26, %fd26, %fd23, %fd28;\n" +" st.shared.f64 [%rd11+0], %fd26;\n" +" sub.s32 %r31, %r31, %r2;\n" +" sub.u64 %rd16, %rd16, %rd13;\n" +" setp.ge.s32 %p6, %r31, %r29;\n" +" @%p6 bra $Lt_2_11522;\n" +"$Lt_2_11010:\n" +" add.s32 %r29, %r29, 1;\n" +" add.s32 %r28, %r28, 1;\n" +" add.u64 %rd11, %rd11, 512;\n" +" add.u64 %rd10, %rd10, 512;\n" +" setp.ne.s32 %p7, %r28, %r3;\n" +" @%p7 bra $Lt_2_10754;\n" +" bra.uni $Lt_2_10242;\n" +"$Lt_2_16386:\n" +" cvt.rzi.s32.f64 %r25, %fd10;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176;\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080;\n" +"$Lt_2_10242:\n" +" .loc 17 306 0\n" +" ld.param.f64 %fd29, [__cudaparm_interp_delzinv];\n" +" cvt.ftz.f64.f32 %fd30, %f7;\n" +" ld.param.f64 %fd31, 
[__cudaparm_interp_b_lo_z];\n" +" sub.f64 %fd32, %fd30, %fd31;\n" +" mul.f64 %fd33, %fd29, %fd32;\n" +" cvt.rzi.s32.f64 %r32, %fd33;\n" +" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n" +" mul.lo.s32 %r34, %r32, %r33;\n" +" add.s32 %r35, %r25, %r34;\n" +" @!%p4 bra $Lt_2_16898;\n" +" cvt.rn.f64.s32 %fd34, %r32;\n" +" mov.f64 %fd35, 0d3fe0000000000000; \n" +" add.f64 %fd36, %fd34, %fd35;\n" +" sub.f64 %fd37, %fd36, %fd33;\n" +" mov.s32 %r36, %r2;\n" +" cvt.ftz.f64.f32 %fd38, %f6;\n" +" cvt.s64.s32 %rd17, %r4;\n" +" ld.param.f64 %fd39, [__cudaparm_interp_delyinv];\n" +" ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];\n" +" sub.f64 %fd41, %fd38, %fd40;\n" +" mul.f64 %fd42, %fd39, %fd41;\n" +" cvt.rzi.s32.f64 %r37, %fd42;\n" +" mul.wide.s32 %rd3, %r4, 8;\n" +" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n" +" mul.lo.s32 %r39, %r37, %r38;\n" +" add.u64 %rd18, %rd3, %rd7;\n" +" add.u64 %rd19, %rd3, %rd8;\n" +" cvt.s64.s32 %rd20, %r38;\n" +" mul.wide.s32 %rd21, %r38, 32;\n" +" add.s32 %r40, %r39, %r35;\n" +" mov.s32 %r41, %r40;\n" +" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n" +" mov.s32 %r42, 0;\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, 0f00000000; \n" +" mov.s32 %r43, %r36;\n" +"$Lt_2_12802:\n" +" .loc 17 309 0\n" +" add.s32 %r44, %r42, %r1;\n" +" mov.s32 %r45, %r44;\n" +" setp.lt.s32 %p8, %r44, %r42;\n" +" @%p8 bra $Lt_2_17154;\n" +" cvt.s64.s32 %rd23, %r2;\n" +" mul.wide.s32 %rd13, %r2, 8;\n" +" cvt.s64.s32 %rd24, %r44;\n" +" mul.wide.s32 %rd25, %r44, 8;\n" +" add.u64 %rd26, %rd1, %rd25;\n" +" mov.f64 %fd43, 0d0000000000000000; \n" +"$Lt_2_13570:\n" +" .loc 17 310 0\n" +" ld.shared.f64 %fd44, [%rd26+0];\n" +" mad.rn.f64 %fd43, %fd37, %fd43, %fd44;\n" +" sub.s32 %r45, %r45, %r2;\n" +" sub.u64 %rd26, %rd26, %rd13;\n" +" setp.ge.s32 %p9, %r45, %r42;\n" +" @%p9 bra $Lt_2_13570;\n" +" bra.uni $Lt_2_13058;\n" +"$Lt_2_17154:\n" +" mov.f64 %fd43, 0d0000000000000000; \n" +"$Lt_2_13058:\n" +" .loc 17 312 0\n" +" mov.s32 %r46, %r41;\n" +" mov.s32 %r47, %r2;\n" +" mov.s32 %r48, %r46;\n" +" mul.f64 %fd45, %fd4, %fd43;\n" +" mov.s64 %rd27, %rd19;\n" +" cvt.s64.s32 %rd28, %r46;\n" +" mul.wide.s32 %rd29, %r46, 32;\n" +" mov.s32 %r49, 0;\n" +" mov.s32 %r50, %r47;\n" +"$Lt_2_14594:\n" +" mov.s32 %r51, %r2;\n" +" mov.s32 %r52, %r48;\n" +" add.s32 %r53, %r48, %r2;\n" +" mov.s64 %rd30, %rd18;\n" +" ld.shared.f64 %fd46, [%rd27+0];\n" +" add.u64 %rd31, %rd29, %rd22;\n" +" mul.f64 %fd47, %fd45, %fd46;\n" +" mov.s32 %r54, %r51;\n" +"$Lt_2_15362:\n" +" .loc 17 316 0\n" +" ld.shared.f64 %fd48, [%rd30+0];\n" +" mul.f64 %fd49, %fd48, %fd47;\n" +" .loc 17 318 0\n" +" cvt.ftz.f64.f32 %fd50, %f15;\n" +" ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];\n" +" mul.f64 %fd53, %fd49, %fd51;\n" +" sub.f64 %fd54, %fd50, %fd53;\n" +" cvt.rn.ftz.f32.f64 %f15, %fd54;\n" +" .loc 17 319 0\n" +" cvt.ftz.f64.f32 %fd55, %f14;\n" +" mul.f64 %fd56, %fd49, %fd52;\n" +" sub.f64 %fd57, %fd55, %fd56;\n" +" cvt.rn.ftz.f32.f64 %f14, %fd57;\n" +" .loc 17 320 0\n" +" cvt.ftz.f64.f32 %fd58, %f13;\n" +" ld.global.f64 %fd59, [%rd31+16];\n" +" mul.f64 %fd60, %fd49, %fd59;\n" +" sub.f64 %fd61, %fd58, %fd60;\n" +" cvt.rn.ftz.f32.f64 %f13, %fd61;\n" +" add.s32 %r52, %r52, 1;\n" +" add.u64 %rd31, %rd31, 32;\n" +" add.u64 %rd30, %rd30, 512;\n" +" setp.ne.s32 %p10, %r52, %r53;\n" +" @%p10 bra $Lt_2_15362;\n" +" add.s32 %r49, %r49, 1;\n" +" add.s32 %r48, %r48, %r38;\n" +" add.u64 %rd29, %rd29, %rd21;\n" +" add.u64 %rd27, %rd27, 512;\n" +" setp.ne.s32 %p11, %r49, %r2;\n" +" @%p11 bra $Lt_2_14594;\n" +" add.s32 
%r42, %r42, 1;\n" +" add.s32 %r41, %r46, %r33;\n" +" setp.ne.s32 %p12, %r42, %r2;\n" +" @%p12 bra $Lt_2_12802;\n" +" bra.uni $Lt_2_9730;\n" +"$Lt_2_16898:\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, 0f00000000; \n" +" bra.uni $Lt_2_9730;\n" +"$Lt_2_9986:\n" +" mov.f32 %f13, 0f00000000; \n" +" mov.f32 %f14, 0f00000000; \n" +" mov.f32 %f15, 0f00000000; \n" +"$Lt_2_9730:\n" +" .loc 17 327 0\n" +" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n" +" cvt.s64.s32 %rd33, %r8;\n" +" mul.wide.s32 %rd34, %r8, 16;\n" +" add.u64 %rd35, %rd32, %rd34;\n" +" mov.f32 %f16, %f17;\n" +" st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};\n" +"$Lt_2_9218:\n" +" .loc 17 329 0\n" +" exit;\n" +"$LDWend_interp:\n" +" }\n" +; diff --git a/lib/gpu/pppm_f_gpu_kernel.ptx b/lib/gpu/pppm_f_gpu_kernel.ptx new file mode 100644 index 000000000..624a67de1 --- /dev/null +++ b/lib/gpu/pppm_f_gpu_kernel.ptx @@ -0,0 +1,881 @@ + .version 2.3 + .target sm_20 + .address_size 64 + // compiled with /usr/local/cuda/open64/lib//be + // nvopencc 4.0 built on 2011-05-12 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_0000bc4a_00000000-9_pppm_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.A49KLP) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "<command-line>" + .file 2 "/tmp/tmpxft_0000bc4a_00000000-8_pppm_gpu_kernel.cudafe2.gpu" + .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" + .file 4 "/usr/local/cuda/include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/include/host_defines.h" + .file 6 "/usr/local/cuda/include/builtin_types.h" + .file 7 "/usr/local/cuda/include/device_types.h" + .file 8 "/usr/local/cuda/include/driver_types.h" + .file 9 "/usr/local/cuda/include/surface_types.h" + .file 10 "/usr/local/cuda/include/texture_types.h" + .file 11 "/usr/local/cuda/include/vector_types.h" + .file 12 "/usr/local/cuda/include/device_launch_parameters.h" + .file 13 "/usr/local/cuda/include/crt/storage_class.h" + .file 14 "/usr/include/bits/types.h" + .file 15 "/usr/include/time.h" + .file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h" + .file 17 "pppm_gpu_kernel.cu" + .file 18 "/usr/local/cuda/include/common_functions.h" + .file 19 "/usr/local/cuda/include/math_functions.h" + .file 20 "/usr/local/cuda/include/math_constants.h" + .file 21 "/usr/local/cuda/include/device_functions.h" + .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" + .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" + .file 26 "/usr/local/cuda/include/surface_functions.h" + .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" + + .global .texref pos_tex; + .global .texref q_tex; + + .entry particle_map ( + .param .u64 __cudaparm_particle_map_x_, + .param .u64 __cudaparm_particle_map_q_, + .param .f32 __cudaparm_particle_map_delvolinv, + .param .s32 __cudaparm_particle_map_nlocal, + .param .u64 __cudaparm_particle_map_counts, + .param .u64 __cudaparm_particle_map_ans, + .param .f32 
__cudaparm_particle_map_b_lo_x, + .param .f32 __cudaparm_particle_map_b_lo_y, + .param .f32 __cudaparm_particle_map_b_lo_z, + .param .f32 __cudaparm_particle_map_delxinv, + .param .f32 __cudaparm_particle_map_delyinv, + .param .f32 __cudaparm_particle_map_delzinv, + .param .s32 __cudaparm_particle_map_nlocal_x, + .param .s32 __cudaparm_particle_map_nlocal_y, + .param .s32 __cudaparm_particle_map_nlocal_z, + .param .s32 __cudaparm_particle_map_atom_stride, + .param .s32 __cudaparm_particle_map_max_atoms, + .param .u64 __cudaparm_particle_map_error) + { + .reg .u32 %r<50>; + .reg .u64 %rd<12>; + .reg .f32 %f<44>; + .reg .pred %p<11>; + .loc 17 113 0 +$LDWbegin_particle_map: + mov.u32 %r1, %ntid.x; + mov.u32 %r2, %ctaid.x; + mul.lo.u32 %r3, %r2, %r1; + mov.u32 %r4, %nctaid.x; + mul.lo.u32 %r5, %r4, %r1; + mov.u32 %r6, %tid.x; + add.u32 %r7, %r6, %r3; + sub.s32 %r8, %r5, 1; + mul.lo.s32 %r9, %r7, 64; + div.s32 %r10, %r9, %r5; + mul.lo.s32 %r11, %r8, %r10; + sub.s32 %r12, %r9, %r11; + ld.param.s32 %r13, [__cudaparm_particle_map_nlocal]; + setp.le.s32 %p1, %r13, %r12; + @%p1 bra $Lt_0_7426; + .loc 17 125 0 + mov.u32 %r14, %r12; + mov.s32 %r15, 0; + mov.u32 %r16, %r15; + mov.s32 %r17, 0; + mov.u32 %r18, %r17; + mov.s32 %r19, 0; + mov.u32 %r20, %r19; + tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}]; + mov.f32 %f5, %f1; + mov.f32 %f6, %f2; + mov.f32 %f7, %f3; + .loc 17 127 0 + mov.u32 %r21, %r12; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + mov.s32 %r24, 0; + mov.u32 %r25, %r24; + mov.s32 %r26, 0; + mov.u32 %r27, %r26; + tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}]; + mov.f32 %f12, %f8; + ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv]; + mul.ftz.f32 %f14, %f13, %f12; + mov.f32 %f15, 0f00000000; // 0 + setp.neu.ftz.f32 %p2, %f14, %f15; + @!%p2 bra $Lt_0_7426; + .loc 17 130 0 + ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x]; + sub.ftz.f32 %f17, %f5, %f16; + ld.param.f32 %f18, [__cudaparm_particle_map_delxinv]; + mul.ftz.f32 %f19, %f18, %f17; + mov.f32 %f20, 0f00000000; // 0 + setp.lt.ftz.f32 %p3, %f19, %f20; + @%p3 bra $Lt_0_8706; + ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y]; + sub.ftz.f32 %f22, %f6, %f21; + ld.param.f32 %f23, [__cudaparm_particle_map_delyinv]; + mul.ftz.f32 %f24, %f23, %f22; + mov.f32 %f25, 0f00000000; // 0 + setp.lt.ftz.f32 %p4, %f24, %f25; + @%p4 bra $Lt_0_8706; + ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z]; + sub.ftz.f32 %f27, %f7, %f26; + ld.param.f32 %f28, [__cudaparm_particle_map_delzinv]; + mul.ftz.f32 %f29, %f28, %f27; + mov.f32 %f30, 0f00000000; // 0 + setp.lt.ftz.f32 %p5, %f29, %f30; + @%p5 bra $Lt_0_8706; + cvt.rzi.ftz.s32.f32 %r28, %f19; + ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x]; + setp.ge.s32 %p6, %r28, %r29; + @%p6 bra $Lt_0_8706; + cvt.rzi.ftz.s32.f32 %r30, %f24; + ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y]; + setp.ge.s32 %p7, %r30, %r31; + @%p7 bra $Lt_0_8706; + cvt.rzi.ftz.s32.f32 %r32, %f29; + ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z]; + setp.gt.s32 %p8, %r33, %r32; + @%p8 bra $L_0_4866; +$Lt_0_8706: +$L_0_5122: + .loc 17 139 0 + mov.s32 %r34, 1; + ld.param.u64 %rd1, [__cudaparm_particle_map_error]; + st.global.s32 [%rd1+0], %r34; + bra.uni $Lt_0_7426; +$L_0_4866: + .loc 17 146 0 + mul.lo.s32 %r35, %r32, %r31; + add.s32 %r36, %r30, %r35; + mul.lo.s32 %r37, %r36, %r29; + add.s32 %r38, %r28, %r37; + ld.param.u64 %rd2, [__cudaparm_particle_map_counts]; + cvt.s64.s32 %rd3, %r38; + mul.wide.s32 %rd4, %r38, 4; + add.u64 %rd5, %rd2, %rd4; + mov.s32 %r39, 1; + 
atom.global.add.s32 %r40, [%rd5], %r39; + mov.s32 %r41, %r40; + ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms]; + setp.gt.s32 %p9, %r42, %r41; + @%p9 bra $Lt_0_7682; + .loc 17 148 0 + mov.s32 %r43, 2; + ld.param.u64 %rd6, [__cudaparm_particle_map_error]; + st.global.s32 [%rd6+0], %r43; + .loc 16 118 0 + mov.s32 %r44, -1; + atom.global.add.s32 %r45, [%rd5], %r44; + bra.uni $Lt_0_7426; +$Lt_0_7682: + .loc 17 151 0 + ld.param.u64 %rd7, [__cudaparm_particle_map_ans]; + ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride]; + mul.lo.s32 %r47, %r46, %r41; + add.s32 %r48, %r38, %r47; + cvt.s64.s32 %rd8, %r48; + mul.wide.s32 %rd9, %r48, 16; + add.u64 %rd10, %rd7, %rd9; + cvt.rn.f32.s32 %f31, %r28; + mov.f32 %f32, 0f3f000000; // 0.5 + add.ftz.f32 %f33, %f31, %f32; + sub.ftz.f32 %f34, %f33, %f19; + cvt.rn.f32.s32 %f35, %r30; + mov.f32 %f36, 0f3f000000; // 0.5 + add.ftz.f32 %f37, %f35, %f36; + sub.ftz.f32 %f38, %f37, %f24; + cvt.rn.f32.s32 %f39, %r32; + mov.f32 %f40, 0f3f000000; // 0.5 + add.ftz.f32 %f41, %f39, %f40; + sub.ftz.f32 %f42, %f41, %f29; + st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14}; +$Lt_0_7426: +$L_0_4610: +$Lt_0_6914: +$Lt_0_6402: + .loc 17 155 0 + exit; +$LDWend_particle_map: + } // particle_map + + .entry make_rho ( + .param .u64 __cudaparm_make_rho_counts, + .param .u64 __cudaparm_make_rho_atoms, + .param .u64 __cudaparm_make_rho_brick, + .param .u64 __cudaparm_make_rho__rho_coeff, + .param .s32 __cudaparm_make_rho_atom_stride, + .param .s32 __cudaparm_make_rho_npts_x, + .param .s32 __cudaparm_make_rho_npts_y, + .param .s32 __cudaparm_make_rho_npts_z, + .param .s32 __cudaparm_make_rho_nlocal_x, + .param .s32 __cudaparm_make_rho_nlocal_y, + .param .s32 __cudaparm_make_rho_nlocal_z, + .param .s32 __cudaparm_make_rho_order_m_1, + .param .s32 __cudaparm_make_rho_order, + .param .s32 __cudaparm_make_rho_order2) + { + .reg .u32 %r<119>; + .reg .u64 %rd<57>; + .reg .f32 %f<26>; + .reg .pred %p<27>; + .shared .align 4 .b8 __cuda___cuda_local_var_32531_33_non_const_rho_coeff168[256]; + .shared .align 4 .b8 __cuda___cuda_local_var_32532_33_non_const_front424[320]; + .shared .align 4 .b8 __cuda___cuda_local_var_32533_33_non_const_ans744[2048]; + .loc 17 164 0 +$LDWbegin_make_rho: + ld.param.s32 %r1, [__cudaparm_make_rho_order2]; + ld.param.s32 %r2, [__cudaparm_make_rho_order]; + add.s32 %r3, %r1, %r2; + cvt.s32.u32 %r4, %tid.x; + setp.le.s32 %p1, %r3, %r4; + @%p1 bra $Lt_1_16898; + .loc 17 171 0 + mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168; + cvt.s64.s32 %rd2, %r4; + mul.wide.s32 %rd3, %r4, 4; + ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_1_16898: + mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168; + shr.s32 %r5, %r4, 31; + mov.s32 %r6, 31; + and.b32 %r7, %r5, %r6; + add.s32 %r8, %r7, %r4; + shr.s32 %r9, %r8, 5; + mul.lo.s32 %r10, %r9, 32; + sub.s32 %r11, %r4, %r10; + setp.lt.s32 %p2, %r11, %r2; + @!%p2 bra $Lt_1_17410; + .loc 17 177 0 + mov.u64 %rd7, __cuda___cuda_local_var_32532_33_non_const_front424; + mov.f32 %f2, 0f00000000; // 0 + cvt.s64.s32 %rd8, %r11; + shr.s32 %r12, %r4, 31; + mov.s32 %r13, 31; + and.b32 %r14, %r12, %r13; + add.s32 %r15, %r14, %r4; + shr.s32 %r16, %r15, 5; + cvt.s64.s32 %rd9, %r16; + mul.wide.s32 %rd10, %r16, 40; + add.u64 %rd11, %rd8, %rd10; + mul.lo.u64 %rd12, %rd11, 4; + add.u64 %rd13, %rd7, %rd12; + st.shared.f32 [%rd13+128], %f2; +$Lt_1_17410: + mov.u64 %rd7, 
__cuda___cuda_local_var_32532_33_non_const_front424; + .loc 17 179 0 + bar.sync 0; + ld.param.s32 %r17, [__cudaparm_make_rho_npts_x]; + shr.s32 %r18, %r17, 31; + mov.s32 %r19, 31; + and.b32 %r20, %r18, %r19; + add.s32 %r21, %r20, %r17; + shr.s32 %r22, %r21, 5; + add.s32 %r23, %r22, 1; + mov.u32 %r24, 0; + setp.le.s32 %p3, %r23, %r24; + @%p3 bra $Lt_1_17922; + shr.s32 %r25, %r4, 31; + mov.s32 %r26, 31; + and.b32 %r27, %r25, %r26; + add.s32 %r28, %r27, %r4; + shr.s32 %r29, %r28, 5; + add.s32 %r30, %r11, 32; + ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y]; + ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x]; + mul.lo.s32 %r33, %r31, %r32; + mov.u32 %r34, %ctaid.x; + mul.lo.u32 %r35, %r34, 2; + add.u32 %r36, %r29, %r35; + ld.param.s32 %r37, [__cudaparm_make_rho_npts_y]; + div.s32 %r38, %r36, %r37; + ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1]; + setp.lt.s32 %p4, %r38, %r39; + sub.s32 %r40, %r39, %r38; + mov.s32 %r41, 0; + selp.s32 %r42, %r40, %r41, %p4; + ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z]; + setp.ge.s32 %p5, %r38, %r43; + sub.s32 %r44, %r43, %r38; + add.s32 %r45, %r44, %r2; + sub.s32 %r46, %r45, 1; + selp.s32 %r47, %r46, %r2, %p5; + rem.s32 %r48, %r36, %r37; + setp.lt.s32 %p6, %r48, %r39; + sub.s32 %r49, %r39, %r48; + mov.s32 %r50, 0; + selp.s32 %r51, %r49, %r50, %p6; + setp.ge.s32 %p7, %r48, %r31; + sub.s32 %r52, %r31, %r48; + add.s32 %r53, %r52, %r2; + sub.s32 %r54, %r53, 1; + selp.s32 %r55, %r54, %r2, %p7; + mov.s32 %r56, %r23; + mov.s32 %r57, 0; + setp.gt.s32 %p8, %r2, %r57; + mov.s32 %r58, 0; + cvt.s64.s32 %rd14, %r11; + cvt.s64.s32 %rd15, %r29; + mul.lo.s32 %r59, %r23, 32; + mul.wide.s32 %rd16, %r29, 40; + add.u64 %rd17, %rd14, %rd16; + ld.param.s32 %r60, [__cudaparm_make_rho_npts_z]; + setp.gt.s32 %p9, %r60, %r38; + mul.lo.u64 %rd18, %rd17, 4; + selp.s32 %r61, 1, 0, %p9; + add.u64 %rd19, %rd18, %rd7; + mov.u64 %rd20, __cuda___cuda_local_var_32533_33_non_const_ans744; + mov.s32 %r62, %r56; +$Lt_1_18434: + //<loop> Loop body line 179, nesting depth: 1, estimated iterations: unknown + @!%p8 bra $Lt_1_18690; + mov.s32 %r63, %r2; + cvt.s64.s32 %rd21, %r4; + mul.wide.s32 %rd22, %r4, 4; + add.u64 %rd23, %rd20, %rd22; + mov.s32 %r64, 0; + mov.s32 %r65, %r63; +$Lt_1_19202: + //<loop> Loop body line 179, nesting depth: 2, estimated iterations: unknown + .loc 17 203 0 + mov.f32 %f3, 0f00000000; // 0 + st.shared.f32 [%rd23+0], %f3; + add.s32 %r64, %r64, 1; + add.u64 %rd23, %rd23, 256; + setp.ne.s32 %p10, %r64, %r2; + @%p10 bra $Lt_1_19202; +$Lt_1_18690: + add.s32 %r66, %r11, %r58; + set.lt.u32.s32 %r67, %r66, %r32; + neg.s32 %r68, %r67; + and.b32 %r69, %r61, %r68; + mov.u32 %r70, 0; + setp.eq.s32 %p11, %r69, %r70; + @%p11 bra $Lt_1_20226; + .loc 17 206 0 + mov.s32 %r71, %r42; + setp.ge.s32 %p12, %r42, %r47; + @%p12 bra $Lt_1_20226; + sub.s32 %r72, %r47, %r42; + setp.lt.s32 %p13, %r51, %r55; + mov.s32 %r73, %r72; +$Lt_1_20738: + //<loop> Loop body line 206, nesting depth: 2, estimated iterations: unknown + .loc 17 208 0 + mov.s32 %r74, %r51; + @!%p13 bra $Lt_1_20994; + sub.s32 %r75, %r55, %r51; + sub.s32 %r76, %r71, %r42; + add.s32 %r77, %r38, %r42; + add.s32 %r78, %r48, %r51; + sub.s32 %r79, %r77, %r39; + sub.s32 %r80, %r78, %r39; + add.s32 %r81, %r76, %r79; + mul.lo.s32 %r82, %r33, %r81; + ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride]; + ld.param.u64 %rd24, [__cudaparm_make_rho_counts]; + mov.s32 %r84, %r75; +$Lt_1_21506: + //<loop> Loop body line 208, nesting depth: 3, estimated iterations: unknown + .loc 17 210 0 + sub.s32 %r85, %r74, %r51; + add.s32 %r86, %r85, 
%r80; + mul.lo.s32 %r87, %r86, %r32; + add.s32 %r88, %r82, %r87; + add.s32 %r89, %r66, %r88; + cvt.s64.s32 %rd25, %r89; + mul.wide.s32 %rd26, %r89, 4; + add.u64 %rd27, %rd24, %rd26; + ld.global.s32 %r90, [%rd27+0]; + mul.lo.s32 %r91, %r90, %r83; + .loc 17 211 0 + mov.s32 %r92, %r89; + setp.ge.s32 %p14, %r89, %r91; + @%p14 bra $Lt_1_21762; + sub.s32 %r93, %r3, 1; + cvt.s64.s32 %rd28, %r83; + mul.wide.s32 %rd29, %r83, 16; + mov.s32 %r94, -1; + setp.gt.s32 %p15, %r93, %r94; + ld.param.u64 %rd30, [__cudaparm_make_rho_atoms]; + mul.lo.u64 %rd31, %rd25, 16; + add.u64 %rd32, %rd30, %rd31; +$Lt_1_22274: + //<loop> Loop body line 211, nesting depth: 4, estimated iterations: unknown + .loc 17 212 0 + ld.global.f32 %f4, [%rd32+0]; + @!%p15 bra $Lt_1_29954; + sub.s32 %r95, %r93, %r74; + mov.s32 %r96, -1; + sub.s32 %r97, %r96, %r74; + cvt.s64.s32 %rd33, %r2; + mul.wide.s32 %rd34, %r2, 4; + ld.global.f32 %f5, [%rd32+4]; + ld.global.f32 %f6, [%rd32+8]; + cvt.s64.s32 %rd35, %r95; + mul.wide.s32 %rd36, %r95, 4; + add.u64 %rd37, %rd1, %rd36; + sub.s32 %r98, %r93, %r71; + cvt.s64.s32 %rd38, %r98; + mul.wide.s32 %rd39, %r98, 4; + add.u64 %rd40, %rd1, %rd39; + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, 0f00000000; // 0 +$Lt_1_23042: + //<loop> Loop body line 212, nesting depth: 5, estimated iterations: unknown + .loc 17 217 0 + ld.shared.f32 %f9, [%rd37+0]; + fma.rn.ftz.f32 %f8, %f8, %f5, %f9; + .loc 17 218 0 + ld.shared.f32 %f10, [%rd40+0]; + fma.rn.ftz.f32 %f7, %f7, %f6, %f10; + sub.u64 %rd40, %rd40, %rd34; + sub.s32 %r95, %r95, %r2; + sub.u64 %rd37, %rd37, %rd34; + setp.gt.s32 %p16, %r95, %r97; + @%p16 bra $Lt_1_23042; + bra.uni $Lt_1_22530; +$Lt_1_29954: + mov.f32 %f7, 0f00000000; // 0 + mov.f32 %f8, 0f00000000; // 0 +$Lt_1_22530: + .loc 17 220 0 + ld.global.f32 %f11, [%rd32+12]; + mul.ftz.f32 %f12, %f7, %f8; + mul.ftz.f32 %f13, %f11, %f12; + @!%p8 bra $Lt_1_23554; + mov.s32 %r99, %r2; + cvt.s64.s32 %rd41, %r4; + mul.wide.s32 %rd42, %r4, 4; + add.u64 %rd43, %rd20, %rd42; + mov.s32 %r100, 0; + mov.s32 %r101, %r99; +$Lt_1_24066: + //<loop> Loop body line 220, nesting depth: 5, estimated iterations: unknown + .loc 17 224 0 + add.s32 %r102, %r100, %r1; + mov.s32 %r103, %r102; + setp.lt.s32 %p17, %r102, %r100; + @%p17 bra $Lt_1_30466; + cvt.s64.s32 %rd44, %r2; + mul.wide.s32 %rd34, %r2, 4; + cvt.s64.s32 %rd45, %r102; + mul.wide.s32 %rd46, %r102, 4; + add.u64 %rd47, %rd1, %rd46; + mov.f32 %f14, 0f00000000; // 0 +$Lt_1_24834: + //<loop> Loop body line 224, nesting depth: 6, estimated iterations: unknown + .loc 17 225 0 + ld.shared.f32 %f15, [%rd47+0]; + fma.rn.ftz.f32 %f14, %f4, %f14, %f15; + sub.s32 %r103, %r103, %r2; + sub.u64 %rd47, %rd47, %rd34; + setp.ge.s32 %p18, %r103, %r100; + @%p18 bra $Lt_1_24834; + bra.uni $Lt_1_24322; +$Lt_1_30466: + mov.f32 %f14, 0f00000000; // 0 +$Lt_1_24322: + .loc 17 226 0 + ld.shared.f32 %f16, [%rd43+0]; + fma.rn.ftz.f32 %f17, %f14, %f13, %f16; + st.shared.f32 [%rd43+0], %f17; + add.s32 %r100, %r100, 1; + add.u64 %rd43, %rd43, 256; + setp.ne.s32 %p19, %r100, %r2; + @%p19 bra $Lt_1_24066; +$Lt_1_23554: + add.s32 %r92, %r92, %r83; + add.u64 %rd32, %rd29, %rd32; + setp.gt.s32 %p20, %r91, %r92; + @%p20 bra $Lt_1_22274; +$Lt_1_21762: + add.s32 %r74, %r74, 1; + setp.ne.s32 %p21, %r55, %r74; + @%p21 bra $Lt_1_21506; +$Lt_1_20994: + add.s32 %r71, %r71, 1; + setp.ne.s32 %p22, %r47, %r71; + @%p22 bra $Lt_1_20738; +$Lt_1_20226: +$Lt_1_19714: + .loc 17 235 0 + bar.sync 0; + @!%p2 bra $Lt_1_26626; + .loc 17 237 0 + ld.shared.f32 %f18, [%rd19+128]; + st.shared.f32 [%rd19+0], %f18; + .loc 17 
238 0 + mov.f32 %f19, 0f00000000; // 0 + st.shared.f32 [%rd19+128], %f19; + bra.uni $Lt_1_26370; +$Lt_1_26626: + .loc 17 240 0 + mov.f32 %f20, 0f00000000; // 0 + st.shared.f32 [%rd19+0], %f20; +$Lt_1_26370: + @!%p8 bra $Lt_1_26882; + mov.s32 %r104, %r2; + cvt.s64.s32 %rd48, %r4; + mov.s32 %r105, %r11; + add.s32 %r106, %r11, %r2; + mul.wide.s32 %rd49, %r4, 4; + add.u64 %rd50, %rd20, %rd49; + mov.s64 %rd51, %rd19; + mov.s32 %r107, %r104; +$Lt_1_27394: + //<loop> Loop body line 240, nesting depth: 2, estimated iterations: unknown + .loc 17 243 0 + ld.shared.f32 %f21, [%rd50+0]; + ld.shared.f32 %f22, [%rd51+0]; + add.ftz.f32 %f23, %f21, %f22; + st.shared.f32 [%rd51+0], %f23; + .loc 17 244 0 + bar.sync 0; + add.s32 %r105, %r105, 1; + add.u64 %rd51, %rd51, 4; + add.u64 %rd50, %rd50, 256; + setp.ne.s32 %p23, %r105, %r106; + @%p23 bra $Lt_1_27394; +$Lt_1_26882: + set.lt.u32.s32 %r108, %r66, %r17; + neg.s32 %r109, %r108; + and.b32 %r110, %r61, %r109; + mov.u32 %r111, 0; + setp.eq.s32 %p24, %r110, %r111; + @%p24 bra $Lt_1_27906; + .loc 17 248 0 + ld.shared.f32 %f24, [%rd19+0]; + ld.param.u64 %rd52, [__cudaparm_make_rho_brick]; + add.s32 %r112, %r11, %r58; + mul.lo.s32 %r113, %r37, %r17; + mul.lo.s32 %r114, %r38, %r113; + mul.lo.s32 %r115, %r48, %r17; + add.s32 %r116, %r114, %r115; + add.s32 %r117, %r112, %r116; + cvt.s64.s32 %rd53, %r117; + mul.wide.s32 %rd54, %r117, 4; + add.u64 %rd55, %rd52, %rd54; + st.global.f32 [%rd55+0], %f24; +$Lt_1_27906: + add.s32 %r58, %r58, 32; + setp.ne.s32 %p25, %r58, %r59; + @%p25 bra $Lt_1_18434; +$Lt_1_17922: + .loc 17 252 0 + exit; +$LDWend_make_rho: + } // make_rho + + .entry interp ( + .param .u64 __cudaparm_interp_x_, + .param .u64 __cudaparm_interp_q_, + .param .s32 __cudaparm_interp_nlocal, + .param .u64 __cudaparm_interp_brick, + .param .u64 __cudaparm_interp__rho_coeff, + .param .s32 __cudaparm_interp_npts_x, + .param .s32 __cudaparm_interp_npts_yx, + .param .f32 __cudaparm_interp_b_lo_x, + .param .f32 __cudaparm_interp_b_lo_y, + .param .f32 __cudaparm_interp_b_lo_z, + .param .f32 __cudaparm_interp_delxinv, + .param .f32 __cudaparm_interp_delyinv, + .param .f32 __cudaparm_interp_delzinv, + .param .s32 __cudaparm_interp_order, + .param .s32 __cudaparm_interp_order2, + .param .f32 __cudaparm_interp_qqrd2e_scale, + .param .u64 __cudaparm_interp_ans) + { + .reg .u32 %r<56>; + .reg .u64 %rd<37>; + .reg .f32 %f<69>; + .reg .pred %p<14>; + .shared .align 4 .b8 __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888[256]; + .shared .align 4 .b8 __cuda___cuda_local_var_32630_33_non_const_rho1d_03144[2048]; + .shared .align 4 .b8 __cuda___cuda_local_var_32631_33_non_const_rho1d_15192[2048]; + // __cuda_local_var_32647_12_non_const_ek = 16 + .loc 17 262 0 +$LDWbegin_interp: + ld.param.s32 %r1, [__cudaparm_interp_order2]; + ld.param.s32 %r2, [__cudaparm_interp_order]; + add.s32 %r3, %r1, %r2; + cvt.s32.u32 %r4, %tid.x; + setp.le.s32 %p1, %r3, %r4; + @%p1 bra $Lt_2_8706; + .loc 17 269 0 + mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888; + cvt.s64.s32 %rd2, %r4; + mul.wide.s32 %rd3, %r4, 4; + ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff]; + add.u64 %rd5, %rd4, %rd3; + ld.global.f32 %f1, [%rd5+0]; + add.u64 %rd6, %rd3, %rd1; + st.shared.f32 [%rd6+0], %f1; +$Lt_2_8706: + mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888; + .loc 17 270 0 + bar.sync 0; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, %ntid.x; + mul.lo.u32 %r7, %r5, %r6; + add.u32 %r8, %r4, %r7; + ld.param.s32 %r9, [__cudaparm_interp_nlocal]; + setp.le.s32 %p2, %r9, %r8; 
+ @%p2 bra $Lt_2_9218; + .loc 17 278 0 + mov.u32 %r10, %r8; + mov.s32 %r11, 0; + mov.u32 %r12, %r11; + mov.s32 %r13, 0; + mov.u32 %r14, %r13; + mov.s32 %r15, 0; + mov.u32 %r16, %r15; + tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}]; + mov.f32 %f6, %f2; + mov.f32 %f7, %f3; + mov.f32 %f8, %f4; + .loc 17 279 0 + mov.u32 %r17, %r8; + mov.s32 %r18, 0; + mov.u32 %r19, %r18; + mov.s32 %r20, 0; + mov.u32 %r21, %r20; + mov.s32 %r22, 0; + mov.u32 %r23, %r22; + tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}]; + mov.f32 %f13, %f9; + ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale]; + mul.ftz.f32 %f15, %f14, %f13; + mov.f32 %f16, 0f00000000; // 0 + setp.neu.ftz.f32 %p3, %f15, %f16; + @!%p3 bra $Lt_2_9986; + mov.s32 %r24, 0; + setp.gt.s32 %p4, %r2, %r24; + ld.param.f32 %f17, [__cudaparm_interp_b_lo_x]; + sub.ftz.f32 %f18, %f6, %f17; + ld.param.f32 %f19, [__cudaparm_interp_delxinv]; + mul.ftz.f32 %f20, %f19, %f18; + @!%p4 bra $Lt_2_16386; + mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144; + mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192; + cvt.rzi.ftz.s32.f32 %r25, %f20; + cvt.rn.f32.s32 %f21, %r25; + mov.f32 %f22, 0f3f000000; // 0.5 + add.ftz.f32 %f23, %f21, %f22; + sub.ftz.f32 %f24, %f23, %f20; + ld.param.f32 %f25, [__cudaparm_interp_b_lo_y]; + sub.ftz.f32 %f26, %f7, %f25; + ld.param.f32 %f27, [__cudaparm_interp_delyinv]; + mul.ftz.f32 %f28, %f27, %f26; + cvt.rzi.ftz.s32.f32 %r26, %f28; + cvt.rn.f32.s32 %f29, %r26; + mov.f32 %f30, 0f3f000000; // 0.5 + add.ftz.f32 %f31, %f29, %f30; + sub.ftz.f32 %f32, %f31, %f28; + mov.s32 %r27, %r2; + cvt.s64.s32 %rd9, %r4; + mov.s32 %r28, %r1; + mul.wide.s32 %rd3, %r4, 4; + add.u64 %rd10, %rd3, %rd7; + add.u64 %rd11, %rd3, %rd8; + mov.s32 %r29, 0; + mov.s32 %r30, %r27; +$Lt_2_10754: + //<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown + .loc 17 298 0 + mov.f32 %f33, 0f00000000; // 0 + mov.f32 %f34, 0f00000000; // 0 + st.shared.f32 [%rd10+0], %f34; + .loc 17 299 0 + mov.f32 %f35, 0f00000000; // 0 + mov.f32 %f36, 0f00000000; // 0 + st.shared.f32 [%rd11+0], %f36; + .loc 17 300 0 + mov.s32 %r31, %r28; + setp.lt.s32 %p5, %r28, %r29; + @%p5 bra $Lt_2_11010; + cvt.s64.s32 %rd12, %r2; + mul.wide.s32 %rd13, %r2, 4; + cvt.s64.s32 %rd14, %r28; + mul.wide.s32 %rd15, %r28, 4; + add.u64 %rd16, %rd1, %rd15; +$Lt_2_11522: + //<loop> Loop body line 300, nesting depth: 2, estimated iterations: unknown + .loc 17 301 0 + ld.shared.f32 %f37, [%rd16+0]; + fma.rn.ftz.f32 %f33, %f33, %f24, %f37; + st.shared.f32 [%rd10+0], %f33; + .loc 17 302 0 + fma.rn.ftz.f32 %f35, %f35, %f32, %f37; + st.shared.f32 [%rd11+0], %f35; + sub.s32 %r31, %r31, %r2; + sub.u64 %rd16, %rd16, %rd13; + setp.ge.s32 %p6, %r31, %r29; + @%p6 bra $Lt_2_11522; +$Lt_2_11010: + add.s32 %r29, %r29, 1; + add.s32 %r28, %r28, 1; + add.u64 %rd11, %rd11, 256; + add.u64 %rd10, %rd10, 256; + setp.ne.s32 %p7, %r28, %r3; + @%p7 bra $Lt_2_10754; + bra.uni $Lt_2_10242; +$Lt_2_16386: + cvt.rzi.ftz.s32.f32 %r25, %f20; + mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192; + mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144; +$Lt_2_10242: + .loc 17 306 0 + ld.param.f32 %f38, [__cudaparm_interp_b_lo_z]; + sub.ftz.f32 %f39, %f8, %f38; + ld.param.f32 %f40, [__cudaparm_interp_delzinv]; + mul.ftz.f32 %f41, %f40, %f39; + cvt.rzi.ftz.s32.f32 %r32, %f41; + ld.param.s32 %r33, [__cudaparm_interp_npts_yx]; + mul.lo.s32 %r34, %r32, %r33; + add.s32 %r35, %r25, %r34; + @!%p4 bra $Lt_2_16898; + cvt.rn.f32.s32 
%f42, %r32; + mov.f32 %f43, 0f3f000000; // 0.5 + add.ftz.f32 %f44, %f42, %f43; + sub.ftz.f32 %f45, %f44, %f41; + mov.s32 %r36, %r2; + ld.param.f32 %f46, [__cudaparm_interp_b_lo_y]; + sub.ftz.f32 %f47, %f7, %f46; + cvt.s64.s32 %rd17, %r4; + ld.param.f32 %f48, [__cudaparm_interp_delyinv]; + mul.ftz.f32 %f49, %f48, %f47; + cvt.rzi.ftz.s32.f32 %r37, %f49; + ld.param.s32 %r38, [__cudaparm_interp_npts_x]; + mul.lo.s32 %r39, %r37, %r38; + mul.wide.s32 %rd3, %r4, 4; + add.s32 %r40, %r39, %r35; + add.u64 %rd18, %rd3, %rd7; + add.u64 %rd19, %rd3, %rd8; + cvt.s64.s32 %rd20, %r38; + mul.wide.s32 %rd21, %r38, 16; + mov.s32 %r41, %r40; + ld.param.u64 %rd22, [__cudaparm_interp_brick]; + mov.s32 %r42, 0; + mov.f32 %f50, 0f00000000; // 0 + mov.f32 %f51, 0f00000000; // 0 + mov.f32 %f52, 0f00000000; // 0 + mov.s32 %r43, %r36; +$Lt_2_12802: + //<loop> Loop body line 306, nesting depth: 1, estimated iterations: unknown + .loc 17 309 0 + add.s32 %r44, %r42, %r1; + mov.s32 %r45, %r44; + setp.lt.s32 %p8, %r44, %r42; + @%p8 bra $Lt_2_17154; + cvt.s64.s32 %rd23, %r2; + mul.wide.s32 %rd13, %r2, 4; + cvt.s64.s32 %rd24, %r44; + mul.wide.s32 %rd25, %r44, 4; + add.u64 %rd26, %rd1, %rd25; + mov.f32 %f53, 0f00000000; // 0 +$Lt_2_13570: + //<loop> Loop body line 309, nesting depth: 2, estimated iterations: unknown + .loc 17 310 0 + ld.shared.f32 %f54, [%rd26+0]; + fma.rn.ftz.f32 %f53, %f45, %f53, %f54; + sub.s32 %r45, %r45, %r2; + sub.u64 %rd26, %rd26, %rd13; + setp.ge.s32 %p9, %r45, %r42; + @%p9 bra $Lt_2_13570; + bra.uni $Lt_2_13058; +$Lt_2_17154: + mov.f32 %f53, 0f00000000; // 0 +$Lt_2_13058: + .loc 17 312 0 + mov.s32 %r46, %r41; + mov.s32 %r47, %r2; + mul.ftz.f32 %f55, %f15, %f53; + mov.s32 %r48, %r46; + mov.s64 %rd27, %rd19; + cvt.s64.s32 %rd28, %r46; + mul.wide.s32 %rd29, %r46, 16; + mov.s32 %r49, 0; + mov.s32 %r50, %r47; +$Lt_2_14594: + //<loop> Loop body line 312, nesting depth: 2, estimated iterations: unknown + mov.s32 %r51, %r2; + mov.s32 %r52, %r48; + add.s32 %r53, %r48, %r2; + mov.s64 %rd30, %rd18; + ld.shared.f32 %f56, [%rd27+0]; + add.u64 %rd31, %rd29, %rd22; + mul.ftz.f32 %f57, %f55, %f56; + mov.s32 %r54, %r51; +$Lt_2_15362: + //<loop> Loop body line 312, nesting depth: 3, estimated iterations: unknown + .loc 17 316 0 + ld.shared.f32 %f58, [%rd30+0]; + mul.ftz.f32 %f59, %f58, %f57; + ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0]; + .loc 17 318 0 + mul.ftz.f32 %f63, %f59, %f60; + sub.ftz.f32 %f52, %f52, %f63; + .loc 17 319 0 + mul.ftz.f32 %f64, %f59, %f61; + sub.ftz.f32 %f51, %f51, %f64; + .loc 17 320 0 + mul.ftz.f32 %f65, %f59, %f62; + sub.ftz.f32 %f50, %f50, %f65; + add.s32 %r52, %r52, 1; + add.u64 %rd31, %rd31, 16; + add.u64 %rd30, %rd30, 256; + setp.ne.s32 %p10, %r52, %r53; + @%p10 bra $Lt_2_15362; + add.s32 %r49, %r49, 1; + add.s32 %r48, %r48, %r38; + add.u64 %rd29, %rd29, %rd21; + add.u64 %rd27, %rd27, 256; + setp.ne.s32 %p11, %r49, %r2; + @%p11 bra $Lt_2_14594; + add.s32 %r42, %r42, 1; + add.s32 %r41, %r46, %r33; + setp.ne.s32 %p12, %r42, %r2; + @%p12 bra $Lt_2_12802; + bra.uni $Lt_2_9730; +$Lt_2_16898: + mov.f32 %f50, 0f00000000; // 0 + mov.f32 %f51, 0f00000000; // 0 + mov.f32 %f52, 0f00000000; // 0 + bra.uni $Lt_2_9730; +$Lt_2_9986: + mov.f32 %f50, 0f00000000; // 0 + mov.f32 %f51, 0f00000000; // 0 + mov.f32 %f52, 0f00000000; // 0 +$Lt_2_9730: + .loc 17 327 0 + ld.param.u64 %rd32, [__cudaparm_interp_ans]; + cvt.s64.s32 %rd33, %r8; + mul.wide.s32 %rd34, %r8, 16; + add.u64 %rd35, %rd32, %rd34; + mov.f32 %f66, %f67; + st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66}; +$Lt_2_9218: + .loc 17 329 0 + 
exit; +$LDWend_interp: + } // interp + diff --git a/lib/gpu/pppm_f_gpu_ptx.h b/lib/gpu/pppm_f_gpu_ptx.h new file mode 100644 index 000000000..41ab88dd5 --- /dev/null +++ b/lib/gpu/pppm_f_gpu_ptx.h @@ -0,0 +1,818 @@ +const char * pppm_f_gpu_kernel = +" .version 2.3\n" +" .target sm_20\n" +" .address_size 64\n" +" .global .texref pos_tex;\n" +" .global .texref q_tex;\n" +" .entry particle_map (\n" +" .param .u64 __cudaparm_particle_map_x_,\n" +" .param .u64 __cudaparm_particle_map_q_,\n" +" .param .f32 __cudaparm_particle_map_delvolinv,\n" +" .param .s32 __cudaparm_particle_map_nlocal,\n" +" .param .u64 __cudaparm_particle_map_counts,\n" +" .param .u64 __cudaparm_particle_map_ans,\n" +" .param .f32 __cudaparm_particle_map_b_lo_x,\n" +" .param .f32 __cudaparm_particle_map_b_lo_y,\n" +" .param .f32 __cudaparm_particle_map_b_lo_z,\n" +" .param .f32 __cudaparm_particle_map_delxinv,\n" +" .param .f32 __cudaparm_particle_map_delyinv,\n" +" .param .f32 __cudaparm_particle_map_delzinv,\n" +" .param .s32 __cudaparm_particle_map_nlocal_x,\n" +" .param .s32 __cudaparm_particle_map_nlocal_y,\n" +" .param .s32 __cudaparm_particle_map_nlocal_z,\n" +" .param .s32 __cudaparm_particle_map_atom_stride,\n" +" .param .s32 __cudaparm_particle_map_max_atoms,\n" +" .param .u64 __cudaparm_particle_map_error)\n" +" {\n" +" .reg .u32 %r<50>;\n" +" .reg .u64 %rd<12>;\n" +" .reg .f32 %f<44>;\n" +" .reg .pred %p<11>;\n" +" .loc 17 113 0\n" +"$LDWbegin_particle_map:\n" +" mov.u32 %r1, %ntid.x;\n" +" mov.u32 %r2, %ctaid.x;\n" +" mul.lo.u32 %r3, %r2, %r1;\n" +" mov.u32 %r4, %nctaid.x;\n" +" mul.lo.u32 %r5, %r4, %r1;\n" +" mov.u32 %r6, %tid.x;\n" +" add.u32 %r7, %r6, %r3;\n" +" sub.s32 %r8, %r5, 1;\n" +" mul.lo.s32 %r9, %r7, 64;\n" +" div.s32 %r10, %r9, %r5;\n" +" mul.lo.s32 %r11, %r8, %r10;\n" +" sub.s32 %r12, %r9, %r11;\n" +" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n" +" setp.le.s32 %p1, %r13, %r12;\n" +" @%p1 bra $Lt_0_7426;\n" +" .loc 17 125 0\n" +" mov.u32 %r14, %r12;\n" +" mov.s32 %r15, 0;\n" +" mov.u32 %r16, %r15;\n" +" mov.s32 %r17, 0;\n" +" mov.u32 %r18, %r17;\n" +" mov.s32 %r19, 0;\n" +" mov.u32 %r20, %r19;\n" +" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n" +" mov.f32 %f5, %f1;\n" +" mov.f32 %f6, %f2;\n" +" mov.f32 %f7, %f3;\n" +" .loc 17 127 0\n" +" mov.u32 %r21, %r12;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" mov.s32 %r24, 0;\n" +" mov.u32 %r25, %r24;\n" +" mov.s32 %r26, 0;\n" +" mov.u32 %r27, %r26;\n" +" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n" +" mov.f32 %f12, %f8;\n" +" ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];\n" +" mul.ftz.f32 %f14, %f13, %f12;\n" +" mov.f32 %f15, 0f00000000; \n" +" setp.neu.ftz.f32 %p2, %f14, %f15;\n" +" @!%p2 bra $Lt_0_7426;\n" +" .loc 17 130 0\n" +" ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];\n" +" sub.ftz.f32 %f17, %f5, %f16;\n" +" ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];\n" +" mul.ftz.f32 %f19, %f18, %f17;\n" +" mov.f32 %f20, 0f00000000; \n" +" setp.lt.ftz.f32 %p3, %f19, %f20;\n" +" @%p3 bra $Lt_0_8706;\n" +" ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];\n" +" sub.ftz.f32 %f22, %f6, %f21;\n" +" ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];\n" +" mul.ftz.f32 %f24, %f23, %f22;\n" +" mov.f32 %f25, 0f00000000; \n" +" setp.lt.ftz.f32 %p4, %f24, %f25;\n" +" @%p4 bra $Lt_0_8706;\n" +" ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];\n" +" sub.ftz.f32 %f27, %f7, %f26;\n" +" ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];\n" +" mul.ftz.f32 %f29, %f28, 
%f27;\n" +" mov.f32 %f30, 0f00000000; \n" +" setp.lt.ftz.f32 %p5, %f29, %f30;\n" +" @%p5 bra $Lt_0_8706;\n" +" cvt.rzi.ftz.s32.f32 %r28, %f19;\n" +" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n" +" setp.ge.s32 %p6, %r28, %r29;\n" +" @%p6 bra $Lt_0_8706;\n" +" cvt.rzi.ftz.s32.f32 %r30, %f24;\n" +" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n" +" setp.ge.s32 %p7, %r30, %r31;\n" +" @%p7 bra $Lt_0_8706;\n" +" cvt.rzi.ftz.s32.f32 %r32, %f29;\n" +" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n" +" setp.gt.s32 %p8, %r33, %r32;\n" +" @%p8 bra $L_0_4866;\n" +"$Lt_0_8706:\n" +"$L_0_5122:\n" +" .loc 17 139 0\n" +" mov.s32 %r34, 1;\n" +" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n" +" st.global.s32 [%rd1+0], %r34;\n" +" bra.uni $Lt_0_7426;\n" +"$L_0_4866:\n" +" .loc 17 146 0\n" +" mul.lo.s32 %r35, %r32, %r31;\n" +" add.s32 %r36, %r30, %r35;\n" +" mul.lo.s32 %r37, %r36, %r29;\n" +" add.s32 %r38, %r28, %r37;\n" +" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n" +" cvt.s64.s32 %rd3, %r38;\n" +" mul.wide.s32 %rd4, %r38, 4;\n" +" add.u64 %rd5, %rd2, %rd4;\n" +" mov.s32 %r39, 1;\n" +" atom.global.add.s32 %r40, [%rd5], %r39;\n" +" mov.s32 %r41, %r40;\n" +" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n" +" setp.gt.s32 %p9, %r42, %r41;\n" +" @%p9 bra $Lt_0_7682;\n" +" .loc 17 148 0\n" +" mov.s32 %r43, 2;\n" +" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n" +" st.global.s32 [%rd6+0], %r43;\n" +" .loc 16 118 0\n" +" mov.s32 %r44, -1;\n" +" atom.global.add.s32 %r45, [%rd5], %r44;\n" +" bra.uni $Lt_0_7426;\n" +"$Lt_0_7682:\n" +" .loc 17 151 0\n" +" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n" +" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n" +" mul.lo.s32 %r47, %r46, %r41;\n" +" add.s32 %r48, %r38, %r47;\n" +" cvt.s64.s32 %rd8, %r48;\n" +" mul.wide.s32 %rd9, %r48, 16;\n" +" add.u64 %rd10, %rd7, %rd9;\n" +" cvt.rn.f32.s32 %f31, %r28;\n" +" mov.f32 %f32, 0f3f000000; \n" +" add.ftz.f32 %f33, %f31, %f32;\n" +" sub.ftz.f32 %f34, %f33, %f19;\n" +" cvt.rn.f32.s32 %f35, %r30;\n" +" mov.f32 %f36, 0f3f000000; \n" +" add.ftz.f32 %f37, %f35, %f36;\n" +" sub.ftz.f32 %f38, %f37, %f24;\n" +" cvt.rn.f32.s32 %f39, %r32;\n" +" mov.f32 %f40, 0f3f000000; \n" +" add.ftz.f32 %f41, %f39, %f40;\n" +" sub.ftz.f32 %f42, %f41, %f29;\n" +" st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};\n" +"$Lt_0_7426:\n" +"$L_0_4610:\n" +"$Lt_0_6914:\n" +"$Lt_0_6402:\n" +" .loc 17 155 0\n" +" exit;\n" +"$LDWend_particle_map:\n" +" }\n" +" .entry make_rho (\n" +" .param .u64 __cudaparm_make_rho_counts,\n" +" .param .u64 __cudaparm_make_rho_atoms,\n" +" .param .u64 __cudaparm_make_rho_brick,\n" +" .param .u64 __cudaparm_make_rho__rho_coeff,\n" +" .param .s32 __cudaparm_make_rho_atom_stride,\n" +" .param .s32 __cudaparm_make_rho_npts_x,\n" +" .param .s32 __cudaparm_make_rho_npts_y,\n" +" .param .s32 __cudaparm_make_rho_npts_z,\n" +" .param .s32 __cudaparm_make_rho_nlocal_x,\n" +" .param .s32 __cudaparm_make_rho_nlocal_y,\n" +" .param .s32 __cudaparm_make_rho_nlocal_z,\n" +" .param .s32 __cudaparm_make_rho_order_m_1,\n" +" .param .s32 __cudaparm_make_rho_order,\n" +" .param .s32 __cudaparm_make_rho_order2)\n" +" {\n" +" .reg .u32 %r<119>;\n" +" .reg .u64 %rd<57>;\n" +" .reg .f32 %f<26>;\n" +" .reg .pred %p<27>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32531_33_non_const_rho_coeff168[256];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32532_33_non_const_front424[320];\n" +" .shared .align 4 .b8 
__cuda___cuda_local_var_32533_33_non_const_ans744[2048];\n" +" .loc 17 164 0\n" +"$LDWbegin_make_rho:\n" +" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n" +" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n" +" add.s32 %r3, %r1, %r2;\n" +" cvt.s32.u32 %r4, %tid.x;\n" +" setp.le.s32 %p1, %r3, %r4;\n" +" @%p1 bra $Lt_1_16898;\n" +" .loc 17 171 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168;\n" +" cvt.s64.s32 %rd2, %r4;\n" +" mul.wide.s32 %rd3, %r4, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_1_16898:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168;\n" +" shr.s32 %r5, %r4, 31;\n" +" mov.s32 %r6, 31;\n" +" and.b32 %r7, %r5, %r6;\n" +" add.s32 %r8, %r7, %r4;\n" +" shr.s32 %r9, %r8, 5;\n" +" mul.lo.s32 %r10, %r9, 32;\n" +" sub.s32 %r11, %r4, %r10;\n" +" setp.lt.s32 %p2, %r11, %r2;\n" +" @!%p2 bra $Lt_1_17410;\n" +" .loc 17 177 0\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32532_33_non_const_front424;\n" +" mov.f32 %f2, 0f00000000; \n" +" cvt.s64.s32 %rd8, %r11;\n" +" shr.s32 %r12, %r4, 31;\n" +" mov.s32 %r13, 31;\n" +" and.b32 %r14, %r12, %r13;\n" +" add.s32 %r15, %r14, %r4;\n" +" shr.s32 %r16, %r15, 5;\n" +" cvt.s64.s32 %rd9, %r16;\n" +" mul.wide.s32 %rd10, %r16, 40;\n" +" add.u64 %rd11, %rd8, %rd10;\n" +" mul.lo.u64 %rd12, %rd11, 4;\n" +" add.u64 %rd13, %rd7, %rd12;\n" +" st.shared.f32 [%rd13+128], %f2;\n" +"$Lt_1_17410:\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32532_33_non_const_front424;\n" +" .loc 17 179 0\n" +" bar.sync 0;\n" +" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n" +" shr.s32 %r18, %r17, 31;\n" +" mov.s32 %r19, 31;\n" +" and.b32 %r20, %r18, %r19;\n" +" add.s32 %r21, %r20, %r17;\n" +" shr.s32 %r22, %r21, 5;\n" +" add.s32 %r23, %r22, 1;\n" +" mov.u32 %r24, 0;\n" +" setp.le.s32 %p3, %r23, %r24;\n" +" @%p3 bra $Lt_1_17922;\n" +" shr.s32 %r25, %r4, 31;\n" +" mov.s32 %r26, 31;\n" +" and.b32 %r27, %r25, %r26;\n" +" add.s32 %r28, %r27, %r4;\n" +" shr.s32 %r29, %r28, 5;\n" +" add.s32 %r30, %r11, 32;\n" +" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n" +" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n" +" mul.lo.s32 %r33, %r31, %r32;\n" +" mov.u32 %r34, %ctaid.x;\n" +" mul.lo.u32 %r35, %r34, 2;\n" +" add.u32 %r36, %r29, %r35;\n" +" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n" +" div.s32 %r38, %r36, %r37;\n" +" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n" +" setp.lt.s32 %p4, %r38, %r39;\n" +" sub.s32 %r40, %r39, %r38;\n" +" mov.s32 %r41, 0;\n" +" selp.s32 %r42, %r40, %r41, %p4;\n" +" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n" +" setp.ge.s32 %p5, %r38, %r43;\n" +" sub.s32 %r44, %r43, %r38;\n" +" add.s32 %r45, %r44, %r2;\n" +" sub.s32 %r46, %r45, 1;\n" +" selp.s32 %r47, %r46, %r2, %p5;\n" +" rem.s32 %r48, %r36, %r37;\n" +" setp.lt.s32 %p6, %r48, %r39;\n" +" sub.s32 %r49, %r39, %r48;\n" +" mov.s32 %r50, 0;\n" +" selp.s32 %r51, %r49, %r50, %p6;\n" +" setp.ge.s32 %p7, %r48, %r31;\n" +" sub.s32 %r52, %r31, %r48;\n" +" add.s32 %r53, %r52, %r2;\n" +" sub.s32 %r54, %r53, 1;\n" +" selp.s32 %r55, %r54, %r2, %p7;\n" +" mov.s32 %r56, %r23;\n" +" mov.s32 %r57, 0;\n" +" setp.gt.s32 %p8, %r2, %r57;\n" +" mov.s32 %r58, 0;\n" +" cvt.s64.s32 %rd14, %r11;\n" +" cvt.s64.s32 %rd15, %r29;\n" +" mul.lo.s32 %r59, %r23, 32;\n" +" mul.wide.s32 %rd16, %r29, 40;\n" +" add.u64 %rd17, %rd14, %rd16;\n" +" ld.param.s32 %r60, 
[__cudaparm_make_rho_npts_z];\n" +" setp.gt.s32 %p9, %r60, %r38;\n" +" mul.lo.u64 %rd18, %rd17, 4;\n" +" selp.s32 %r61, 1, 0, %p9;\n" +" add.u64 %rd19, %rd18, %rd7;\n" +" mov.u64 %rd20, __cuda___cuda_local_var_32533_33_non_const_ans744;\n" +" mov.s32 %r62, %r56;\n" +"$Lt_1_18434:\n" +" @!%p8 bra $Lt_1_18690;\n" +" mov.s32 %r63, %r2;\n" +" cvt.s64.s32 %rd21, %r4;\n" +" mul.wide.s32 %rd22, %r4, 4;\n" +" add.u64 %rd23, %rd20, %rd22;\n" +" mov.s32 %r64, 0;\n" +" mov.s32 %r65, %r63;\n" +"$Lt_1_19202:\n" +" .loc 17 203 0\n" +" mov.f32 %f3, 0f00000000; \n" +" st.shared.f32 [%rd23+0], %f3;\n" +" add.s32 %r64, %r64, 1;\n" +" add.u64 %rd23, %rd23, 256;\n" +" setp.ne.s32 %p10, %r64, %r2;\n" +" @%p10 bra $Lt_1_19202;\n" +"$Lt_1_18690:\n" +" add.s32 %r66, %r11, %r58;\n" +" set.lt.u32.s32 %r67, %r66, %r32;\n" +" neg.s32 %r68, %r67;\n" +" and.b32 %r69, %r61, %r68;\n" +" mov.u32 %r70, 0;\n" +" setp.eq.s32 %p11, %r69, %r70;\n" +" @%p11 bra $Lt_1_20226;\n" +" .loc 17 206 0\n" +" mov.s32 %r71, %r42;\n" +" setp.ge.s32 %p12, %r42, %r47;\n" +" @%p12 bra $Lt_1_20226;\n" +" sub.s32 %r72, %r47, %r42;\n" +" setp.lt.s32 %p13, %r51, %r55;\n" +" mov.s32 %r73, %r72;\n" +"$Lt_1_20738:\n" +" .loc 17 208 0\n" +" mov.s32 %r74, %r51;\n" +" @!%p13 bra $Lt_1_20994;\n" +" sub.s32 %r75, %r55, %r51;\n" +" sub.s32 %r76, %r71, %r42;\n" +" add.s32 %r77, %r38, %r42;\n" +" add.s32 %r78, %r48, %r51;\n" +" sub.s32 %r79, %r77, %r39;\n" +" sub.s32 %r80, %r78, %r39;\n" +" add.s32 %r81, %r76, %r79;\n" +" mul.lo.s32 %r82, %r33, %r81;\n" +" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n" +" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n" +" mov.s32 %r84, %r75;\n" +"$Lt_1_21506:\n" +" .loc 17 210 0\n" +" sub.s32 %r85, %r74, %r51;\n" +" add.s32 %r86, %r85, %r80;\n" +" mul.lo.s32 %r87, %r86, %r32;\n" +" add.s32 %r88, %r82, %r87;\n" +" add.s32 %r89, %r66, %r88;\n" +" cvt.s64.s32 %rd25, %r89;\n" +" mul.wide.s32 %rd26, %r89, 4;\n" +" add.u64 %rd27, %rd24, %rd26;\n" +" ld.global.s32 %r90, [%rd27+0];\n" +" mul.lo.s32 %r91, %r90, %r83;\n" +" .loc 17 211 0\n" +" mov.s32 %r92, %r89;\n" +" setp.ge.s32 %p14, %r89, %r91;\n" +" @%p14 bra $Lt_1_21762;\n" +" sub.s32 %r93, %r3, 1;\n" +" cvt.s64.s32 %rd28, %r83;\n" +" mul.wide.s32 %rd29, %r83, 16;\n" +" mov.s32 %r94, -1;\n" +" setp.gt.s32 %p15, %r93, %r94;\n" +" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n" +" mul.lo.u64 %rd31, %rd25, 16;\n" +" add.u64 %rd32, %rd30, %rd31;\n" +"$Lt_1_22274:\n" +" .loc 17 212 0\n" +" ld.global.f32 %f4, [%rd32+0];\n" +" @!%p15 bra $Lt_1_29954;\n" +" sub.s32 %r95, %r93, %r74;\n" +" mov.s32 %r96, -1;\n" +" sub.s32 %r97, %r96, %r74;\n" +" cvt.s64.s32 %rd33, %r2;\n" +" mul.wide.s32 %rd34, %r2, 4;\n" +" ld.global.f32 %f5, [%rd32+4];\n" +" ld.global.f32 %f6, [%rd32+8];\n" +" cvt.s64.s32 %rd35, %r95;\n" +" mul.wide.s32 %rd36, %r95, 4;\n" +" add.u64 %rd37, %rd1, %rd36;\n" +" sub.s32 %r98, %r93, %r71;\n" +" cvt.s64.s32 %rd38, %r98;\n" +" mul.wide.s32 %rd39, %r98, 4;\n" +" add.u64 %rd40, %rd1, %rd39;\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, 0f00000000; \n" +"$Lt_1_23042:\n" +" .loc 17 217 0\n" +" ld.shared.f32 %f9, [%rd37+0];\n" +" fma.rn.ftz.f32 %f8, %f8, %f5, %f9;\n" +" .loc 17 218 0\n" +" ld.shared.f32 %f10, [%rd40+0];\n" +" fma.rn.ftz.f32 %f7, %f7, %f6, %f10;\n" +" sub.u64 %rd40, %rd40, %rd34;\n" +" sub.s32 %r95, %r95, %r2;\n" +" sub.u64 %rd37, %rd37, %rd34;\n" +" setp.gt.s32 %p16, %r95, %r97;\n" +" @%p16 bra $Lt_1_23042;\n" +" bra.uni $Lt_1_22530;\n" +"$Lt_1_29954:\n" +" mov.f32 %f7, 0f00000000; \n" +" mov.f32 %f8, 0f00000000; \n" +"$Lt_1_22530:\n" +" 
.loc 17 220 0\n" +" ld.global.f32 %f11, [%rd32+12];\n" +" mul.ftz.f32 %f12, %f7, %f8;\n" +" mul.ftz.f32 %f13, %f11, %f12;\n" +" @!%p8 bra $Lt_1_23554;\n" +" mov.s32 %r99, %r2;\n" +" cvt.s64.s32 %rd41, %r4;\n" +" mul.wide.s32 %rd42, %r4, 4;\n" +" add.u64 %rd43, %rd20, %rd42;\n" +" mov.s32 %r100, 0;\n" +" mov.s32 %r101, %r99;\n" +"$Lt_1_24066:\n" +" .loc 17 224 0\n" +" add.s32 %r102, %r100, %r1;\n" +" mov.s32 %r103, %r102;\n" +" setp.lt.s32 %p17, %r102, %r100;\n" +" @%p17 bra $Lt_1_30466;\n" +" cvt.s64.s32 %rd44, %r2;\n" +" mul.wide.s32 %rd34, %r2, 4;\n" +" cvt.s64.s32 %rd45, %r102;\n" +" mul.wide.s32 %rd46, %r102, 4;\n" +" add.u64 %rd47, %rd1, %rd46;\n" +" mov.f32 %f14, 0f00000000; \n" +"$Lt_1_24834:\n" +" .loc 17 225 0\n" +" ld.shared.f32 %f15, [%rd47+0];\n" +" fma.rn.ftz.f32 %f14, %f4, %f14, %f15;\n" +" sub.s32 %r103, %r103, %r2;\n" +" sub.u64 %rd47, %rd47, %rd34;\n" +" setp.ge.s32 %p18, %r103, %r100;\n" +" @%p18 bra $Lt_1_24834;\n" +" bra.uni $Lt_1_24322;\n" +"$Lt_1_30466:\n" +" mov.f32 %f14, 0f00000000; \n" +"$Lt_1_24322:\n" +" .loc 17 226 0\n" +" ld.shared.f32 %f16, [%rd43+0];\n" +" fma.rn.ftz.f32 %f17, %f14, %f13, %f16;\n" +" st.shared.f32 [%rd43+0], %f17;\n" +" add.s32 %r100, %r100, 1;\n" +" add.u64 %rd43, %rd43, 256;\n" +" setp.ne.s32 %p19, %r100, %r2;\n" +" @%p19 bra $Lt_1_24066;\n" +"$Lt_1_23554:\n" +" add.s32 %r92, %r92, %r83;\n" +" add.u64 %rd32, %rd29, %rd32;\n" +" setp.gt.s32 %p20, %r91, %r92;\n" +" @%p20 bra $Lt_1_22274;\n" +"$Lt_1_21762:\n" +" add.s32 %r74, %r74, 1;\n" +" setp.ne.s32 %p21, %r55, %r74;\n" +" @%p21 bra $Lt_1_21506;\n" +"$Lt_1_20994:\n" +" add.s32 %r71, %r71, 1;\n" +" setp.ne.s32 %p22, %r47, %r71;\n" +" @%p22 bra $Lt_1_20738;\n" +"$Lt_1_20226:\n" +"$Lt_1_19714:\n" +" .loc 17 235 0\n" +" bar.sync 0;\n" +" @!%p2 bra $Lt_1_26626;\n" +" .loc 17 237 0\n" +" ld.shared.f32 %f18, [%rd19+128];\n" +" st.shared.f32 [%rd19+0], %f18;\n" +" .loc 17 238 0\n" +" mov.f32 %f19, 0f00000000; \n" +" st.shared.f32 [%rd19+128], %f19;\n" +" bra.uni $Lt_1_26370;\n" +"$Lt_1_26626:\n" +" .loc 17 240 0\n" +" mov.f32 %f20, 0f00000000; \n" +" st.shared.f32 [%rd19+0], %f20;\n" +"$Lt_1_26370:\n" +" @!%p8 bra $Lt_1_26882;\n" +" mov.s32 %r104, %r2;\n" +" cvt.s64.s32 %rd48, %r4;\n" +" mov.s32 %r105, %r11;\n" +" add.s32 %r106, %r11, %r2;\n" +" mul.wide.s32 %rd49, %r4, 4;\n" +" add.u64 %rd50, %rd20, %rd49;\n" +" mov.s64 %rd51, %rd19;\n" +" mov.s32 %r107, %r104;\n" +"$Lt_1_27394:\n" +" .loc 17 243 0\n" +" ld.shared.f32 %f21, [%rd50+0];\n" +" ld.shared.f32 %f22, [%rd51+0];\n" +" add.ftz.f32 %f23, %f21, %f22;\n" +" st.shared.f32 [%rd51+0], %f23;\n" +" .loc 17 244 0\n" +" bar.sync 0;\n" +" add.s32 %r105, %r105, 1;\n" +" add.u64 %rd51, %rd51, 4;\n" +" add.u64 %rd50, %rd50, 256;\n" +" setp.ne.s32 %p23, %r105, %r106;\n" +" @%p23 bra $Lt_1_27394;\n" +"$Lt_1_26882:\n" +" set.lt.u32.s32 %r108, %r66, %r17;\n" +" neg.s32 %r109, %r108;\n" +" and.b32 %r110, %r61, %r109;\n" +" mov.u32 %r111, 0;\n" +" setp.eq.s32 %p24, %r110, %r111;\n" +" @%p24 bra $Lt_1_27906;\n" +" .loc 17 248 0\n" +" ld.shared.f32 %f24, [%rd19+0];\n" +" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n" +" add.s32 %r112, %r11, %r58;\n" +" mul.lo.s32 %r113, %r37, %r17;\n" +" mul.lo.s32 %r114, %r38, %r113;\n" +" mul.lo.s32 %r115, %r48, %r17;\n" +" add.s32 %r116, %r114, %r115;\n" +" add.s32 %r117, %r112, %r116;\n" +" cvt.s64.s32 %rd53, %r117;\n" +" mul.wide.s32 %rd54, %r117, 4;\n" +" add.u64 %rd55, %rd52, %rd54;\n" +" st.global.f32 [%rd55+0], %f24;\n" +"$Lt_1_27906:\n" +" add.s32 %r58, %r58, 32;\n" +" setp.ne.s32 %p25, %r58, %r59;\n" +" @%p25 
bra $Lt_1_18434;\n" +"$Lt_1_17922:\n" +" .loc 17 252 0\n" +" exit;\n" +"$LDWend_make_rho:\n" +" }\n" +" .entry interp (\n" +" .param .u64 __cudaparm_interp_x_,\n" +" .param .u64 __cudaparm_interp_q_,\n" +" .param .s32 __cudaparm_interp_nlocal,\n" +" .param .u64 __cudaparm_interp_brick,\n" +" .param .u64 __cudaparm_interp__rho_coeff,\n" +" .param .s32 __cudaparm_interp_npts_x,\n" +" .param .s32 __cudaparm_interp_npts_yx,\n" +" .param .f32 __cudaparm_interp_b_lo_x,\n" +" .param .f32 __cudaparm_interp_b_lo_y,\n" +" .param .f32 __cudaparm_interp_b_lo_z,\n" +" .param .f32 __cudaparm_interp_delxinv,\n" +" .param .f32 __cudaparm_interp_delyinv,\n" +" .param .f32 __cudaparm_interp_delzinv,\n" +" .param .s32 __cudaparm_interp_order,\n" +" .param .s32 __cudaparm_interp_order2,\n" +" .param .f32 __cudaparm_interp_qqrd2e_scale,\n" +" .param .u64 __cudaparm_interp_ans)\n" +" {\n" +" .reg .u32 %r<56>;\n" +" .reg .u64 %rd<37>;\n" +" .reg .f32 %f<69>;\n" +" .reg .pred %p<14>;\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888[256];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32630_33_non_const_rho1d_03144[2048];\n" +" .shared .align 4 .b8 __cuda___cuda_local_var_32631_33_non_const_rho1d_15192[2048];\n" +" .loc 17 262 0\n" +"$LDWbegin_interp:\n" +" ld.param.s32 %r1, [__cudaparm_interp_order2];\n" +" ld.param.s32 %r2, [__cudaparm_interp_order];\n" +" add.s32 %r3, %r1, %r2;\n" +" cvt.s32.u32 %r4, %tid.x;\n" +" setp.le.s32 %p1, %r3, %r4;\n" +" @%p1 bra $Lt_2_8706;\n" +" .loc 17 269 0\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888;\n" +" cvt.s64.s32 %rd2, %r4;\n" +" mul.wide.s32 %rd3, %r4, 4;\n" +" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n" +" add.u64 %rd5, %rd4, %rd3;\n" +" ld.global.f32 %f1, [%rd5+0];\n" +" add.u64 %rd6, %rd3, %rd1;\n" +" st.shared.f32 [%rd6+0], %f1;\n" +"$Lt_2_8706:\n" +" mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888;\n" +" .loc 17 270 0\n" +" bar.sync 0;\n" +" mov.u32 %r5, %ctaid.x;\n" +" mov.u32 %r6, %ntid.x;\n" +" mul.lo.u32 %r7, %r5, %r6;\n" +" add.u32 %r8, %r4, %r7;\n" +" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n" +" setp.le.s32 %p2, %r9, %r8;\n" +" @%p2 bra $Lt_2_9218;\n" +" .loc 17 278 0\n" +" mov.u32 %r10, %r8;\n" +" mov.s32 %r11, 0;\n" +" mov.u32 %r12, %r11;\n" +" mov.s32 %r13, 0;\n" +" mov.u32 %r14, %r13;\n" +" mov.s32 %r15, 0;\n" +" mov.u32 %r16, %r15;\n" +" tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];\n" +" mov.f32 %f6, %f2;\n" +" mov.f32 %f7, %f3;\n" +" mov.f32 %f8, %f4;\n" +" .loc 17 279 0\n" +" mov.u32 %r17, %r8;\n" +" mov.s32 %r18, 0;\n" +" mov.u32 %r19, %r18;\n" +" mov.s32 %r20, 0;\n" +" mov.u32 %r21, %r20;\n" +" mov.s32 %r22, 0;\n" +" mov.u32 %r23, %r22;\n" +" tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];\n" +" mov.f32 %f13, %f9;\n" +" ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];\n" +" mul.ftz.f32 %f15, %f14, %f13;\n" +" mov.f32 %f16, 0f00000000; \n" +" setp.neu.ftz.f32 %p3, %f15, %f16;\n" +" @!%p3 bra $Lt_2_9986;\n" +" mov.s32 %r24, 0;\n" +" setp.gt.s32 %p4, %r2, %r24;\n" +" ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];\n" +" sub.ftz.f32 %f18, %f6, %f17;\n" +" ld.param.f32 %f19, [__cudaparm_interp_delxinv];\n" +" mul.ftz.f32 %f20, %f19, %f18;\n" +" @!%p4 bra $Lt_2_16386;\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192;\n" +" cvt.rzi.ftz.s32.f32 %r25, %f20;\n" +" cvt.rn.f32.s32 %f21, %r25;\n" +" mov.f32 
%f22, 0f3f000000; \n" +" add.ftz.f32 %f23, %f21, %f22;\n" +" sub.ftz.f32 %f24, %f23, %f20;\n" +" ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];\n" +" sub.ftz.f32 %f26, %f7, %f25;\n" +" ld.param.f32 %f27, [__cudaparm_interp_delyinv];\n" +" mul.ftz.f32 %f28, %f27, %f26;\n" +" cvt.rzi.ftz.s32.f32 %r26, %f28;\n" +" cvt.rn.f32.s32 %f29, %r26;\n" +" mov.f32 %f30, 0f3f000000; \n" +" add.ftz.f32 %f31, %f29, %f30;\n" +" sub.ftz.f32 %f32, %f31, %f28;\n" +" mov.s32 %r27, %r2;\n" +" cvt.s64.s32 %rd9, %r4;\n" +" mov.s32 %r28, %r1;\n" +" mul.wide.s32 %rd3, %r4, 4;\n" +" add.u64 %rd10, %rd3, %rd7;\n" +" add.u64 %rd11, %rd3, %rd8;\n" +" mov.s32 %r29, 0;\n" +" mov.s32 %r30, %r27;\n" +"$Lt_2_10754:\n" +" .loc 17 298 0\n" +" mov.f32 %f33, 0f00000000; \n" +" mov.f32 %f34, 0f00000000; \n" +" st.shared.f32 [%rd10+0], %f34;\n" +" .loc 17 299 0\n" +" mov.f32 %f35, 0f00000000; \n" +" mov.f32 %f36, 0f00000000; \n" +" st.shared.f32 [%rd11+0], %f36;\n" +" .loc 17 300 0\n" +" mov.s32 %r31, %r28;\n" +" setp.lt.s32 %p5, %r28, %r29;\n" +" @%p5 bra $Lt_2_11010;\n" +" cvt.s64.s32 %rd12, %r2;\n" +" mul.wide.s32 %rd13, %r2, 4;\n" +" cvt.s64.s32 %rd14, %r28;\n" +" mul.wide.s32 %rd15, %r28, 4;\n" +" add.u64 %rd16, %rd1, %rd15;\n" +"$Lt_2_11522:\n" +" .loc 17 301 0\n" +" ld.shared.f32 %f37, [%rd16+0];\n" +" fma.rn.ftz.f32 %f33, %f33, %f24, %f37;\n" +" st.shared.f32 [%rd10+0], %f33;\n" +" .loc 17 302 0\n" +" fma.rn.ftz.f32 %f35, %f35, %f32, %f37;\n" +" st.shared.f32 [%rd11+0], %f35;\n" +" sub.s32 %r31, %r31, %r2;\n" +" sub.u64 %rd16, %rd16, %rd13;\n" +" setp.ge.s32 %p6, %r31, %r29;\n" +" @%p6 bra $Lt_2_11522;\n" +"$Lt_2_11010:\n" +" add.s32 %r29, %r29, 1;\n" +" add.s32 %r28, %r28, 1;\n" +" add.u64 %rd11, %rd11, 256;\n" +" add.u64 %rd10, %rd10, 256;\n" +" setp.ne.s32 %p7, %r28, %r3;\n" +" @%p7 bra $Lt_2_10754;\n" +" bra.uni $Lt_2_10242;\n" +"$Lt_2_16386:\n" +" cvt.rzi.ftz.s32.f32 %r25, %f20;\n" +" mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192;\n" +" mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144;\n" +"$Lt_2_10242:\n" +" .loc 17 306 0\n" +" ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];\n" +" sub.ftz.f32 %f39, %f8, %f38;\n" +" ld.param.f32 %f40, [__cudaparm_interp_delzinv];\n" +" mul.ftz.f32 %f41, %f40, %f39;\n" +" cvt.rzi.ftz.s32.f32 %r32, %f41;\n" +" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n" +" mul.lo.s32 %r34, %r32, %r33;\n" +" add.s32 %r35, %r25, %r34;\n" +" @!%p4 bra $Lt_2_16898;\n" +" cvt.rn.f32.s32 %f42, %r32;\n" +" mov.f32 %f43, 0f3f000000; \n" +" add.ftz.f32 %f44, %f42, %f43;\n" +" sub.ftz.f32 %f45, %f44, %f41;\n" +" mov.s32 %r36, %r2;\n" +" ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];\n" +" sub.ftz.f32 %f47, %f7, %f46;\n" +" cvt.s64.s32 %rd17, %r4;\n" +" ld.param.f32 %f48, [__cudaparm_interp_delyinv];\n" +" mul.ftz.f32 %f49, %f48, %f47;\n" +" cvt.rzi.ftz.s32.f32 %r37, %f49;\n" +" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n" +" mul.lo.s32 %r39, %r37, %r38;\n" +" mul.wide.s32 %rd3, %r4, 4;\n" +" add.s32 %r40, %r39, %r35;\n" +" add.u64 %rd18, %rd3, %rd7;\n" +" add.u64 %rd19, %rd3, %rd8;\n" +" cvt.s64.s32 %rd20, %r38;\n" +" mul.wide.s32 %rd21, %r38, 16;\n" +" mov.s32 %r41, %r40;\n" +" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n" +" mov.s32 %r42, 0;\n" +" mov.f32 %f50, 0f00000000; \n" +" mov.f32 %f51, 0f00000000; \n" +" mov.f32 %f52, 0f00000000; \n" +" mov.s32 %r43, %r36;\n" +"$Lt_2_12802:\n" +" .loc 17 309 0\n" +" add.s32 %r44, %r42, %r1;\n" +" mov.s32 %r45, %r44;\n" +" setp.lt.s32 %p8, %r44, %r42;\n" +" @%p8 bra $Lt_2_17154;\n" +" cvt.s64.s32 %rd23, %r2;\n" 
+" mul.wide.s32 %rd13, %r2, 4;\n" +" cvt.s64.s32 %rd24, %r44;\n" +" mul.wide.s32 %rd25, %r44, 4;\n" +" add.u64 %rd26, %rd1, %rd25;\n" +" mov.f32 %f53, 0f00000000; \n" +"$Lt_2_13570:\n" +" .loc 17 310 0\n" +" ld.shared.f32 %f54, [%rd26+0];\n" +" fma.rn.ftz.f32 %f53, %f45, %f53, %f54;\n" +" sub.s32 %r45, %r45, %r2;\n" +" sub.u64 %rd26, %rd26, %rd13;\n" +" setp.ge.s32 %p9, %r45, %r42;\n" +" @%p9 bra $Lt_2_13570;\n" +" bra.uni $Lt_2_13058;\n" +"$Lt_2_17154:\n" +" mov.f32 %f53, 0f00000000; \n" +"$Lt_2_13058:\n" +" .loc 17 312 0\n" +" mov.s32 %r46, %r41;\n" +" mov.s32 %r47, %r2;\n" +" mul.ftz.f32 %f55, %f15, %f53;\n" +" mov.s32 %r48, %r46;\n" +" mov.s64 %rd27, %rd19;\n" +" cvt.s64.s32 %rd28, %r46;\n" +" mul.wide.s32 %rd29, %r46, 16;\n" +" mov.s32 %r49, 0;\n" +" mov.s32 %r50, %r47;\n" +"$Lt_2_14594:\n" +" mov.s32 %r51, %r2;\n" +" mov.s32 %r52, %r48;\n" +" add.s32 %r53, %r48, %r2;\n" +" mov.s64 %rd30, %rd18;\n" +" ld.shared.f32 %f56, [%rd27+0];\n" +" add.u64 %rd31, %rd29, %rd22;\n" +" mul.ftz.f32 %f57, %f55, %f56;\n" +" mov.s32 %r54, %r51;\n" +"$Lt_2_15362:\n" +" .loc 17 316 0\n" +" ld.shared.f32 %f58, [%rd30+0];\n" +" mul.ftz.f32 %f59, %f58, %f57;\n" +" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];\n" +" .loc 17 318 0\n" +" mul.ftz.f32 %f63, %f59, %f60;\n" +" sub.ftz.f32 %f52, %f52, %f63;\n" +" .loc 17 319 0\n" +" mul.ftz.f32 %f64, %f59, %f61;\n" +" sub.ftz.f32 %f51, %f51, %f64;\n" +" .loc 17 320 0\n" +" mul.ftz.f32 %f65, %f59, %f62;\n" +" sub.ftz.f32 %f50, %f50, %f65;\n" +" add.s32 %r52, %r52, 1;\n" +" add.u64 %rd31, %rd31, 16;\n" +" add.u64 %rd30, %rd30, 256;\n" +" setp.ne.s32 %p10, %r52, %r53;\n" +" @%p10 bra $Lt_2_15362;\n" +" add.s32 %r49, %r49, 1;\n" +" add.s32 %r48, %r48, %r38;\n" +" add.u64 %rd29, %rd29, %rd21;\n" +" add.u64 %rd27, %rd27, 256;\n" +" setp.ne.s32 %p11, %r49, %r2;\n" +" @%p11 bra $Lt_2_14594;\n" +" add.s32 %r42, %r42, 1;\n" +" add.s32 %r41, %r46, %r33;\n" +" setp.ne.s32 %p12, %r42, %r2;\n" +" @%p12 bra $Lt_2_12802;\n" +" bra.uni $Lt_2_9730;\n" +"$Lt_2_16898:\n" +" mov.f32 %f50, 0f00000000; \n" +" mov.f32 %f51, 0f00000000; \n" +" mov.f32 %f52, 0f00000000; \n" +" bra.uni $Lt_2_9730;\n" +"$Lt_2_9986:\n" +" mov.f32 %f50, 0f00000000; \n" +" mov.f32 %f51, 0f00000000; \n" +" mov.f32 %f52, 0f00000000; \n" +"$Lt_2_9730:\n" +" .loc 17 327 0\n" +" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n" +" cvt.s64.s32 %rd33, %r8;\n" +" mul.wide.s32 %rd34, %r8, 16;\n" +" add.u64 %rd35, %rd32, %rd34;\n" +" mov.f32 %f66, %f67;\n" +" st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};\n" +"$Lt_2_9218:\n" +" .loc 17 329 0\n" +" exit;\n" +"$LDWend_interp:\n" +" }\n" +;