Page MenuHomec4science

gayberne_lj.ptx
No OneTemporary

File Metadata

Created
Thu, Dec 26, 18:25

gayberne_lj.ptx

.version 2.3
.target sm_20
.address_size 64
// Module preamble: PTX ISA 2.3, target architecture sm_20, 64-bit addressing.
// The comment block below was emitted by the compiler and records how the
// module was built.
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009b93_00000000-9_lal_gayberne_lj.cpp3.i (/home/sjplimp/ccBI#.hcleqA)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
// Source-file table. The numeric ids declared here are the first operand of
// the `.loc <file> <line> <column>` directives used inside the kernels
// (for example, `.loc 17 ...` refers to lal_gayberne_lj.cu and `.loc 16 ...`
// to lal_ellipsoid_extra.h). Do not renumber entries without updating every
// `.loc` that references them.
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009b93_00000000-8_lal_gayberne_lj.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_ellipsoid_extra.h"
.file 17 "lal_gayberne_lj.cu"
.file 18 "/usr/local/cuda/include/common_functions.h"
.file 19 "/usr/local/cuda/include/math_functions.h"
.file 20 "/usr/local/cuda/include/math_constants.h"
.file 21 "/usr/local/cuda/include/device_functions.h"
.file 22 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 24 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 26 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 27 "/usr/local/cuda/include/surface_functions.h"
.file 28 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 29 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
//-----------------------------------------------------------
// kernel_sphere_ellipsoid
//
// Compiler-generated PTX; `.loc 17` lines refer to lal_gayberne_lj.cu and
// `.loc 16` lines to lal_ellipsoid_extra.h (see the .file table).
//
// Per-thread outline, as established by the instructions below:
//   1. Copy gum[3..6] into a 4-float shared table (sp_lj).
//   2. ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom) + start;
//      threads with ii >= inum exit.
//   3. Read dev_nbor[ii] (index used to fetch x_) and dev_nbor[ii + stride]
//      (entry count); entries start at dev_nbor[ii + 2*stride].
//   4. Loop over the entries assigned to this thread; for each one build and
//      solve two 3x3 linear systems (Gaussian elimination with row swaps),
//      then accumulate three force-like sums, one energy-like sum (only if
//      eflag > 0) and six virial-like sums (only if vflag > 0).
//   5. If t_per_atom > 1, sum the per-thread partials through shared memory.
//   6. The thread whose (tid.x & (t_per_atom-1)) == 0 writes to engv and ans.
//
// Parameters, described by how the code uses them:
//   x_        16-byte records read as 4 x f32; the 4th float is truncated to
//             an integer and used as a type index.
//   q         16-byte records (4 x f32), indexed like x_ for each neighbour.
//   shape     16-byte records indexed by type (first float used for the
//             x_[dev_nbor[ii]] record, first three floats for a neighbour).
//   well      same layout and usage pattern as shape.
//   gum       f32 array: [0],[1],[2] are used as scalars, [3..6] fill sp_lj.
//   sig_eps   8-byte records (2 x f32) indexed by itype*ntypes + jtype.
//   ntypes    row length used for the sig_eps index.
//   lshape    f32 per type.
//   dev_nbor  s32 array, layout as in step 3; consecutive entries of one ii
//             are `stride` elements apart.
//   stride    element stride inside dev_nbor.
//   ans       16-byte output records indexed by ii.
//   engv      (___val_paramengv) f32 output, values spaced inum elements apart.
//   err_flag  s32 set to 2 when an elimination meets a zero pivot.
//   eflag     > 0 enables the energy-like accumulation and store.
//   vflag     > 0 enables the virial-like accumulation and store.
//   start, inum, t_per_atom   see steps 2, 5 and 6.
//
// NOTE(review): the lane offset is computed as tid.x & (t_per_atom - 1),
// which equals tid.x % t_per_atom only for power-of-two t_per_atom --
// presumably guaranteed by the host code; TODO confirm.
// NOTE(review): physical meaning (Gay-Berne / LJ terms) is inferred from the
// file name only and is therefore not asserted in the comments below.
//-----------------------------------------------------------
.entry kernel_sphere_ellipsoid (
.param .u64 __cudaparm_kernel_sphere_ellipsoid_x_,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_q,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_shape,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_well,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_gum,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_sig_eps,
.param .s32 __cudaparm_kernel_sphere_ellipsoid_ntypes,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_lshape,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_dev_nbor,
.param .s32 __cudaparm_kernel_sphere_ellipsoid_stride,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_ans,
.param .u64 __cudaparm_kernel_sphere_ellipsoid___val_paramengv,
.param .u64 __cudaparm_kernel_sphere_ellipsoid_err_flag,
.param .s32 __cudaparm_kernel_sphere_ellipsoid_eflag,
.param .s32 __cudaparm_kernel_sphere_ellipsoid_vflag,
.param .s32 __cudaparm_kernel_sphere_ellipsoid_start,
.param .s32 __cudaparm_kernel_sphere_ellipsoid_inum,
.param .s32 __cudaparm_kernel_sphere_ellipsoid_t_per_atom)
{
.reg .u32 %r<59>;
.reg .u64 %rd<79>;
.reg .f32 %f<432>;
.reg .pred %p<35>;
// Shared scratch: sp_lj is 16 bytes (4 floats); red_acc is 3072 bytes and is
// addressed below as six planes 512 bytes apart, each indexed by tid.x*4.
.shared .align 16 .b8 __cuda___cuda_local_var_32888_33_non_const_sp_lj124[16];
.shared .align 4 .b8 __cuda___cuda_local_var_33089_55_non_const_red_acc140[3072];
// __cuda_local_var_32895_10_non_const_f = 48
// __cuda_local_var_32899_9_non_const_virial = 16
.loc 17 28 0
$LDWbegin_kernel_sphere_ellipsoid:
// Every thread copies gum[3..6] (byte offsets 12,16,20,24) into sp_lj.
// No bar.sync is issued anywhere in this kernel; each thread later reads
// sp_lj entries that all threads wrote with the same values.
.loc 17 34 0
ld.param.u64 %rd1, [__cudaparm_kernel_sphere_ellipsoid_gum];
ldu.global.f32 %f1, [%rd1+12];
.loc 17 35 0
ld.global.f32 %f2, [%rd1+16];
.loc 17 36 0
ld.global.f32 %f3, [%rd1+20];
.loc 17 37 0
ld.global.f32 %f4, [%rd1+24];
st.shared.v4.f32 [__cuda___cuda_local_var_32888_33_non_const_sp_lj124+0], {%f1,%f2,%f3,%f4};
// Zero the six accumulators %f6,%f8,%f10,%f12,%f14,%f16 (presumably the
// `virial` local named in the compiler note above).
.loc 17 46 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// %r10 = ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom) + start.
// Threads with ii >= inum branch to the exit label.
ld.param.s32 %r1, [__cudaparm_kernel_sphere_ellipsoid_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_sphere_ellipsoid_start];
add.s32 %r10, %r9, %r8;
ld.param.s32 %r11, [__cudaparm_kernel_sphere_ellipsoid_inum];
setp.ge.s32 %p1, %r10, %r11;
@%p1 bra $Lt_0_55042;
// %r12 = dev_nbor[ii]; %r14 = dev_nbor[ii + stride] (entry count, see the
// end-pointer computation below). %rd3 = ii*4 is reused for the engv store.
.loc 17 51 0
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_sphere_ellipsoid_dev_nbor];
add.u64 %rd5, %rd4, %rd3;
ld.global.s32 %r12, [%rd5+0];
ld.param.s32 %r13, [__cudaparm_kernel_sphere_ellipsoid_stride];
cvt.s64.s32 %rd6, %r13;
mul.wide.s32 %rd7, %r13, 4;
add.u64 %rd8, %rd7, %rd5;
ld.global.s32 %r14, [%rd8+0];
// {%f17,%f18,%f19,%f20} = x_[%r12] (16-byte record).
.loc 17 54 0
ld.param.u64 %rd9, [__cudaparm_kernel_sphere_ellipsoid_x_];
cvt.s64.s32 %rd10, %r12;
mul.wide.s32 %rd11, %r12, 16;
add.u64 %rd12, %rd9, %rd11;
ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0];
// %r15 = itype = (int)%f20, truncated toward zero.
// %f21 = first float of shape[itype]; %f22 = first float of well[itype].
.loc 17 57 0
cvt.rzi.ftz.s32.f32 %r15, %f20;
cvt.s64.s32 %rd13, %r15;
mul.wide.s32 %rd14, %r15, 16;
ld.param.u64 %rd15, [__cudaparm_kernel_sphere_ellipsoid_shape];
add.u64 %rd16, %rd14, %rd15;
ld.global.f32 %f21, [%rd16+0];
.loc 17 58 0
ld.param.u64 %rd17, [__cudaparm_kernel_sphere_ellipsoid_well];
add.u64 %rd18, %rd14, %rd17;
ld.global.f32 %f22, [%rd18+0];
// %r18 = tid.x & (t_per_atom - 1): this thread's offset inside its group.
// %rd19 = &dev_nbor[ii + 2*stride]            (first entry)
// %rd22 = %rd19 + 4*stride*%r18               (this thread's first entry)
// %rd26 = %rd19 + 4*stride*%r14               (one past the last entry)
// If the thread's start is already at/after the end, skip the loop.
cvt.s32.s64 %r16, %rd6;
sub.s32 %r17, %r1, 1;
and.b32 %r18, %r17, %r2;
add.u64 %rd19, %rd7, %rd8;
mul.lo.s32 %r19, %r16, %r18;
cvt.s64.s32 %rd20, %r19;
mul.wide.s32 %rd21, %r19, 4;
add.u64 %rd22, %rd19, %rd21;
mov.s64 %rd23, %rd22;
mul.lo.s32 %r20, %r16, %r14;
cvt.s64.s32 %rd24, %r20;
mul.wide.s32 %rd25, %r20, 4;
add.u64 %rd26, %rd19, %rd25;
setp.ge.u64 %p2, %rd22, %rd26;
@%p2 bra $Lt_0_56578;
// Loop-invariant setup:
//   %p3 = eflag > 0, %p4 = vflag > 0
//   %r26 = ntypes * itype (row base for sig_eps)
//   %rd29 = &lshape[itype], %rd30 = sig_eps, %rd31 = q, %rd32 = &sp_lj
//   %f25,%f24,%f23 = three force-like sums, %f26 = energy-like sum, all 0.
ld.param.s32 %r21, [__cudaparm_kernel_sphere_ellipsoid_eflag];
mov.s32 %r22, 0;
setp.gt.s32 %p3, %r21, %r22;
ld.param.s32 %r23, [__cudaparm_kernel_sphere_ellipsoid_vflag];
mov.s32 %r24, 0;
setp.gt.s32 %p4, %r23, %r24;
ld.param.s32 %r25, [__cudaparm_kernel_sphere_ellipsoid_ntypes];
mul.lo.s32 %r26, %r25, %r15;
ld.param.u64 %rd27, [__cudaparm_kernel_sphere_ellipsoid_lshape];
mul.lo.u64 %rd28, %rd13, 4;
add.u64 %rd29, %rd27, %rd28;
ld.param.u64 %rd30, [__cudaparm_kernel_sphere_ellipsoid_sig_eps];
ld.param.u64 %rd31, [__cudaparm_kernel_sphere_ellipsoid_q];
mov.f32 %f23, 0f00000000; // 0
mov.f32 %f24, 0f00000000; // 0
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.u64 %rd32, __cuda___cuda_local_var_32888_33_non_const_sp_lj124;
$Lt_0_40706:
//<loop> Loop body line 58, nesting depth: 1, estimated iterations: unknown
// %r27 = packed entry read from the list pointer %rd23.
.loc 17 63 0
ld.global.s32 %r27, [%rd23+0];
// %f27 = sp_lj[(%r27 >> 30) & 3]: the top two bits select one of the four
// gum[3..6] factors.
.loc 17 64 0
shr.s32 %r28, %r27, 30;
and.b32 %r29, %r28, 3;
cvt.s64.s32 %rd33, %r29;
mul.wide.s32 %rd34, %r29, 4;
add.u64 %rd35, %rd32, %rd34;
ld.shared.f32 %f27, [%rd35+0];
// %r30 = j = %r27 & 0x3FFFFFFF (low 30 bits); {%f28..%f31} = x_[j].
.loc 17 67 0
and.b32 %r30, %r27, 1073741823;
cvt.s64.s32 %rd36, %r30;
mul.wide.s32 %rd37, %r30, 16;
add.u64 %rd38, %rd37, %rd9;
ld.global.v4.f32 {%f28,%f29,%f30,%f31}, [%rd38+0];
// {%f32..%f35} = q[j].
.loc 17 86 0
add.u64 %rd39, %rd37, %rd31;
ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd39+0];
// %r31 = jtype = (int)%f31; {%f36,%f37,%f38} = first three floats of
// shape[jtype] (4th discarded).
.loc 17 95 0
cvt.rzi.ftz.s32.f32 %r31, %f31;
cvt.s64.s32 %rd40, %r31;
mul.wide.s32 %rd41, %r31, 16;
add.u64 %rd42, %rd41, %rd15;
ld.global.v4.f32 {%f36,%f37,%f38,_}, [%rd42+0];
// ---- First 3x3 system (.loc 16 299-310) --------------------------------
// Right-hand side: %f39 = x_j.x - x_i.x, %f90 = y difference, %f105 = z
// difference.
// The nine products of q components below match the standard
// quaternion-to-rotation-matrix formula with %f32 as the scalar part:
//   R row0 = (%f57, %f53, %f55), row1 = (%f68, %f72, %f70),
//   R row2 = (%f80, %f81, %f82).
// Matrix entries are M[a][b] = sum_k R[a][k] * s[k] * R[b][k] with
// s = (%f36,%f37,%f38), plus %f21 added on the diagonal.
// Working copy (modified by the elimination):
//   row0 = (%f64, %f78, %f89  | %f40)
//   row1 = (%f95, %f100, %f104 | %f91)
//   row2 = (%f110, %f114, %f119 | %f106)
// The unmodified entries (%f63,%f77,%f88,%f94,%f99,%f103,%f109,%f113,%f118)
// are kept for the determinant computed later.
.loc 16 299 0
sub.ftz.f32 %f39, %f28, %f17;
mov.f32 %f40, %f39;
.loc 16 300 0
add.ftz.f32 %f41, %f33, %f33;
add.ftz.f32 %f42, %f35, %f35;
mul.ftz.f32 %f43, %f32, %f32;
mul.ftz.f32 %f44, %f33, %f33;
mul.ftz.f32 %f45, %f34, %f34;
mul.ftz.f32 %f46, %f35, %f35;
add.ftz.f32 %f47, %f34, %f34;
mul.ftz.f32 %f48, %f41, %f34;
mul.ftz.f32 %f49, %f41, %f35;
mul.ftz.f32 %f50, %f42, %f32;
add.ftz.f32 %f51, %f43, %f44;
mul.ftz.f32 %f52, %f47, %f32;
sub.ftz.f32 %f53, %f48, %f50;
sub.ftz.f32 %f54, %f51, %f45;
add.ftz.f32 %f55, %f49, %f52;
mul.ftz.f32 %f56, %f53, %f37;
sub.ftz.f32 %f57, %f54, %f46;
mul.ftz.f32 %f58, %f55, %f38;
mul.ftz.f32 %f59, %f53, %f56;
mul.ftz.f32 %f60, %f57, %f36;
fma.rn.ftz.f32 %f61, %f57, %f60, %f59;
fma.rn.ftz.f32 %f62, %f55, %f58, %f61;
add.ftz.f32 %f63, %f62, %f21;
mov.f32 %f64, %f63;
.loc 16 301 0
mul.ftz.f32 %f65, %f41, %f32;
sub.ftz.f32 %f66, %f43, %f44;
mul.ftz.f32 %f67, %f47, %f35;
add.ftz.f32 %f68, %f48, %f50;
add.ftz.f32 %f69, %f45, %f66;
sub.ftz.f32 %f70, %f67, %f65;
mul.ftz.f32 %f71, %f68, %f36;
sub.ftz.f32 %f72, %f69, %f46;
mul.ftz.f32 %f73, %f70, %f38;
mul.ftz.f32 %f74, %f72, %f37;
mul.ftz.f32 %f75, %f53, %f74;
fma.rn.ftz.f32 %f76, %f57, %f71, %f75;
fma.rn.ftz.f32 %f77, %f55, %f73, %f76;
mov.f32 %f78, %f77;
.loc 16 302 0
sub.ftz.f32 %f79, %f66, %f45;
sub.ftz.f32 %f80, %f49, %f52;
add.ftz.f32 %f81, %f65, %f67;
add.ftz.f32 %f82, %f46, %f79;
mul.ftz.f32 %f83, %f80, %f36;
mul.ftz.f32 %f84, %f81, %f37;
mul.ftz.f32 %f85, %f82, %f38;
mul.ftz.f32 %f86, %f53, %f84;
fma.rn.ftz.f32 %f87, %f57, %f83, %f86;
fma.rn.ftz.f32 %f88, %f55, %f85, %f87;
mov.f32 %f89, %f88;
.loc 16 303 0
sub.ftz.f32 %f90, %f29, %f18;
mov.f32 %f91, %f90;
.loc 16 304 0
mul.ftz.f32 %f92, %f56, %f72;
fma.rn.ftz.f32 %f93, %f60, %f68, %f92;
fma.rn.ftz.f32 %f94, %f58, %f70, %f93;
mov.f32 %f95, %f94;
.loc 16 305 0
mul.ftz.f32 %f96, %f72, %f74;
fma.rn.ftz.f32 %f97, %f68, %f71, %f96;
fma.rn.ftz.f32 %f98, %f70, %f73, %f97;
add.ftz.f32 %f99, %f98, %f21;
mov.f32 %f100, %f99;
.loc 16 306 0
mul.ftz.f32 %f101, %f72, %f84;
fma.rn.ftz.f32 %f102, %f68, %f83, %f101;
fma.rn.ftz.f32 %f103, %f70, %f85, %f102;
mov.f32 %f104, %f103;
.loc 16 307 0
sub.ftz.f32 %f105, %f30, %f19;
mov.f32 %f106, %f105;
.loc 16 308 0
mul.ftz.f32 %f107, %f81, %f56;
fma.rn.ftz.f32 %f108, %f60, %f80, %f107;
fma.rn.ftz.f32 %f109, %f58, %f82, %f108;
mov.f32 %f110, %f109;
.loc 16 309 0
mul.ftz.f32 %f111, %f81, %f74;
fma.rn.ftz.f32 %f112, %f71, %f80, %f111;
fma.rn.ftz.f32 %f113, %f73, %f82, %f112;
mov.f32 %f114, %f113;
.loc 16 310 0
mul.ftz.f32 %f115, %f81, %f84;
fma.rn.ftz.f32 %f116, %f80, %f83, %f115;
fma.rn.ftz.f32 %f117, %f82, %f85, %f116;
add.ftz.f32 %f118, %f117, %f21;
mov.f32 %f119, %f118;
// ---- Elimination, column 0 ---------------------------------------------
// Row swaps so that row0 holds the largest |first-column| entry:
// swap row0<->row1 if |row1[0]| > |row0[0]| ...
abs.ftz.f32 %f120, %f94;
abs.ftz.f32 %f121, %f63;
setp.gt.ftz.f32 %p5, %f120, %f121;
@!%p5 bra $Lt_0_40962;
.loc 16 314 0
mov.f32 %f64, %f94;
mov.f32 %f95, %f63;
.loc 16 315 0
mov.f32 %f78, %f99;
mov.f32 %f100, %f77;
.loc 16 316 0
mov.f32 %f89, %f103;
mov.f32 %f104, %f88;
.loc 16 317 0
mov.f32 %f40, %f90;
mov.f32 %f91, %f39;
$Lt_0_40962:
// ... then swap row0<->row2 if |row0[0]| < |row2[0]|.
mov.f32 %f122, %f64;
abs.ftz.f32 %f123, %f122;
abs.ftz.f32 %f124, %f109;
setp.lt.ftz.f32 %p6, %f123, %f124;
@!%p6 bra $Lt_0_41474;
.loc 16 321 0
mov.f32 %f64, %f109;
mov.f32 %f110, %f122;
.loc 16 322 0
mov.f32 %f125, %f78;
mov.f32 %f78, %f113;
mov.f32 %f114, %f125;
.loc 16 323 0
mov.f32 %f126, %f89;
mov.f32 %f89, %f118;
mov.f32 %f119, %f126;
.loc 16 324 0
mov.f32 %f127, %f40;
mov.f32 %f40, %f105;
mov.f32 %f106, %f127;
$Lt_0_41474:
// Zero-pivot handling: if row0[0] == 0, try swapping in row1, then row2;
// if all three first-column entries are zero, write 2 to *err_flag.
mov.f32 %f128, %f64;
mov.f32 %f129, 0f00000000; // 0
setp.neu.ftz.f32 %p7, %f128, %f129;
@!%p7 bra $Lt_0_42242;
bra.uni $Lt_0_43010;
$Lt_0_42242:
mov.f32 %f130, 0f00000000; // 0
setp.neu.ftz.f32 %p8, %f95, %f130;
@!%p8 bra $Lt_0_42754;
.loc 16 338 0
mov.f32 %f64, %f95;
mov.f32 %f95, %f128;
.loc 16 339 0
mov.f32 %f131, %f78;
mov.f32 %f78, %f100;
mov.f32 %f100, %f131;
.loc 16 340 0
mov.f32 %f132, %f89;
mov.f32 %f89, %f104;
mov.f32 %f104, %f132;
.loc 16 341 0
mov.f32 %f133, %f40;
mov.f32 %f40, %f91;
mov.f32 %f91, %f133;
bra.uni $Lt_0_43010;
$Lt_0_42754:
mov.f32 %f134, 0f00000000; // 0
setp.neu.ftz.f32 %p9, %f110, %f134;
@!%p9 bra $Lt_0_43266;
.loc 16 346 0
mov.f32 %f64, %f110;
mov.f32 %f110, %f128;
.loc 16 347 0
mov.f32 %f135, %f78;
mov.f32 %f78, %f114;
mov.f32 %f114, %f135;
.loc 16 348 0
mov.f32 %f136, %f89;
mov.f32 %f89, %f119;
mov.f32 %f119, %f136;
.loc 16 349 0
mov.f32 %f137, %f40;
mov.f32 %f40, %f106;
mov.f32 %f106, %f137;
bra.uni $Lt_0_43010;
$Lt_0_43266:
// Error path: the flag is set, but execution falls through and still
// performs the divisions below with the zero pivot.
.loc 16 352 0
mov.s32 %r32, 2;
ld.param.u64 %rd43, [__cudaparm_kernel_sphere_ellipsoid_err_flag];
st.global.s32 [%rd43+0], %r32;
$Lt_0_43010:
$Lt_0_42498:
$Lt_0_41986:
// row1 -= (row1[0]/row0[0]) * row0 ; row2 -= (row2[0]/row0[0]) * row0
// (approximate division, flush-to-zero).
.loc 16 355 0
div.approx.ftz.f32 %f138, %f95, %f64;
mul.ftz.f32 %f139, %f78, %f138;
sub.ftz.f32 %f140, %f100, %f139;
mov.f32 %f100, %f140;
.loc 16 356 0
mul.ftz.f32 %f141, %f89, %f138;
sub.ftz.f32 %f142, %f104, %f141;
mov.f32 %f104, %f142;
.loc 16 357 0
mul.ftz.f32 %f143, %f40, %f138;
sub.ftz.f32 %f144, %f91, %f143;
mov.f32 %f91, %f144;
.loc 16 359 0
div.approx.ftz.f32 %f145, %f110, %f64;
mul.ftz.f32 %f146, %f78, %f145;
sub.ftz.f32 %f114, %f114, %f146;
.loc 16 360 0
mul.ftz.f32 %f147, %f89, %f145;
sub.ftz.f32 %f119, %f119, %f147;
.loc 16 361 0
mul.ftz.f32 %f148, %f40, %f145;
sub.ftz.f32 %f106, %f106, %f148;
// ---- Elimination, column 1 ---------------------------------------------
// Swap row1<->row2 if |row1[1]| < |row2[1]|.
abs.ftz.f32 %f149, %f140;
abs.ftz.f32 %f150, %f114;
setp.lt.ftz.f32 %p10, %f149, %f150;
@!%p10 bra $Lt_0_43522;
.loc 16 366 0
mov.f32 %f100, %f114;
mov.f32 %f114, %f140;
.loc 16 367 0
mov.f32 %f104, %f119;
mov.f32 %f119, %f142;
.loc 16 368 0
mov.f32 %f91, %f106;
mov.f32 %f106, %f144;
$Lt_0_43522:
// If row1[1] == 0 and row2[1] != 0, swap them; no error flag is written
// here when both are zero.
mov.f32 %f151, %f100;
mov.f32 %f152, 0f00000000; // 0
setp.neu.ftz.f32 %p11, %f151, %f152;
@!%p11 bra $Lt_0_44290;
bra.uni $Lt_0_44546;
$Lt_0_44290:
mov.f32 %f153, 0f00000000; // 0
setp.neu.ftz.f32 %p12, %f114, %f153;
@!%p12 bra $Lt_0_44546;
.loc 16 383 0
mov.f32 %f100, %f114;
mov.f32 %f114, %f151;
.loc 16 384 0
mov.f32 %f154, %f104;
mov.f32 %f104, %f119;
mov.f32 %f119, %f154;
.loc 16 385 0
mov.f32 %f155, %f91;
mov.f32 %f91, %f106;
mov.f32 %f106, %f155;
$Lt_0_44546:
$Lt_0_44034:
// row2 -= (row2[1]/row1[1]) * row1.
.loc 16 390 0
div.approx.ftz.f32 %f156, %f114, %f100;
mul.ftz.f32 %f157, %f104, %f156;
sub.ftz.f32 %f119, %f119, %f157;
.loc 16 391 0
mul.ftz.f32 %f158, %f91, %f156;
sub.ftz.f32 %f106, %f106, %f158;
// A zero final pivot (row2[2]) also writes 2 to *err_flag; execution
// continues into the back substitution regardless.
mov.f32 %f159, 0f00000000; // 0
setp.eq.ftz.f32 %p13, %f119, %f159;
@!%p13 bra $Lt_0_45058;
.loc 16 394 0
mov.s32 %r33, 2;
ld.param.u64 %rd44, [__cudaparm_kernel_sphere_ellipsoid_err_flag];
st.global.s32 [%rd44+0], %r33;
$Lt_0_45058:
// ---- Back substitution and distance terms ------------------------------
//   %f160 = sol[2], %f166 = sol[1], %f171 = sol[0]
//   %f165 = dx^2 + dy^2 + dz^2, %f167 = rsqrt(%f165) (1/r, approximate)
//   %f172 = sol[0]/r
.loc 17 115 0
div.approx.ftz.f32 %f160, %f106, %f119;
mul.ftz.f32 %f161, %f90, %f90;
mul.ftz.f32 %f162, %f160, %f104;
fma.rn.ftz.f32 %f163, %f39, %f39, %f161;
sub.ftz.f32 %f164, %f91, %f162;
fma.rn.ftz.f32 %f165, %f105, %f105, %f163;
div.approx.ftz.f32 %f166, %f164, %f100;
rsqrt.approx.ftz.f32 %f167, %f165;
mul.ftz.f32 %f168, %f166, %f78;
fma.rn.ftz.f32 %f169, %f89, %f160, %f168;
sub.ftz.f32 %f170, %f40, %f169;
div.approx.ftz.f32 %f171, %f170, %f64;
mul.ftz.f32 %f172, %f167, %f171;
// Unit separation vector: %f175 = dx/r, %f174 = dy/r, %f176 = dz/r.
// %f173 = sol[1]/r, %f177 = sol[2]/r.
// %f180 = dot(unit vector, sol/r); %f183 = rsqrt(0.5 * %f180).
.loc 17 127 0
mul.ftz.f32 %f173, %f166, %f167;
mul.ftz.f32 %f174, %f167, %f90;
mul.ftz.f32 %f175, %f167, %f39;
mul.ftz.f32 %f176, %f167, %f105;
mul.ftz.f32 %f177, %f160, %f167;
mul.ftz.f32 %f178, %f173, %f174;
fma.rn.ftz.f32 %f179, %f175, %f172, %f178;
fma.rn.ftz.f32 %f180, %f176, %f177, %f179;
mov.f32 %f181, 0f3f000000; // 0.5
mul.ftz.f32 %f182, %f180, %f181;
rsqrt.approx.ftz.f32 %f183, %f182;
// %f184 = 1/%f167 = r (approximate reciprocal); %f185 = r * sol[0]/r.
.loc 17 131 0
rcp.approx.ftz.f32 %f184, %f167;
mul.ftz.f32 %f185, %f184, %f172;
// {%f186, %f187} = sig_eps[itype*ntypes + jtype] (8-byte record).
.loc 17 136 0
add.s32 %r34, %r31, %r26;
cvt.s64.s32 %rd45, %r34;
mul.wide.s32 %rd46, %r34, 8;
add.u64 %rd47, %rd30, %rd46;
ld.global.v2.f32 {%f186,%f187}, [%rd47+0];
// %f190 = gum[0] * %f186 + (r - %f183).
.loc 17 138 0
sub.ftz.f32 %f188, %f184, %f183;
ld.global.f32 %f189, [%rd1+0];
fma.rn.ftz.f32 %f190, %f189, %f186, %f188;
// t = %f186 / %f190; %f194 = t^6, %f195 = t^12.
// %f203 = %f187 * 24 * (2*t^13 - t^7) / %f186.
.loc 17 145 0
div.approx.ftz.f32 %f191, %f186, %f190;
mul.ftz.f32 %f192, %f191, %f191;
mul.ftz.f32 %f193, %f191, %f192;
mul.ftz.f32 %f194, %f193, %f193;
mul.ftz.f32 %f195, %f194, %f194;
mul.ftz.f32 %f196, %f191, %f194;
add.ftz.f32 %f197, %f195, %f195;
mul.ftz.f32 %f198, %f191, %f197;
sub.ftz.f32 %f199, %f198, %f196;
div.approx.ftz.f32 %f200, %f199, %f186;
mov.f32 %f201, 0f41c00000; // 24
mul.ftz.f32 %f202, %f200, %f201;
mul.ftz.f32 %f203, %f187, %f202;
// First derivative-like vector (%f219, %f223, %f227) for x, y, z:
// each is unit_component * %f203 + %f212 * (v_component - unit_component * %f215),
// where %f212 = 0.5 * %f203 * %f183^3 / r^2 and %f215 = dot(unit, v).
.loc 17 150 0
mul.ftz.f32 %f204, %f183, %f203;
mul.ftz.f32 %f205, %f204, %f183;
mul.ftz.f32 %f206, %f205, %f183;
mov.f32 %f207, 0f3f000000; // 0.5
mul.ftz.f32 %f208, %f206, %f207;
mul.ftz.f32 %f209, %f208, %f167;
mul.ftz.f32 %f210, %f173, %f184;
mul.ftz.f32 %f211, %f177, %f184;
mul.ftz.f32 %f212, %f167, %f209;
mul.ftz.f32 %f213, %f174, %f210;
fma.rn.ftz.f32 %f214, %f175, %f185, %f213;
fma.rn.ftz.f32 %f215, %f176, %f211, %f214;
mul.ftz.f32 %f216, %f175, %f215;
sub.ftz.f32 %f217, %f185, %f216;
mul.ftz.f32 %f218, %f212, %f217;
fma.rn.ftz.f32 %f219, %f175, %f203, %f218;
.loc 17 151 0
mul.ftz.f32 %f220, %f174, %f215;
sub.ftz.f32 %f221, %f210, %f220;
mul.ftz.f32 %f222, %f212, %f221;
fma.rn.ftz.f32 %f223, %f174, %f203, %f222;
.loc 17 152 0
mul.ftz.f32 %f224, %f176, %f215;
sub.ftz.f32 %f225, %f211, %f224;
mul.ftz.f32 %f226, %f212, %f225;
fma.rn.ftz.f32 %f227, %f176, %f203, %f226;
// %f231 = lshape[jtype] * 2 * lshape[itype].
.loc 17 159 0
ld.global.f32 %f228, [%rd29+0];
mul.lo.u64 %rd48, %rd40, 4;
add.u64 %rd49, %rd27, %rd48;
ld.global.f32 %f229, [%rd49+0];
add.ftz.f32 %f230, %f228, %f228;
mul.ftz.f32 %f231, %f229, %f230;
// %f246 = determinant of the original (pre-elimination) first matrix
//   | %f63  %f77  %f88  |
//   | %f94  %f99  %f103 |
//   | %f109 %f113 %f118 |
.loc 17 160 0
mul.ftz.f32 %f232, %f103, %f63;
mul.ftz.f32 %f233, %f113, %f232;
mul.ftz.f32 %f234, %f99, %f63;
mul.ftz.f32 %f235, %f118, %f234;
sub.ftz.f32 %f236, %f235, %f233;
mul.ftz.f32 %f237, %f94, %f77;
mul.ftz.f32 %f238, %f118, %f237;
sub.ftz.f32 %f239, %f236, %f238;
mul.ftz.f32 %f240, %f94, %f88;
fma.rn.ftz.f32 %f241, %f113, %f240, %f239;
mul.ftz.f32 %f242, %f109, %f77;
fma.rn.ftz.f32 %f243, %f103, %f242, %f241;
mul.ftz.f32 %f244, %f109, %f88;
mul.ftz.f32 %f245, %f99, %f244;
sub.ftz.f32 %f246, %f243, %f245;
// %f247 = gum[1] (used as an exponent further down).
.loc 17 161 0
ld.global.f32 %f247, [%rd1+4];
// {%f248,%f249,%f250} = first three floats of well[jtype].
.loc 17 172 0
add.u64 %rd50, %rd41, %rd17;
ld.global.v4.f32 {%f248,%f249,%f250,_}, [%rd50+0];
// ---- Second 3x3 system --------------------------------------------------
// Same construction as the first system, reusing the rotation terms
// (%f53,%f55,%f57,%f68,%f70,%f72,%f80,%f81,%f82) but with
// s = (%f248,%f249,%f250) and %f22 added on the diagonal. The working
// registers %f64/%f78/%f89/%f40 etc. are overwritten; the right-hand side is
// again (%f39, %f90, %f105).
.loc 16 299 0
mov.f32 %f40, %f39;
.loc 16 300 0
mul.ftz.f32 %f251, %f53, %f249;
mul.ftz.f32 %f252, %f55, %f250;
mul.ftz.f32 %f253, %f53, %f251;
mul.ftz.f32 %f254, %f57, %f248;
fma.rn.ftz.f32 %f255, %f57, %f254, %f253;
fma.rn.ftz.f32 %f256, %f55, %f252, %f255;
add.ftz.f32 %f257, %f256, %f22;
mov.f32 %f64, %f257;
.loc 16 301 0
mul.ftz.f32 %f258, %f68, %f248;
mul.ftz.f32 %f259, %f70, %f250;
mul.ftz.f32 %f260, %f72, %f249;
mul.ftz.f32 %f261, %f53, %f260;
fma.rn.ftz.f32 %f262, %f57, %f258, %f261;
fma.rn.ftz.f32 %f263, %f55, %f259, %f262;
mov.f32 %f78, %f263;
.loc 16 302 0
mul.ftz.f32 %f264, %f80, %f248;
mul.ftz.f32 %f265, %f81, %f249;
mul.ftz.f32 %f266, %f82, %f250;
mul.ftz.f32 %f267, %f53, %f265;
fma.rn.ftz.f32 %f268, %f57, %f264, %f267;
fma.rn.ftz.f32 %f269, %f55, %f266, %f268;
mov.f32 %f89, %f269;
.loc 16 303 0
mov.f32 %f91, %f90;
.loc 16 304 0
mul.ftz.f32 %f270, %f251, %f72;
fma.rn.ftz.f32 %f271, %f254, %f68, %f270;
fma.rn.ftz.f32 %f272, %f252, %f70, %f271;
mov.f32 %f95, %f272;
.loc 16 305 0
mul.ftz.f32 %f273, %f72, %f260;
fma.rn.ftz.f32 %f274, %f68, %f258, %f273;
fma.rn.ftz.f32 %f275, %f70, %f259, %f274;
add.ftz.f32 %f100, %f22, %f275;
.loc 16 306 0
mul.ftz.f32 %f276, %f72, %f265;
fma.rn.ftz.f32 %f277, %f68, %f264, %f276;
fma.rn.ftz.f32 %f104, %f70, %f266, %f277;
.loc 16 307 0
mov.f32 %f106, %f105;
.loc 16 308 0
mul.ftz.f32 %f278, %f81, %f251;
fma.rn.ftz.f32 %f279, %f254, %f80, %f278;
fma.rn.ftz.f32 %f280, %f252, %f82, %f279;
mov.f32 %f110, %f280;
.loc 16 309 0
mul.ftz.f32 %f281, %f81, %f260;
fma.rn.ftz.f32 %f282, %f258, %f80, %f281;
fma.rn.ftz.f32 %f114, %f259, %f82, %f282;
.loc 16 310 0
mul.ftz.f32 %f283, %f81, %f265;
fma.rn.ftz.f32 %f284, %f80, %f264, %f283;
fma.rn.ftz.f32 %f285, %f82, %f266, %f284;
add.ftz.f32 %f119, %f22, %f285;
// ---- Second elimination, column 0 (same sequence as the first) ---------
abs.ftz.f32 %f286, %f272;
abs.ftz.f32 %f287, %f257;
setp.gt.ftz.f32 %p14, %f286, %f287;
@!%p14 bra $Lt_0_45570;
.loc 16 314 0
mov.f32 %f64, %f272;
mov.f32 %f95, %f257;
.loc 16 315 0
mov.f32 %f78, %f100;
mov.f32 %f100, %f263;
.loc 16 316 0
mov.f32 %f89, %f104;
mov.f32 %f104, %f269;
.loc 16 317 0
mov.f32 %f40, %f90;
mov.f32 %f91, %f39;
$Lt_0_45570:
mov.f32 %f288, %f64;
abs.ftz.f32 %f289, %f288;
abs.ftz.f32 %f290, %f280;
setp.lt.ftz.f32 %p15, %f289, %f290;
@!%p15 bra $Lt_0_46082;
.loc 16 321 0
mov.f32 %f64, %f280;
mov.f32 %f110, %f288;
.loc 16 322 0
mov.f32 %f291, %f78;
mov.f32 %f78, %f114;
mov.f32 %f114, %f291;
.loc 16 323 0
mov.f32 %f292, %f89;
mov.f32 %f89, %f119;
mov.f32 %f119, %f292;
.loc 16 324 0
mov.f32 %f293, %f40;
mov.f32 %f40, %f105;
mov.f32 %f106, %f293;
$Lt_0_46082:
// Zero-pivot handling for the second system (row1, then row2, else
// *err_flag = 2 and fall through).
mov.f32 %f294, %f64;
mov.f32 %f295, 0f00000000; // 0
setp.neu.ftz.f32 %p16, %f294, %f295;
@!%p16 bra $Lt_0_46850;
bra.uni $Lt_0_47618;
$Lt_0_46850:
mov.f32 %f296, 0f00000000; // 0
setp.neu.ftz.f32 %p17, %f95, %f296;
@!%p17 bra $Lt_0_47362;
.loc 16 338 0
mov.f32 %f64, %f95;
mov.f32 %f95, %f294;
.loc 16 339 0
mov.f32 %f297, %f78;
mov.f32 %f78, %f100;
mov.f32 %f100, %f297;
.loc 16 340 0
mov.f32 %f298, %f89;
mov.f32 %f89, %f104;
mov.f32 %f104, %f298;
.loc 16 341 0
mov.f32 %f299, %f40;
mov.f32 %f40, %f91;
mov.f32 %f91, %f299;
bra.uni $Lt_0_47618;
$Lt_0_47362:
mov.f32 %f300, 0f00000000; // 0
setp.neu.ftz.f32 %p18, %f110, %f300;
@!%p18 bra $Lt_0_47874;
.loc 16 346 0
mov.f32 %f64, %f110;
mov.f32 %f110, %f294;
.loc 16 347 0
mov.f32 %f301, %f78;
mov.f32 %f78, %f114;
mov.f32 %f114, %f301;
.loc 16 348 0
mov.f32 %f302, %f89;
mov.f32 %f89, %f119;
mov.f32 %f119, %f302;
.loc 16 349 0
mov.f32 %f303, %f40;
mov.f32 %f40, %f106;
mov.f32 %f106, %f303;
bra.uni $Lt_0_47618;
$Lt_0_47874:
.loc 16 352 0
mov.s32 %r35, 2;
ld.param.u64 %rd51, [__cudaparm_kernel_sphere_ellipsoid_err_flag];
st.global.s32 [%rd51+0], %r35;
$Lt_0_47618:
$Lt_0_47106:
$Lt_0_46594:
// Eliminate column 0 from row1 and row2.
.loc 16 355 0
div.approx.ftz.f32 %f304, %f95, %f64;
mul.ftz.f32 %f305, %f78, %f304;
sub.ftz.f32 %f306, %f100, %f305;
mov.f32 %f100, %f306;
.loc 16 356 0
mul.ftz.f32 %f307, %f89, %f304;
sub.ftz.f32 %f308, %f104, %f307;
mov.f32 %f104, %f308;
.loc 16 357 0
mul.ftz.f32 %f309, %f40, %f304;
sub.ftz.f32 %f310, %f91, %f309;
mov.f32 %f91, %f310;
.loc 16 359 0
div.approx.ftz.f32 %f311, %f110, %f64;
mul.ftz.f32 %f312, %f78, %f311;
sub.ftz.f32 %f114, %f114, %f312;
.loc 16 360 0
mul.ftz.f32 %f313, %f89, %f311;
sub.ftz.f32 %f119, %f119, %f313;
.loc 16 361 0
mul.ftz.f32 %f314, %f40, %f311;
sub.ftz.f32 %f106, %f106, %f314;
// ---- Second elimination, column 1 ---------------------------------------
abs.ftz.f32 %f315, %f306;
abs.ftz.f32 %f316, %f114;
setp.lt.ftz.f32 %p19, %f315, %f316;
@!%p19 bra $Lt_0_48130;
.loc 16 366 0
mov.f32 %f100, %f114;
mov.f32 %f114, %f306;
.loc 16 367 0
mov.f32 %f104, %f119;
mov.f32 %f119, %f308;
.loc 16 368 0
mov.f32 %f91, %f106;
mov.f32 %f106, %f310;
$Lt_0_48130:
mov.f32 %f317, %f100;
mov.f32 %f318, 0f00000000; // 0
setp.neu.ftz.f32 %p20, %f317, %f318;
@!%p20 bra $Lt_0_48898;
bra.uni $Lt_0_49154;
$Lt_0_48898:
mov.f32 %f319, 0f00000000; // 0
setp.neu.ftz.f32 %p21, %f114, %f319;
@!%p21 bra $Lt_0_49154;
.loc 16 383 0
mov.f32 %f100, %f114;
mov.f32 %f114, %f317;
.loc 16 384 0
mov.f32 %f320, %f104;
mov.f32 %f104, %f119;
mov.f32 %f119, %f320;
.loc 16 385 0
mov.f32 %f321, %f91;
mov.f32 %f91, %f106;
mov.f32 %f106, %f321;
$Lt_0_49154:
$Lt_0_48642:
.loc 16 390 0
div.approx.ftz.f32 %f322, %f114, %f100;
mul.ftz.f32 %f323, %f104, %f322;
sub.ftz.f32 %f119, %f119, %f323;
.loc 16 391 0
mul.ftz.f32 %f324, %f91, %f322;
sub.ftz.f32 %f106, %f106, %f324;
// Zero final pivot -> *err_flag = 2, then continue.
mov.f32 %f325, 0f00000000; // 0
setp.eq.ftz.f32 %p22, %f119, %f325;
@!%p22 bra $Lt_0_49666;
.loc 16 394 0
mov.s32 %r36, 2;
ld.param.u64 %rd52, [__cudaparm_kernel_sphere_ellipsoid_err_flag];
st.global.s32 [%rd52+0], %r36;
$Lt_0_49666:
// Back substitution for the second system:
//   %f326 = sol2[2], %f329 = sol2[1], %f333 = sol2[0]; %f334 = sol2[0]/r.
.loc 17 189 0
div.approx.ftz.f32 %f326, %f106, %f119;
mul.ftz.f32 %f327, %f326, %f104;
sub.ftz.f32 %f328, %f91, %f327;
div.approx.ftz.f32 %f329, %f328, %f100;
mul.ftz.f32 %f330, %f329, %f78;
fma.rn.ftz.f32 %f331, %f89, %f326, %f330;
sub.ftz.f32 %f332, %f40, %f331;
div.approx.ftz.f32 %f333, %f332, %f64;
mul.ftz.f32 %f334, %f167, %f333;
// %f335 = gum[2].
.loc 17 193 0
ld.global.f32 %f335, [%rd1+8];
// %f341 = 2 * dot(unit vector, sol2/r);
// %f344 = 2^(log2(%f341) * gum[2]), i.e. %f341 raised to gum[2] using the
// approximate lg2/ex2 instructions.
.loc 21 496 0
mul.ftz.f32 %f336, %f329, %f167;
mul.ftz.f32 %f337, %f326, %f167;
mul.ftz.f32 %f338, %f336, %f174;
fma.rn.ftz.f32 %f339, %f175, %f334, %f338;
fma.rn.ftz.f32 %f340, %f176, %f337, %f339;
add.ftz.f32 %f341, %f340, %f340;
lg2.approx.ftz.f32 %f342, %f341;
.loc 21 538 0
mul.ftz.f32 %f343, %f342, %f335;
ex2.approx.ftz.f32 %f344, %f343;
.loc 17 196 0
mul.ftz.f32 %f345, %f184, %f334;
// %f351 = %f344 ^ ((gum[2] - 1) / gum[2]) (via lg2/ex2);
// %f356 = %f351 * gum[2] * (-4 / r^2).
.loc 17 201 0
mov.f32 %f346, 0fbf800000; // -1
add.ftz.f32 %f347, %f335, %f346;
.loc 21 496 0
lg2.approx.ftz.f32 %f348, %f344;
.loc 17 201 0
div.approx.ftz.f32 %f349, %f347, %f335;
mul.ftz.f32 %f350, %f348, %f349;
ex2.approx.ftz.f32 %f351, %f350;
mov.f32 %f352, 0fc0800000; // -4
mul.ftz.f32 %f353, %f167, %f352;
mul.ftz.f32 %f354, %f167, %f353;
mul.ftz.f32 %f355, %f335, %f354;
mul.ftz.f32 %f356, %f351, %f355;
// Second derivative-like vector (%f364, %f367, %f370) for x, y, z:
// %f356 * (v2_component - unit_component * dot(unit, v2)).
.loc 17 203 0
mul.ftz.f32 %f357, %f336, %f184;
mul.ftz.f32 %f358, %f337, %f184;
mul.ftz.f32 %f359, %f174, %f357;
fma.rn.ftz.f32 %f360, %f175, %f345, %f359;
fma.rn.ftz.f32 %f361, %f176, %f358, %f360;
mul.ftz.f32 %f362, %f175, %f361;
sub.ftz.f32 %f363, %f345, %f362;
mul.ftz.f32 %f364, %f356, %f363;
.loc 17 204 0
mul.ftz.f32 %f365, %f174, %f361;
sub.ftz.f32 %f366, %f357, %f365;
mul.ftz.f32 %f367, %f356, %f366;
.loc 17 205 0
mul.ftz.f32 %f368, %f176, %f361;
sub.ftz.f32 %f369, %f358, %f368;
mul.ftz.f32 %f370, %f356, %f369;
// Combine the terms:
//   %f376 = (%f231 / %f246) ^ gum[1]            (via lg2/ex2)
//   %f379 = %f376 * %f27 * %f344
//   %f380 = 4 * %f187 * (t^12 - t^6)
//   %f26 += %f380 * %f379, kept only when eflag > 0 (selp on %p3)
//   per-axis contribution (%f389, %f391, %f393) =
//       -(%f376 * %f380 * %f27) * second vector - %f379 * first vector
.loc 16 396 0
mov.f32 %f371, 0f40800000; // 4
mul.ftz.f32 %f372, %f187, %f371;
div.approx.ftz.f32 %f373, %f231, %f246;
lg2.approx.ftz.f32 %f374, %f373;
mul.ftz.f32 %f375, %f374, %f247;
ex2.approx.ftz.f32 %f376, %f375;
mul.ftz.f32 %f377, %f376, %f27;
sub.ftz.f32 %f378, %f195, %f194;
mul.ftz.f32 %f379, %f377, %f344;
mul.ftz.f32 %f380, %f372, %f378;
fma.rn.ftz.f32 %f381, %f380, %f379, %f26;
selp.f32 %f26, %f381, %f26, %p3;
mul.ftz.f32 %f382, %f379, %f219;
mul.ftz.f32 %f383, %f379, %f223;
mul.ftz.f32 %f384, %f379, %f227;
mul.ftz.f32 %f385, %f376, %f380;
mul.ftz.f32 %f386, %f385, %f27;
neg.ftz.f32 %f387, %f386;
mul.ftz.f32 %f388, %f364, %f387;
sub.ftz.f32 %f389, %f388, %f382;
mul.ftz.f32 %f390, %f367, %f387;
sub.ftz.f32 %f391, %f390, %f383;
mul.ftz.f32 %f392, %f370, %f387;
sub.ftz.f32 %f393, %f392, %f384;
@!%p4 bra $Lt_0_50434;
// vflag > 0: accumulate the three sums (%f25,%f24,%f23) and the six
// products of (x_i - x_j) components with the contributions:
//   %f6  += dx*fx, %f8  += dy*fy, %f10 += dz*fz,
//   %f12 += dx*fy, %f14 += dx*fz, %f15/%f16 += dy*fz
// where dx = %f17-%f28, dy = %f18-%f29, dz = %f19-%f30.
.loc 17 217 0
add.ftz.f32 %f25, %f389, %f25;
.loc 17 218 0
sub.ftz.f32 %f394, %f17, %f28;
mov.f32 %f395, %f6;
fma.rn.ftz.f32 %f396, %f394, %f389, %f395;
mov.f32 %f6, %f396;
.loc 17 220 0
add.ftz.f32 %f24, %f391, %f24;
.loc 17 221 0
sub.ftz.f32 %f397, %f18, %f29;
mov.f32 %f398, %f8;
fma.rn.ftz.f32 %f399, %f397, %f391, %f398;
mov.f32 %f8, %f399;
.loc 17 222 0
mov.f32 %f400, %f12;
fma.rn.ftz.f32 %f401, %f394, %f391, %f400;
mov.f32 %f12, %f401;
.loc 17 224 0
add.ftz.f32 %f23, %f393, %f23;
.loc 17 225 0
mov.f32 %f402, %f10;
sub.ftz.f32 %f403, %f19, %f30;
fma.rn.ftz.f32 %f404, %f403, %f393, %f402;
mov.f32 %f10, %f404;
.loc 17 226 0
mov.f32 %f405, %f14;
fma.rn.ftz.f32 %f406, %f394, %f393, %f405;
mov.f32 %f14, %f406;
.loc 17 227 0
fma.rn.ftz.f32 %f15, %f397, %f393, %f15;
mov.f32 %f16, %f15;
bra.uni $Lt_0_50178;
$Lt_0_50434:
// vflag <= 0: accumulate only the three sums.
.loc 17 229 0
add.ftz.f32 %f25, %f389, %f25;
.loc 17 230 0
add.ftz.f32 %f24, %f391, %f24;
.loc 17 231 0
add.ftz.f32 %f23, %f393, %f23;
$Lt_0_50178:
// Advance the list pointer by stride * t_per_atom elements (4 bytes each)
// and loop while it is still below the end pointer %rd26.
mul.lo.s32 %r37, %r16, %r1;
cvt.s64.s32 %rd53, %r37;
mul.wide.s32 %rd54, %r37, 4;
add.u64 %rd23, %rd23, %rd54;
setp.gt.u64 %p23, %rd26, %rd23;
@%p23 bra $Lt_0_40706;
bra.uni $Lt_0_40194;
$Lt_0_56578:
// Empty-range path: the loop was skipped, so the sums start at zero.
mov.f32 %f23, 0f00000000; // 0
mov.f32 %f24, 0f00000000; // 0
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
$Lt_0_40194:
// ---- Reduction across the t_per_atom threads of a group ----------------
// Skipped entirely when t_per_atom <= 1.
mov.u32 %r38, 1;
setp.le.s32 %p24, %r1, %r38;
@%p24 bra $Lt_0_52994;
// %rd58 = &red_acc[tid.x] (4-byte slots). Planes at +0/+512/+1024/+1536
// receive %f25, %f24, %f23, %f26.
// NOTE(review): there is no bar.sync between these shared stores and the
// loads from other threads' slots below -- presumably relies on each group
// of t_per_atom threads executing in lock-step within one warp; confirm.
.loc 17 234 0
mov.u64 %rd55, __cuda___cuda_local_var_33089_55_non_const_red_acc140;
cvt.s64.s32 %rd56, %r2;
mul.wide.s32 %rd57, %r2, 4;
add.u64 %rd58, %rd55, %rd57;
mov.f32 %f407, %f25;
st.shared.f32 [%rd58+0], %f407;
mov.f32 %f408, %f24;
st.shared.f32 [%rd58+512], %f408;
mov.f32 %f409, %f23;
st.shared.f32 [%rd58+1024], %f409;
mov.f32 %f410, %f26;
st.shared.f32 [%rd58+1536], %f410;
// %r43 = t_per_atom / 2 (signed divide rounding toward zero, done with the
// sign-bit add + arithmetic shift). %p25 = (%r43 != 0) is reused for the
// second reduction.
shr.s32 %r39, %r1, 31;
mov.s32 %r40, 1;
and.b32 %r41, %r39, %r40;
add.s32 %r42, %r41, %r1;
shr.s32 %r43, %r42, 1;
mov.s32 %r44, %r43;
mov.u32 %r45, 0;
setp.ne.u32 %p25, %r43, %r45;
@!%p25 bra $Lt_0_51458;
$Lt_0_51970:
// For s = %r44, halving each pass: threads whose group offset %r18 < s add
// the four values stored by thread tid.x + s into their own and store back.
setp.ge.u32 %p26, %r18, %r44;
@%p26 bra $Lt_0_52226;
add.u32 %r46, %r2, %r44;
cvt.u64.u32 %rd59, %r46;
mul.wide.u32 %rd60, %r46, 4;
add.u64 %rd61, %rd55, %rd60;
ld.shared.f32 %f411, [%rd61+0];
add.ftz.f32 %f407, %f411, %f407;
st.shared.f32 [%rd58+0], %f407;
ld.shared.f32 %f412, [%rd61+512];
add.ftz.f32 %f408, %f412, %f408;
st.shared.f32 [%rd58+512], %f408;
ld.shared.f32 %f413, [%rd61+1024];
add.ftz.f32 %f409, %f413, %f409;
st.shared.f32 [%rd58+1024], %f409;
ld.shared.f32 %f414, [%rd61+1536];
add.ftz.f32 %f410, %f414, %f410;
st.shared.f32 [%rd58+1536], %f410;
$Lt_0_52226:
shr.u32 %r44, %r44, 1;
mov.u32 %r47, 0;
setp.ne.u32 %p27, %r44, %r47;
@%p27 bra $Lt_0_51970;
$Lt_0_51458:
// Copy the reduced values back into the accumulator registers.
mov.f32 %f25, %f407;
mov.f32 %f24, %f408;
mov.f32 %f23, %f409;
mov.f32 %f26, %f410;
// Second reduction, only when vflag > 0: the six virial-like sums use the
// same slots plus planes at +2048 and +2560.
ld.param.s32 %r48, [__cudaparm_kernel_sphere_ellipsoid_vflag];
mov.u32 %r49, 0;
setp.le.s32 %p28, %r48, %r49;
@%p28 bra $Lt_0_52994;
mov.f32 %f407, %f6;
st.shared.f32 [%rd58+0], %f407;
mov.f32 %f408, %f8;
st.shared.f32 [%rd58+512], %f408;
mov.f32 %f409, %f10;
st.shared.f32 [%rd58+1024], %f409;
mov.f32 %f410, %f12;
st.shared.f32 [%rd58+1536], %f410;
mov.f32 %f415, %f14;
st.shared.f32 [%rd58+2048], %f415;
mov.f32 %f416, %f15;
st.shared.f32 [%rd58+2560], %f416;
mov.s32 %r50, %r43;
@!%p25 bra $Lt_0_53506;
$Lt_0_54018:
setp.ge.u32 %p29, %r18, %r50;
@%p29 bra $Lt_0_54274;
add.u32 %r51, %r2, %r50;
cvt.u64.u32 %rd62, %r51;
mul.wide.u32 %rd63, %r51, 4;
add.u64 %rd64, %rd55, %rd63;
ld.shared.f32 %f417, [%rd64+0];
add.ftz.f32 %f407, %f417, %f407;
st.shared.f32 [%rd58+0], %f407;
ld.shared.f32 %f418, [%rd64+512];
add.ftz.f32 %f408, %f418, %f408;
st.shared.f32 [%rd58+512], %f408;
ld.shared.f32 %f419, [%rd64+1024];
add.ftz.f32 %f409, %f419, %f409;
st.shared.f32 [%rd58+1024], %f409;
ld.shared.f32 %f420, [%rd64+1536];
add.ftz.f32 %f410, %f420, %f410;
st.shared.f32 [%rd58+1536], %f410;
ld.shared.f32 %f421, [%rd64+2048];
add.ftz.f32 %f415, %f421, %f415;
st.shared.f32 [%rd58+2048], %f415;
ld.shared.f32 %f422, [%rd64+2560];
add.ftz.f32 %f416, %f422, %f416;
st.shared.f32 [%rd58+2560], %f416;
$Lt_0_54274:
shr.u32 %r50, %r50, 1;
mov.u32 %r52, 0;
setp.ne.u32 %p30, %r50, %r52;
@%p30 bra $Lt_0_54018;
$Lt_0_53506:
mov.f32 %f6, %f407;
mov.f32 %f8, %f408;
mov.f32 %f10, %f409;
mov.f32 %f12, %f410;
mov.f32 %f14, %f415;
mov.f32 %f16, %f416;
$Lt_0_52994:
$Lt_0_50946:
// ---- Output: only the thread with group offset 0 stores -----------------
mov.u32 %r53, 0;
setp.ne.s32 %p31, %r18, %r53;
@%p31 bra $Lt_0_55042;
// %rd66 = &engv[ii]. If eflag > 0, store %f26 there and advance by inum
// elements.
ld.param.u64 %rd65, [__cudaparm_kernel_sphere_ellipsoid___val_paramengv];
add.u64 %rd66, %rd65, %rd3;
ld.param.s32 %r54, [__cudaparm_kernel_sphere_ellipsoid_eflag];
mov.u32 %r55, 0;
setp.le.s32 %p32, %r54, %r55;
@%p32 bra $Lt_0_55554;
st.global.f32 [%rd66+0], %f26;
cvt.s64.s32 %rd67, %r11;
mul.wide.s32 %rd68, %r11, 4;
add.u64 %rd66, %rd66, %rd68;
$Lt_0_55554:
// If vflag > 0, store %f6, %f8, %f10, %f12, %f14, %f16 at successive
// offsets of inum elements starting from the current engv pointer.
ld.param.s32 %r56, [__cudaparm_kernel_sphere_ellipsoid_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p33, %r56, %r57;
@%p33 bra $Lt_0_56066;
mov.f32 %f423, %f6;
st.global.f32 [%rd66+0], %f423;
cvt.s64.s32 %rd69, %r11;
mul.wide.s32 %rd70, %r11, 4;
add.u64 %rd71, %rd70, %rd66;
mov.f32 %f424, %f8;
st.global.f32 [%rd71+0], %f424;
add.u64 %rd72, %rd70, %rd71;
mov.f32 %f425, %f10;
st.global.f32 [%rd72+0], %f425;
add.u64 %rd73, %rd70, %rd72;
mov.f32 %f426, %f12;
st.global.f32 [%rd73+0], %f426;
add.u64 %rd66, %rd70, %rd73;
mov.f32 %f427, %f14;
st.global.f32 [%rd66+0], %f427;
mov.f32 %f428, %f16;
add.u64 %rd74, %rd70, %rd66;
st.global.f32 [%rd74+0], %f428;
$Lt_0_56066:
// ans[ii] = {%f25, %f24, %f23, %f429} (16-byte record).
// NOTE(review): %f430 is never written in this kernel, so the fourth
// component stored here is an undefined value -- presumably the consumer
// ignores it; confirm against the host code.
ld.param.u64 %rd75, [__cudaparm_kernel_sphere_ellipsoid_ans];
mul.lo.u64 %rd76, %rd2, 16;
add.u64 %rd77, %rd75, %rd76;
mov.f32 %f429, %f430;
st.global.v4.f32 [%rd77+0], {%f25,%f24,%f23,%f429};
$Lt_0_55042:
$Lt_0_39682:
.loc 17 237 0
exit;
$LDWend_kernel_sphere_ellipsoid:
} // kernel_sphere_ellipsoid
//-----------------------------------------------------------
// kernel_lj  (nvopencc output for lal_gayberne_lj.cu, source lines 246-320)
//
// Work split: threads are grouped in runs of t_per_atom; each group handles
// one list slot
//     ii = start + tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// and exits at once when ii >= inum.  Inside a group, the lane
//     offset = tid.x & (t_per_atom - 1)
// walks every t_per_atom-th neighbor entry.
//
// Parameters (as used by the code below):
//   x_          16-byte records loaded as v4.f32; .x/.y/.z are differenced
//               between the two particles, .w is truncated to an integer.
//   lj1         16-byte records indexed by lj_types*int(x[i].w)+int(x[j].w);
//               .z is the upper bound on the squared distance, .w must equal
//               0.0 for the pair to be processed, .x/.y enter the force factor.
//   lj3         same indexing; .x/.y/.z enter the sum enabled by eflag.
//   lj_types    row length of the lj1/lj3 tables.
//   gum         floats at byte offsets 12..24 are staged in shared memory and
//               selected by the top two bits of each neighbor entry.
//   stride      spacing, in 4-byte elements, between rows of dev_ij.
//   dev_ij      s32 array: [ii] = index i into x_, [ii+stride] = entry count,
//               neighbor entries start at [ii+2*stride], spaced by stride.
//   ans         v4.f32 per ii; x/y/z are incremented, w is written back as read.
//   __val_paramengv
//               f32 array; slots spaced by inum are incremented with the
//               eflag sum (if eflag > 0) and six virial sums (if vflag > 0).
//   err_flag    not referenced in this kernel.
//   eflag/vflag values > 0 enable the respective accumulation paths.
//   start, inum, t_per_atom   see "Work split" above.
//
// NOTE(review): offset equals tid.x % t_per_atom only when t_per_atom is a
// power of two -- confirm the host enforces this.
//-----------------------------------------------------------
.entry kernel_lj (
.param .u64 __cudaparm_kernel_lj_x_,
.param .u64 __cudaparm_kernel_lj_lj1,
.param .u64 __cudaparm_kernel_lj_lj3,
.param .s32 __cudaparm_kernel_lj_lj_types,
.param .u64 __cudaparm_kernel_lj_gum,
.param .s32 __cudaparm_kernel_lj_stride,
.param .u64 __cudaparm_kernel_lj_dev_ij,
.param .u64 __cudaparm_kernel_lj_ans,
.param .u64 __cudaparm_kernel_lj___val_paramengv,
.param .u64 __cudaparm_kernel_lj_err_flag,
.param .s32 __cudaparm_kernel_lj_eflag,
.param .s32 __cudaparm_kernel_lj_vflag,
.param .s32 __cudaparm_kernel_lj_start,
.param .s32 __cudaparm_kernel_lj_inum,
.param .s32 __cudaparm_kernel_lj_t_per_atom)
{
.reg .u32 %r<55>;
.reg .u64 %rd<60>;
.reg .f32 %f<115>;
.reg .pred %p<19>;
// sp_lj: four floats copied from gum (16 bytes).
.shared .align 16 .b8 __cuda___cuda_local_var_33106_33_non_const_sp_lj3316[16];
// red_acc: 3072 bytes, addressed below as six rows of 512 bytes (128 floats)
// indexed by tid.x.
// NOTE(review): this layout assumes ntid.x <= 128 -- confirm launch config.
.shared .align 4 .b8 __cuda___cuda_local_var_33172_55_non_const_red_acc3332[3072];
// __cuda_local_var_33117_9_non_const_virial = 16
.loc 17 246 0
$LDWbegin_kernel_lj:
.loc 17 252 0
// Every thread copies gum[3..6] (byte offsets 12..24) into sp_lj[0..3].
ld.param.u64 %rd1, [__cudaparm_kernel_lj_gum];
ldu.global.f32 %f1, [%rd1+12];
.loc 17 253 0
ld.global.f32 %f2, [%rd1+16];
.loc 17 254 0
ld.global.f32 %f3, [%rd1+20];
.loc 17 255 0
ld.global.f32 %f4, [%rd1+24];
st.shared.v4.f32 [__cuda___cuda_local_var_33106_33_non_const_sp_lj3316+0], {%f1,%f2,%f3,%f4};
.loc 17 264 0
// Zero the six virial accumulators: %f6, %f8, %f10, %f12, %f14, %f16.
// %f15 is a second copy of the sixth accumulator; the loop keeps %f16 == %f15.
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// %r1 = t_per_atom, %r2 = tid.x
// %r10 = ii = start + tid.x/t_per_atom + ctaid.x*(ntid.x/t_per_atom)
ld.param.s32 %r1, [__cudaparm_kernel_lj_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_lj_start];
add.s32 %r10, %r9, %r8;
ld.param.s32 %r11, [__cudaparm_kernel_lj_inum];
// Groups past the end of the work list (ii >= inum) exit.
setp.ge.s32 %p1, %r10, %r11;
@%p1 bra $Lt_1_25346;
.loc 17 269 0
// %rd3 = ii*4 (reused later to index engv); %rd5 = &dev_ij[ii]
// %r12 = dev_ij[ii] (index i), %r14 = dev_ij[ii+stride] (entry count)
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_lj_dev_ij];
add.u64 %rd5, %rd4, %rd3;
ld.global.s32 %r12, [%rd5+0];
ld.param.s32 %r13, [__cudaparm_kernel_lj_stride];
cvt.s64.s32 %rd6, %r13;
mul.wide.s32 %rd7, %r13, 4;
add.u64 %rd8, %rd7, %rd5;
ld.global.s32 %r14, [%rd8+0];
.loc 17 272 0
// {%f17,%f18,%f19,%f20} = x_[i]  (x, y, z, w)
ld.param.u64 %rd9, [__cudaparm_kernel_lj_x_];
cvt.s64.s32 %rd10, %r12;
mul.wide.s32 %rd11, %r12, 16;
add.u64 %rd12, %rd9, %rd11;
ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0];
.loc 17 273 0
// %r17 = offset = tid.x & (t_per_atom-1)
// %rd13 = &dev_ij[ii+2*stride]            (first neighbor entry)
// %rd17 = %rd13 + offset*stride*4         (this lane's cursor)
// %rd20 = %rd13 + count*stride*4          (one past the last entry)
cvt.s32.s64 %r15, %rd6;
sub.s32 %r16, %r1, 1;
and.b32 %r17, %r16, %r2;
add.u64 %rd13, %rd7, %rd8;
mul.lo.s32 %r18, %r15, %r17;
cvt.s64.s32 %rd14, %r18;
mul.wide.s32 %rd15, %r18, 4;
add.u64 %rd16, %rd13, %rd15;
mov.s64 %rd17, %rd16;
mul.lo.s32 %r19, %r15, %r14;
cvt.s64.s32 %rd18, %r19;
mul.wide.s32 %rd19, %r19, 4;
add.u64 %rd20, %rd13, %rd19;
// Nothing to do for this lane when its cursor already reaches the end.
setp.ge.u64 %p2, %rd16, %rd20;
@%p2 bra $Lt_1_26882;
// %r22 = lj_types * int(x_[i].w): row base in the lj1/lj3 tables.
cvt.rzi.ftz.s32.f32 %r20, %f20;
ld.param.s32 %r21, [__cudaparm_kernel_lj_lj_types];
mul.lo.s32 %r22, %r21, %r20;
ld.param.u64 %rd21, [__cudaparm_kernel_lj_lj1];
// Per-lane sums: %f23/%f22/%f21 = x/y/z force components, %f24 = eflag sum.
mov.f32 %f21, 0f00000000; // 0
mov.f32 %f22, 0f00000000; // 0
mov.f32 %f23, 0f00000000; // 0
mov.f32 %f24, 0f00000000; // 0
mov.u64 %rd22, __cuda___cuda_local_var_33106_33_non_const_sp_lj3316;
$Lt_1_19714:
//<loop> Loop body line 273, nesting depth: 1, estimated iterations: unknown
.loc 17 278 0
// %r23 = packed neighbor entry.
ld.global.s32 %r23, [%rd17+0];
.loc 17 279 0
// Bits 30..31 of the entry select the scale factor %f25 = sp_lj[bits].
shr.s32 %r24, %r23, 30;
and.b32 %r25, %r24, 3;
cvt.s64.s32 %rd23, %r25;
mul.wide.s32 %rd24, %r25, 4;
add.u64 %rd25, %rd22, %rd24;
ld.shared.f32 %f25, [%rd25+0];
.loc 17 282 0
// Low 30 bits (mask 0x3FFFFFFF) are index j; load x_[j].
and.b32 %r26, %r23, 1073741823;
cvt.s64.s32 %rd26, %r26;
mul.wide.s32 %rd27, %r26, 16;
add.u64 %rd28, %rd9, %rd27;
ld.global.v4.f32 {%f26,%f27,%f28,%f29}, [%rd28+0];
.loc 17 278 0
// %f31 = dx, %f30 = dy, %f32 = dz (i minus j); %f33 = dy*dy;
// %f35 = dx*dx + dy*dy + dz*dz (squared distance).
cvt.rzi.ftz.s32.f32 %r27, %f29;
sub.ftz.f32 %f30, %f18, %f27;
sub.ftz.f32 %f31, %f17, %f26;
sub.ftz.f32 %f32, %f19, %f28;
mul.ftz.f32 %f33, %f30, %f30;
fma.rn.ftz.f32 %f34, %f31, %f31, %f33;
fma.rn.ftz.f32 %f35, %f32, %f32, %f34;
// %r28 = table index = lj_types*int(x_[i].w) + int(x_[j].w);
// %rd30 = byte offset of that record (shared by lj1 and lj3).
add.s32 %r28, %r27, %r22;
cvt.s64.s32 %rd29, %r28;
mul.wide.s32 %rd30, %r28, 16;
add.u64 %rd31, %rd30, %rd21;
// Skip the pair unless squared distance < lj1[idx].z ...
ld.global.f32 %f36, [%rd31+8];
setp.gt.ftz.f32 %p3, %f36, %f35;
@!%p3 bra $Lt_1_27138;
// ... and lj1[idx].w == 0.0.
ld.global.f32 %f37, [%rd31+12];
mov.f32 %f38, 0f00000000; // 0
setp.eq.ftz.f32 %p4, %f37, %f38;
@!%p4 bra $Lt_1_27138;
.loc 17 296 0
// %f39 = 1/rsq (approximate reciprocal), %f41 = (1/rsq)^3, %f42 = (1/rsq)^4
// %f48 = sp_lj_factor * (1/rsq)^4 * (lj1.x*(1/rsq)^3 - lj1.y)
rcp.approx.ftz.f32 %f39, %f35;
mul.ftz.f32 %f40, %f39, %f39;
mul.ftz.f32 %f41, %f39, %f40;
mul.ftz.f32 %f42, %f39, %f41;
ld.global.v2.f32 {%f43,%f44}, [%rd31+0];
mul.ftz.f32 %f45, %f43, %f41;
sub.ftz.f32 %f46, %f45, %f44;
mul.ftz.f32 %f47, %f42, %f46;
mul.ftz.f32 %f48, %f25, %f47;
.loc 17 298 0
// Accumulate d{x,y,z} * %f48 into the x/y/z sums.
fma.rn.ftz.f32 %f23, %f31, %f48, %f23;
.loc 17 299 0
fma.rn.ftz.f32 %f22, %f30, %f48, %f22;
.loc 17 300 0
fma.rn.ftz.f32 %f21, %f32, %f48, %f21;
ld.param.s32 %r29, [__cudaparm_kernel_lj_eflag];
mov.u32 %r30, 0;
setp.le.s32 %p5, %r29, %r30;
@%p5 bra $Lt_1_19970;
.loc 17 304 0
// eflag > 0: %f24 += factor * ((1/rsq)^3 * (lj3.x*(1/rsq)^3 - lj3.y) - lj3.z)
ld.param.u64 %rd32, [__cudaparm_kernel_lj_lj3];
add.u64 %rd33, %rd32, %rd30;
ld.global.v4.f32 {%f49,%f50,%f51,_}, [%rd33+0];
mul.ftz.f32 %f52, %f49, %f41;
sub.ftz.f32 %f53, %f52, %f50;
mul.ftz.f32 %f54, %f41, %f53;
sub.ftz.f32 %f55, %f54, %f51;
fma.rn.ftz.f32 %f24, %f25, %f55, %f24;
$Lt_1_19970:
ld.param.s32 %r31, [__cudaparm_kernel_lj_vflag];
mov.u32 %r32, 0;
setp.le.s32 %p6, %r31, %r32;
@%p6 bra $Lt_1_27138;
.loc 17 307 0
// vflag > 0: accumulate the six products, each scaled by %f48:
//   %f6 += dx*dx, %f8 += dy*dy, %f10 += dz*dz,
//   %f12 += dx*dy, %f14 += dx*dz, %f15/%f16 += dy*dz
mov.f32 %f56, %f6;
mul.ftz.f32 %f57, %f31, %f31;
fma.rn.ftz.f32 %f58, %f48, %f57, %f56;
mov.f32 %f6, %f58;
.loc 17 308 0
mov.f32 %f59, %f8;
fma.rn.ftz.f32 %f60, %f48, %f33, %f59;
mov.f32 %f8, %f60;
.loc 17 309 0
mov.f32 %f61, %f10;
mul.ftz.f32 %f62, %f32, %f32;
fma.rn.ftz.f32 %f63, %f48, %f62, %f61;
mov.f32 %f10, %f63;
.loc 17 310 0
mov.f32 %f64, %f12;
mul.ftz.f32 %f65, %f30, %f31;
fma.rn.ftz.f32 %f66, %f48, %f65, %f64;
mov.f32 %f12, %f66;
.loc 17 311 0
mov.f32 %f67, %f14;
mul.ftz.f32 %f68, %f31, %f32;
fma.rn.ftz.f32 %f69, %f48, %f68, %f67;
mov.f32 %f14, %f69;
.loc 17 312 0
mul.ftz.f32 %f70, %f30, %f32;
fma.rn.ftz.f32 %f15, %f48, %f70, %f15;
mov.f32 %f16, %f15;
$Lt_1_27138:
$L_1_18178:
.loc 17 306 0
// Advance the cursor by stride*t_per_atom entries; loop while below the end.
mul.lo.s32 %r33, %r15, %r1;
cvt.s64.s32 %rd34, %r33;
mul.wide.s32 %rd35, %r33, 4;
add.u64 %rd17, %rd17, %rd35;
setp.gt.u64 %p7, %rd20, %rd17;
@%p7 bra $Lt_1_19714;
bra.uni $Lt_1_19202;
$Lt_1_26882:
// Loop skipped entirely: the four sums are zero.
mov.f32 %f21, 0f00000000; // 0
mov.f32 %f22, 0f00000000; // 0
mov.f32 %f23, 0f00000000; // 0
mov.f32 %f24, 0f00000000; // 0
$Lt_1_19202:
// With t_per_atom <= 1 there is nothing to combine across lanes.
mov.u32 %r34, 1;
setp.le.s32 %p8, %r1, %r34;
@%p8 bra $Lt_1_23298;
.loc 17 317 0
// Shared-memory reduction of x/y/z/eflag sums across the lanes of a group.
// %rd39 = &red_acc[0][tid.x]; rows are 512 bytes apart.
// NOTE(review): there is no barrier between these stores and the loads from
// other lanes' slots below; this presumably relies on all lanes of a group
// executing in lockstep within one warp -- confirm.
mov.u64 %rd36, __cuda___cuda_local_var_33172_55_non_const_red_acc3332;
cvt.s64.s32 %rd37, %r2;
mul.wide.s32 %rd38, %r2, 4;
add.u64 %rd39, %rd36, %rd38;
mov.f32 %f71, %f23;
st.shared.f32 [%rd39+0], %f71;
mov.f32 %f72, %f22;
st.shared.f32 [%rd39+512], %f72;
mov.f32 %f73, %f21;
st.shared.f32 [%rd39+1024], %f73;
mov.f32 %f74, %f24;
st.shared.f32 [%rd39+1536], %f74;
// %r39 = t_per_atom / 2 (signed division rounding toward zero);
// %p9 (half != 0) is reused by the virial reduction further down.
shr.s32 %r35, %r1, 31;
mov.s32 %r36, 1;
and.b32 %r37, %r35, %r36;
add.s32 %r38, %r37, %r1;
shr.s32 %r39, %r38, 1;
mov.s32 %r40, %r39;
mov.u32 %r41, 0;
setp.ne.u32 %p9, %r39, %r41;
@!%p9 bra $Lt_1_21762;
$Lt_1_22274:
// Lanes with offset < s add the values stored by thread tid.x + s.
setp.ge.u32 %p10, %r17, %r40;
@%p10 bra $Lt_1_22530;
add.u32 %r42, %r2, %r40;
cvt.u64.u32 %rd40, %r42;
mul.wide.u32 %rd41, %r42, 4;
add.u64 %rd42, %rd36, %rd41;
ld.shared.f32 %f75, [%rd42+0];
add.ftz.f32 %f71, %f75, %f71;
st.shared.f32 [%rd39+0], %f71;
ld.shared.f32 %f76, [%rd42+512];
add.ftz.f32 %f72, %f76, %f72;
st.shared.f32 [%rd39+512], %f72;
ld.shared.f32 %f77, [%rd42+1024];
add.ftz.f32 %f73, %f77, %f73;
st.shared.f32 [%rd39+1024], %f73;
ld.shared.f32 %f78, [%rd42+1536];
add.ftz.f32 %f74, %f78, %f74;
st.shared.f32 [%rd39+1536], %f74;
$Lt_1_22530:
// s >>= 1; repeat until s == 0.
shr.u32 %r40, %r40, 1;
mov.u32 %r43, 0;
setp.ne.u32 %p11, %r40, %r43;
@%p11 bra $Lt_1_22274;
$Lt_1_21762:
// Copy the reduced values back into the x/y/z/eflag sum registers.
mov.f32 %f23, %f71;
mov.f32 %f22, %f72;
mov.f32 %f21, %f73;
mov.f32 %f24, %f74;
ld.param.s32 %r44, [__cudaparm_kernel_lj_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p12, %r44, %r45;
@%p12 bra $Lt_1_23298;
// vflag > 0: same reduction over the six virial sums, using all six rows.
// The sixth value is taken from %f15, which equals %f16 at this point.
mov.f32 %f71, %f6;
st.shared.f32 [%rd39+0], %f71;
mov.f32 %f72, %f8;
st.shared.f32 [%rd39+512], %f72;
mov.f32 %f73, %f10;
st.shared.f32 [%rd39+1024], %f73;
mov.f32 %f74, %f12;
st.shared.f32 [%rd39+1536], %f74;
mov.f32 %f79, %f14;
st.shared.f32 [%rd39+2048], %f79;
mov.f32 %f80, %f15;
st.shared.f32 [%rd39+2560], %f80;
mov.s32 %r46, %r39;
@!%p9 bra $Lt_1_23810;
$Lt_1_24322:
setp.ge.u32 %p13, %r17, %r46;
@%p13 bra $Lt_1_24578;
add.u32 %r47, %r2, %r46;
cvt.u64.u32 %rd43, %r47;
mul.wide.u32 %rd44, %r47, 4;
add.u64 %rd45, %rd36, %rd44;
ld.shared.f32 %f81, [%rd45+0];
add.ftz.f32 %f71, %f81, %f71;
st.shared.f32 [%rd39+0], %f71;
ld.shared.f32 %f82, [%rd45+512];
add.ftz.f32 %f72, %f82, %f72;
st.shared.f32 [%rd39+512], %f72;
ld.shared.f32 %f83, [%rd45+1024];
add.ftz.f32 %f73, %f83, %f73;
st.shared.f32 [%rd39+1024], %f73;
ld.shared.f32 %f84, [%rd45+1536];
add.ftz.f32 %f74, %f84, %f74;
st.shared.f32 [%rd39+1536], %f74;
ld.shared.f32 %f85, [%rd45+2048];
add.ftz.f32 %f79, %f85, %f79;
st.shared.f32 [%rd39+2048], %f79;
ld.shared.f32 %f86, [%rd45+2560];
add.ftz.f32 %f80, %f86, %f80;
st.shared.f32 [%rd39+2560], %f80;
$Lt_1_24578:
shr.u32 %r46, %r46, 1;
mov.u32 %r48, 0;
setp.ne.u32 %p14, %r46, %r48;
@%p14 bra $Lt_1_24322;
$Lt_1_23810:
// Reduced virial sums back into %f6,%f8,%f10,%f12,%f14,%f16.
mov.f32 %f6, %f71;
mov.f32 %f8, %f72;
mov.f32 %f10, %f73;
mov.f32 %f12, %f74;
mov.f32 %f14, %f79;
mov.f32 %f16, %f80;
$Lt_1_23298:
$Lt_1_21250:
// Only the lane with offset == 0 writes results to global memory.
mov.u32 %r49, 0;
setp.ne.s32 %p15, %r17, %r49;
@%p15 bra $Lt_1_25346;
// %rd47 = &engv[ii]; it is advanced by inum floats after each value written.
ld.param.u64 %rd46, [__cudaparm_kernel_lj___val_paramengv];
add.u64 %rd47, %rd46, %rd3;
ld.param.s32 %r50, [__cudaparm_kernel_lj_eflag];
mov.u32 %r51, 0;
setp.le.s32 %p16, %r50, %r51;
@%p16 bra $Lt_1_25858;
// eflag > 0: engv slot += eflag sum (read-modify-write), then step by inum.
ld.global.f32 %f87, [%rd47+0];
add.ftz.f32 %f88, %f87, %f24;
st.global.f32 [%rd47+0], %f88;
cvt.s64.s32 %rd48, %r11;
mul.wide.s32 %rd49, %r11, 4;
add.u64 %rd47, %rd47, %rd49;
$Lt_1_25858:
ld.param.s32 %r52, [__cudaparm_kernel_lj_vflag];
mov.u32 %r53, 0;
setp.le.s32 %p17, %r52, %r53;
@%p17 bra $Lt_1_26370;
// vflag > 0: add the six virial sums to six consecutive slots, inum apart.
ld.global.f32 %f89, [%rd47+0];
mov.f32 %f90, %f6;
add.ftz.f32 %f91, %f89, %f90;
st.global.f32 [%rd47+0], %f91;
cvt.s64.s32 %rd50, %r11;
mul.wide.s32 %rd51, %r11, 4;
add.u64 %rd52, %rd51, %rd47;
ld.global.f32 %f92, [%rd52+0];
mov.f32 %f93, %f8;
add.ftz.f32 %f94, %f92, %f93;
st.global.f32 [%rd52+0], %f94;
add.u64 %rd53, %rd51, %rd52;
ld.global.f32 %f95, [%rd53+0];
mov.f32 %f96, %f10;
add.ftz.f32 %f97, %f95, %f96;
st.global.f32 [%rd53+0], %f97;
add.u64 %rd54, %rd51, %rd53;
ld.global.f32 %f98, [%rd54+0];
mov.f32 %f99, %f12;
add.ftz.f32 %f100, %f98, %f99;
st.global.f32 [%rd54+0], %f100;
add.u64 %rd55, %rd51, %rd54;
ld.global.f32 %f101, [%rd55+0];
mov.f32 %f102, %f14;
add.ftz.f32 %f103, %f101, %f102;
st.global.f32 [%rd55+0], %f103;
add.u64 %rd47, %rd51, %rd55;
ld.global.f32 %f104, [%rd47+0];
mov.f32 %f105, %f16;
add.ftz.f32 %f106, %f104, %f105;
st.global.f32 [%rd47+0], %f106;
$Lt_1_26370:
// ans[ii].xyz += (x, y, z sums); ans[ii].w is stored back unchanged.
ld.param.u64 %rd56, [__cudaparm_kernel_lj_ans];
mul.lo.u64 %rd57, %rd2, 16;
add.u64 %rd58, %rd56, %rd57;
ld.global.v4.f32 {%f107,%f108,%f109,%f110}, [%rd58+0];
add.ftz.f32 %f111, %f108, %f22;
add.ftz.f32 %f112, %f109, %f21;
add.ftz.f32 %f113, %f107, %f23;
st.global.v4.f32 [%rd58+0], {%f113,%f111,%f112,%f110};
$Lt_1_25346:
$Lt_1_18690:
.loc 17 320 0
exit;
$LDWend_kernel_lj:
} // kernel_lj
//-----------------------------------------------------------
// kernel_lj_fast  (nvopencc output for lal_gayberne_lj.cu, source lines 328-408)
//
// Same per-pair arithmetic, reduction and write-back as the loop in
// kernel_lj, but the coefficient tables are first staged in shared memory:
//   threads 0..3    copy gum[3..6]              -> sp_lj[0..3]
//   threads 0..120  copy lj1_in[tid.x] (16 B)   -> lj1[tid.x]
//                   and, if eflag > 0, lj3_in[tid.x] -> lj3[tid.x]
// followed by one bar.sync.  The table index for a pair is
//   int( float(11 * int(x[i].w)) + x[j].w )
// i.e. the row length is fixed at 11 (11*11 = 121 records = 1936 bytes).
//
// Work split: threads are grouped in runs of t_per_atom; each group handles
//     ii = start + tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// and skips all work when ii >= inum.  Lane offset within a group is
//     tid.x & (t_per_atom - 1).
//
// Parameters (as used by the code below):
//   x_          16-byte v4.f32 records; .x/.y/.z are differenced, .w feeds
//               the table index.
//   lj1_in      global source of the shared lj1 table; per record, .z bounds
//               the squared distance, .w must equal 0.0, .x/.y enter the
//               force factor.
//   lj3_in      global source of the shared lj3 table; .x/.y/.z enter the
//               sum enabled by eflag.
//   gum         floats at byte offsets 12..24 are staged into sp_lj.
//   stride      spacing, in 4-byte elements, between rows of dev_ij.
//   dev_ij      s32 array: [ii] = index i into x_, [ii+stride] = entry count,
//               neighbor entries start at [ii+2*stride], spaced by stride.
//   ans         v4.f32 per ii; x/y/z are incremented, w is written back as read.
//   __val_paramengv
//               f32 array; slots spaced by inum are incremented with the
//               eflag sum (if eflag > 0) and six virial sums (if vflag > 0).
//   err_flag    not referenced in this kernel.
//   eflag/vflag values > 0 enable the respective accumulation paths.
//
// NOTE(review): the table index is not range-checked against the 121 staged
// records, and staging needs ntid.x >= 121 to fill the whole table --
// confirm the host only launches this kernel when both hold.
// NOTE(review): offset equals tid.x % t_per_atom only when t_per_atom is a
// power of two -- confirm the host enforces this.
//-----------------------------------------------------------
.entry kernel_lj_fast (
.param .u64 __cudaparm_kernel_lj_fast_x_,
.param .u64 __cudaparm_kernel_lj_fast_lj1_in,
.param .u64 __cudaparm_kernel_lj_fast_lj3_in,
.param .u64 __cudaparm_kernel_lj_fast_gum,
.param .s32 __cudaparm_kernel_lj_fast_stride,
.param .u64 __cudaparm_kernel_lj_fast_dev_ij,
.param .u64 __cudaparm_kernel_lj_fast_ans,
.param .u64 __cudaparm_kernel_lj_fast___val_paramengv,
.param .u64 __cudaparm_kernel_lj_fast_err_flag,
.param .s32 __cudaparm_kernel_lj_fast_eflag,
.param .s32 __cudaparm_kernel_lj_fast_vflag,
.param .s32 __cudaparm_kernel_lj_fast_start,
.param .s32 __cudaparm_kernel_lj_fast_inum,
.param .s32 __cudaparm_kernel_lj_fast_t_per_atom)
{
.reg .u32 %r<57>;
.reg .u64 %rd<72>;
.reg .f32 %f<122>;
.reg .pred %p<22>;
// sp_lj: 4 floats; lj1 / lj3: 121 records of 16 bytes each;
// red_acc: six rows of 512 bytes (128 floats) indexed by tid.x.
// NOTE(review): the red_acc layout assumes ntid.x <= 128 -- confirm launch config.
.shared .align 4 .b8 __cuda___cuda_local_var_33188_33_non_const_sp_lj6500[16];
.shared .align 16 .b8 __cuda___cuda_local_var_33189_34_non_const_lj16528[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_33190_34_non_const_lj38464[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_33260_55_non_const_red_acc10400[3072];
// __cuda_local_var_33204_9_non_const_virial = 16
.loc 17 328 0
$LDWbegin_kernel_lj_fast:
// %r1 = tid.x.  Threads 0..3: sp_lj[tid] = gum[tid+3].
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_2_20994;
.loc 17 337 0
mov.u64 %rd1, __cuda___cuda_local_var_33188_33_non_const_sp_lj6500;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_lj_fast_gum];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+12];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_2_20994:
// %rd1 = &sp_lj is (re)materialized on the join path for later use.
mov.u64 %rd1, __cuda___cuda_local_var_33188_33_non_const_sp_lj6500;
// Threads 0..120: copy one 16-byte lj1 record into shared memory.
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_2_21506;
.loc 17 339 0
mov.u64 %rd7, __cuda___cuda_local_var_33189_34_non_const_lj16528;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_lj_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
// The lj3 record is staged only when eflag > 0.
ld.param.s32 %r4, [__cudaparm_kernel_lj_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_2_22018;
.loc 17 341 0
mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464;
ld.param.u64 %rd14, [__cudaparm_kernel_lj_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_2_22018:
mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464;
$Lt_2_21506:
// %rd13 = &lj3 (shared), %rd7 = &lj1 (shared) for every thread.
mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464;
mov.u64 %rd7, __cuda___cuda_local_var_33189_34_non_const_lj16528;
.loc 17 351 0
// Zero the six virial accumulators: %f11, %f13, %f15, %f17, %f19, %f21.
// %f20 is a second copy of the sixth accumulator; the loop keeps %f21 == %f20.
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 17 353 0
// Barrier: the staged shared tables are complete before any thread reads them.
bar.sync 0;
// %r6 = t_per_atom
// %r14 = ii = start + tid.x/t_per_atom + ctaid.x*(ntid.x/t_per_atom)
ld.param.s32 %r6, [__cudaparm_kernel_lj_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
cvt.s32.u32 %r10, %ctaid.x;
mul.lo.s32 %r11, %r10, %r9;
add.s32 %r12, %r7, %r11;
ld.param.s32 %r13, [__cudaparm_kernel_lj_fast_start];
add.s32 %r14, %r13, %r12;
ld.param.s32 %r15, [__cudaparm_kernel_lj_fast_inum];
// Groups past the end of the work list (ii >= inum) exit.
setp.ge.s32 %p4, %r14, %r15;
@%p4 bra $Lt_2_29186;
.loc 17 358 0
// %rd18 = ii*4 (reused later to index engv); %rd20 = &dev_ij[ii]
// %r16 = dev_ij[ii] (index i), %r18 = dev_ij[ii+stride] (entry count)
cvt.s64.s32 %rd17, %r14;
mul.wide.s32 %rd18, %r14, 4;
ld.param.u64 %rd19, [__cudaparm_kernel_lj_fast_dev_ij];
add.u64 %rd20, %rd19, %rd18;
ld.global.s32 %r16, [%rd20+0];
ld.param.s32 %r17, [__cudaparm_kernel_lj_fast_stride];
cvt.s64.s32 %rd21, %r17;
mul.wide.s32 %rd22, %r17, 4;
add.u64 %rd23, %rd22, %rd20;
ld.global.s32 %r18, [%rd23+0];
.loc 17 361 0
// {%f22,%f23,%f24,%f25} = x_[i]  (x, y, z, w)
ld.param.u64 %rd24, [__cudaparm_kernel_lj_fast_x_];
cvt.s64.s32 %rd25, %r16;
mul.wide.s32 %rd26, %r16, 16;
add.u64 %rd27, %rd24, %rd26;
ld.global.v4.f32 {%f22,%f23,%f24,%f25}, [%rd27+0];
.loc 17 363 0
// %r21 = offset = tid.x & (t_per_atom-1)
// %rd28 = &dev_ij[ii+2*stride]            (first neighbor entry)
// %rd32 = %rd28 + offset*stride*4         (this lane's cursor)
// %rd35 = %rd28 + count*stride*4          (one past the last entry)
cvt.s32.s64 %r19, %rd21;
sub.s32 %r20, %r6, 1;
and.b32 %r21, %r20, %r1;
add.u64 %rd28, %rd22, %rd23;
mul.lo.s32 %r22, %r19, %r21;
cvt.s64.s32 %rd29, %r22;
mul.wide.s32 %rd30, %r22, 4;
add.u64 %rd31, %rd28, %rd30;
mov.s64 %rd32, %rd31;
mul.lo.s32 %r23, %r19, %r18;
cvt.s64.s32 %rd33, %r23;
mul.wide.s32 %rd34, %r23, 4;
add.u64 %rd35, %rd28, %rd34;
// Nothing to do for this lane when its cursor already reaches the end.
setp.ge.u64 %p5, %rd31, %rd35;
@%p5 bra $Lt_2_30722;
// %f26 = float(11 * int(x_[i].w)): row base of the shared tables, kept as a
// float because it is added to x_[j].w before truncation below.
cvt.rzi.ftz.s32.f32 %r24, %f25;
mul.lo.s32 %r25, %r24, 11;
cvt.rn.f32.s32 %f26, %r25;
// Per-lane sums: %f29/%f28/%f27 = x/y/z force components, %f30 = eflag sum.
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.f32 %f29, 0f00000000; // 0
mov.f32 %f30, 0f00000000; // 0
$Lt_2_23554:
//<loop> Loop body line 363, nesting depth: 1, estimated iterations: unknown
.loc 17 368 0
// %r26 = packed neighbor entry.
ld.global.s32 %r26, [%rd32+0];
.loc 17 369 0
// Bits 30..31 of the entry select the scale factor %f31 = sp_lj[bits].
shr.s32 %r27, %r26, 30;
and.b32 %r28, %r27, 3;
cvt.s64.s32 %rd36, %r28;
mul.wide.s32 %rd37, %r28, 4;
add.u64 %rd38, %rd1, %rd37;
ld.shared.f32 %f31, [%rd38+0];
.loc 17 372 0
// Low 30 bits (mask 0x3FFFFFFF) are index j; load x_[j].
and.b32 %r29, %r26, 1073741823;
cvt.s64.s32 %rd39, %r29;
mul.wide.s32 %rd40, %r29, 16;
add.u64 %rd41, %rd24, %rd40;
ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd41+0];
.loc 17 368 0
// %f37 = dx, %f36 = dy, %f38 = dz (i minus j); %f39 = dy*dy;
// %f41 = dx*dx + dy*dy + dz*dz (squared distance).
sub.ftz.f32 %f36, %f23, %f33;
sub.ftz.f32 %f37, %f22, %f32;
sub.ftz.f32 %f38, %f24, %f34;
mul.ftz.f32 %f39, %f36, %f36;
fma.rn.ftz.f32 %f40, %f37, %f37, %f39;
fma.rn.ftz.f32 %f41, %f38, %f38, %f40;
// %r30 = table index = int(%f26 + x_[j].w); %rd43 = its byte offset,
// shared by the lj1 and lj3 tables.
add.ftz.f32 %f42, %f26, %f35;
cvt.rzi.ftz.s32.f32 %r30, %f42;
cvt.s64.s32 %rd42, %r30;
mul.wide.s32 %rd43, %r30, 16;
add.u64 %rd44, %rd43, %rd7;
// Skip the pair unless squared distance < lj1[idx].z ...
ld.shared.f32 %f43, [%rd44+8];
setp.gt.ftz.f32 %p6, %f43, %f41;
@!%p6 bra $Lt_2_30978;
// ... and lj1[idx].w == 0.0.
ld.shared.f32 %f44, [%rd44+12];
mov.f32 %f45, 0f00000000; // 0
setp.eq.ftz.f32 %p7, %f44, %f45;
@!%p7 bra $Lt_2_30978;
.loc 17 384 0
// %f46 = 1/rsq (approximate reciprocal), %f48 = (1/rsq)^3
// %f55 = sp_lj_factor * (1/rsq) * (1/rsq)^3 * (lj1.x*(1/rsq)^3 - lj1.y)
rcp.approx.ftz.f32 %f46, %f41;
mul.ftz.f32 %f47, %f46, %f46;
mul.ftz.f32 %f48, %f46, %f47;
mul.ftz.f32 %f49, %f46, %f31;
mul.ftz.f32 %f50, %f48, %f49;
ld.shared.v2.f32 {%f51,%f52}, [%rd44+0];
mul.ftz.f32 %f53, %f51, %f48;
sub.ftz.f32 %f54, %f53, %f52;
mul.ftz.f32 %f55, %f50, %f54;
.loc 17 386 0
// Accumulate d{x,y,z} * %f55 into the x/y/z sums.
fma.rn.ftz.f32 %f29, %f37, %f55, %f29;
.loc 17 387 0
fma.rn.ftz.f32 %f28, %f36, %f55, %f28;
.loc 17 388 0
fma.rn.ftz.f32 %f27, %f38, %f55, %f27;
ld.param.s32 %r31, [__cudaparm_kernel_lj_fast_eflag];
mov.u32 %r32, 0;
setp.le.s32 %p8, %r31, %r32;
@%p8 bra $Lt_2_23810;
.loc 17 391 0
// eflag > 0: %f30 += factor * ((1/rsq)^3 * (lj3.x*(1/rsq)^3 - lj3.y) - lj3.z)
add.u64 %rd45, %rd43, %rd13;
ld.shared.v4.f32 {%f56,%f57,%f58,_}, [%rd45+0];
mul.ftz.f32 %f59, %f56, %f48;
sub.ftz.f32 %f60, %f59, %f57;
mul.ftz.f32 %f61, %f48, %f60;
.loc 17 392 0
sub.ftz.f32 %f62, %f61, %f58;
fma.rn.ftz.f32 %f30, %f31, %f62, %f30;
$Lt_2_23810:
ld.param.s32 %r33, [__cudaparm_kernel_lj_fast_vflag];
mov.u32 %r34, 0;
setp.le.s32 %p9, %r33, %r34;
@%p9 bra $Lt_2_30978;
.loc 17 395 0
// vflag > 0: accumulate the six products, each scaled by %f55:
//   %f11 += dx*dx, %f13 += dy*dy, %f15 += dz*dz,
//   %f17 += dx*dy, %f19 += dx*dz, %f20/%f21 += dy*dz
mov.f32 %f63, %f11;
mul.ftz.f32 %f64, %f37, %f37;
fma.rn.ftz.f32 %f65, %f55, %f64, %f63;
mov.f32 %f11, %f65;
.loc 17 396 0
mov.f32 %f66, %f13;
fma.rn.ftz.f32 %f67, %f55, %f39, %f66;
mov.f32 %f13, %f67;
.loc 17 397 0
mov.f32 %f68, %f15;
mul.ftz.f32 %f69, %f38, %f38;
fma.rn.ftz.f32 %f70, %f55, %f69, %f68;
mov.f32 %f15, %f70;
.loc 17 398 0
mov.f32 %f71, %f17;
mul.ftz.f32 %f72, %f36, %f37;
fma.rn.ftz.f32 %f73, %f55, %f72, %f71;
mov.f32 %f17, %f73;
.loc 17 399 0
mov.f32 %f74, %f19;
mul.ftz.f32 %f75, %f37, %f38;
fma.rn.ftz.f32 %f76, %f55, %f75, %f74;
mov.f32 %f19, %f76;
.loc 17 400 0
mul.ftz.f32 %f77, %f36, %f38;
fma.rn.ftz.f32 %f20, %f55, %f77, %f20;
mov.f32 %f21, %f20;
$Lt_2_30978:
$L_2_20482:
.loc 17 394 0
// Advance the cursor by stride*t_per_atom entries; loop while below the end.
mul.lo.s32 %r35, %r19, %r6;
cvt.s64.s32 %rd46, %r35;
mul.wide.s32 %rd47, %r35, 4;
add.u64 %rd32, %rd32, %rd47;
setp.gt.u64 %p10, %rd35, %rd32;
@%p10 bra $Lt_2_23554;
bra.uni $Lt_2_23042;
$Lt_2_30722:
// Loop skipped entirely: the four sums are zero.
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.f32 %f29, 0f00000000; // 0
mov.f32 %f30, 0f00000000; // 0
$Lt_2_23042:
// With t_per_atom <= 1 there is nothing to combine across lanes.
mov.u32 %r36, 1;
setp.le.s32 %p11, %r6, %r36;
@%p11 bra $Lt_2_27138;
.loc 17 405 0
// Shared-memory reduction of x/y/z/eflag sums across the lanes of a group.
// %rd51 = &red_acc[0][tid.x]; rows are 512 bytes apart.
// NOTE(review): there is no barrier between these stores and the loads from
// other lanes' slots below; this presumably relies on all lanes of a group
// executing in lockstep within one warp -- confirm.
mov.u64 %rd48, __cuda___cuda_local_var_33260_55_non_const_red_acc10400;
cvt.s64.s32 %rd49, %r1;
mul.wide.s32 %rd50, %r1, 4;
add.u64 %rd51, %rd48, %rd50;
mov.f32 %f78, %f29;
st.shared.f32 [%rd51+0], %f78;
mov.f32 %f79, %f28;
st.shared.f32 [%rd51+512], %f79;
mov.f32 %f80, %f27;
st.shared.f32 [%rd51+1024], %f80;
mov.f32 %f81, %f30;
st.shared.f32 [%rd51+1536], %f81;
// %r41 = t_per_atom / 2 (signed division rounding toward zero);
// %p12 (half != 0) is reused by the virial reduction further down.
shr.s32 %r37, %r6, 31;
mov.s32 %r38, 1;
and.b32 %r39, %r37, %r38;
add.s32 %r40, %r39, %r6;
shr.s32 %r41, %r40, 1;
mov.s32 %r42, %r41;
mov.u32 %r43, 0;
setp.ne.u32 %p12, %r41, %r43;
@!%p12 bra $Lt_2_25602;
$Lt_2_26114:
// Lanes with offset < s add the values stored by thread tid.x + s.
setp.ge.u32 %p13, %r21, %r42;
@%p13 bra $Lt_2_26370;
add.u32 %r44, %r1, %r42;
cvt.u64.u32 %rd52, %r44;
mul.wide.u32 %rd53, %r44, 4;
add.u64 %rd54, %rd48, %rd53;
ld.shared.f32 %f82, [%rd54+0];
add.ftz.f32 %f78, %f82, %f78;
st.shared.f32 [%rd51+0], %f78;
ld.shared.f32 %f83, [%rd54+512];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd51+512], %f79;
ld.shared.f32 %f84, [%rd54+1024];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd51+1024], %f80;
ld.shared.f32 %f85, [%rd54+1536];
add.ftz.f32 %f81, %f85, %f81;
st.shared.f32 [%rd51+1536], %f81;
$Lt_2_26370:
// s >>= 1; repeat until s == 0.
shr.u32 %r42, %r42, 1;
mov.u32 %r45, 0;
setp.ne.u32 %p14, %r42, %r45;
@%p14 bra $Lt_2_26114;
$Lt_2_25602:
// Copy the reduced values back into the x/y/z/eflag sum registers.
mov.f32 %f29, %f78;
mov.f32 %f28, %f79;
mov.f32 %f27, %f80;
mov.f32 %f30, %f81;
ld.param.s32 %r46, [__cudaparm_kernel_lj_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p15, %r46, %r47;
@%p15 bra $Lt_2_27138;
// vflag > 0: same reduction over the six virial sums, using all six rows.
// The sixth value is taken from %f20, which equals %f21 at this point.
mov.f32 %f78, %f11;
st.shared.f32 [%rd51+0], %f78;
mov.f32 %f79, %f13;
st.shared.f32 [%rd51+512], %f79;
mov.f32 %f80, %f15;
st.shared.f32 [%rd51+1024], %f80;
mov.f32 %f81, %f17;
st.shared.f32 [%rd51+1536], %f81;
mov.f32 %f86, %f19;
st.shared.f32 [%rd51+2048], %f86;
mov.f32 %f87, %f20;
st.shared.f32 [%rd51+2560], %f87;
mov.s32 %r48, %r41;
@!%p12 bra $Lt_2_27650;
$Lt_2_28162:
setp.ge.u32 %p16, %r21, %r48;
@%p16 bra $Lt_2_28418;
add.u32 %r49, %r1, %r48;
cvt.u64.u32 %rd55, %r49;
mul.wide.u32 %rd56, %r49, 4;
add.u64 %rd57, %rd48, %rd56;
ld.shared.f32 %f88, [%rd57+0];
add.ftz.f32 %f78, %f88, %f78;
st.shared.f32 [%rd51+0], %f78;
ld.shared.f32 %f89, [%rd57+512];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd51+512], %f79;
ld.shared.f32 %f90, [%rd57+1024];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd51+1024], %f80;
ld.shared.f32 %f91, [%rd57+1536];
add.ftz.f32 %f81, %f91, %f81;
st.shared.f32 [%rd51+1536], %f81;
ld.shared.f32 %f92, [%rd57+2048];
add.ftz.f32 %f86, %f92, %f86;
st.shared.f32 [%rd51+2048], %f86;
ld.shared.f32 %f93, [%rd57+2560];
add.ftz.f32 %f87, %f93, %f87;
st.shared.f32 [%rd51+2560], %f87;
$Lt_2_28418:
shr.u32 %r48, %r48, 1;
mov.u32 %r50, 0;
setp.ne.u32 %p17, %r48, %r50;
@%p17 bra $Lt_2_28162;
$Lt_2_27650:
// Reduced virial sums back into %f11,%f13,%f15,%f17,%f19,%f21.
mov.f32 %f11, %f78;
mov.f32 %f13, %f79;
mov.f32 %f15, %f80;
mov.f32 %f17, %f81;
mov.f32 %f19, %f86;
mov.f32 %f21, %f87;
$Lt_2_27138:
$Lt_2_25090:
// Only the lane with offset == 0 writes results to global memory.
mov.u32 %r51, 0;
setp.ne.s32 %p18, %r21, %r51;
@%p18 bra $Lt_2_29186;
// %rd59 = &engv[ii]; it is advanced by inum floats after each value written.
ld.param.u64 %rd58, [__cudaparm_kernel_lj_fast___val_paramengv];
add.u64 %rd59, %rd58, %rd18;
ld.param.s32 %r52, [__cudaparm_kernel_lj_fast_eflag];
mov.u32 %r53, 0;
setp.le.s32 %p19, %r52, %r53;
@%p19 bra $Lt_2_29698;
// eflag > 0: engv slot += eflag sum (read-modify-write), then step by inum.
ld.global.f32 %f94, [%rd59+0];
add.ftz.f32 %f95, %f94, %f30;
st.global.f32 [%rd59+0], %f95;
cvt.s64.s32 %rd60, %r15;
mul.wide.s32 %rd61, %r15, 4;
add.u64 %rd59, %rd59, %rd61;
$Lt_2_29698:
ld.param.s32 %r54, [__cudaparm_kernel_lj_fast_vflag];
mov.u32 %r55, 0;
setp.le.s32 %p20, %r54, %r55;
@%p20 bra $Lt_2_30210;
// vflag > 0: add the six virial sums to six consecutive slots, inum apart.
ld.global.f32 %f96, [%rd59+0];
mov.f32 %f97, %f11;
add.ftz.f32 %f98, %f96, %f97;
st.global.f32 [%rd59+0], %f98;
cvt.s64.s32 %rd62, %r15;
mul.wide.s32 %rd63, %r15, 4;
add.u64 %rd64, %rd63, %rd59;
ld.global.f32 %f99, [%rd64+0];
mov.f32 %f100, %f13;
add.ftz.f32 %f101, %f99, %f100;
st.global.f32 [%rd64+0], %f101;
add.u64 %rd65, %rd63, %rd64;
ld.global.f32 %f102, [%rd65+0];
mov.f32 %f103, %f15;
add.ftz.f32 %f104, %f102, %f103;
st.global.f32 [%rd65+0], %f104;
add.u64 %rd66, %rd63, %rd65;
ld.global.f32 %f105, [%rd66+0];
mov.f32 %f106, %f17;
add.ftz.f32 %f107, %f105, %f106;
st.global.f32 [%rd66+0], %f107;
add.u64 %rd67, %rd63, %rd66;
ld.global.f32 %f108, [%rd67+0];
mov.f32 %f109, %f19;
add.ftz.f32 %f110, %f108, %f109;
st.global.f32 [%rd67+0], %f110;
add.u64 %rd59, %rd63, %rd67;
ld.global.f32 %f111, [%rd59+0];
mov.f32 %f112, %f21;
add.ftz.f32 %f113, %f111, %f112;
st.global.f32 [%rd59+0], %f113;
$Lt_2_30210:
// ans[ii].xyz += (x, y, z sums); ans[ii].w is stored back unchanged.
ld.param.u64 %rd68, [__cudaparm_kernel_lj_fast_ans];
mul.lo.u64 %rd69, %rd17, 16;
add.u64 %rd70, %rd68, %rd69;
ld.global.v4.f32 {%f114,%f115,%f116,%f117}, [%rd70+0];
add.ftz.f32 %f118, %f115, %f28;
add.ftz.f32 %f119, %f116, %f27;
add.ftz.f32 %f120, %f114, %f29;
st.global.v4.f32 [%rd70+0], {%f120,%f118,%f119,%f117};
$Lt_2_29186:
$Lt_2_22530:
.loc 17 408 0
exit;
$LDWend_kernel_lj_fast:
} // kernel_lj_fast

Event Timeline