Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F96558939
pppm_f.ptx
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Dec 28, 02:31
Size
26 KB
Mime Type
text/x-asm
Expires
Mon, Dec 30, 02:31 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
23205575
Attached To
rLAMMPS lammps
pppm_f.ptx
View Options
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009b0b_00000000-9_lal_pppm.cpp3.i (/home/sjplimp/ccBI#.wCkpTI)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009b0b_00000000-8_lal_pppm.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 17 "lal_pppm.cu"
.file 18 "/usr/local/cuda/include/common_functions.h"
.file 19 "/usr/local/cuda/include/math_functions.h"
.file 20 "/usr/local/cuda/include/math_constants.h"
.file 21 "/usr/local/cuda/include/device_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Module-scope texture references. particle_map and interp read particle
// data through these with tex.1d.v4.f32.s32 (integer index, 4 x f32 result):
// the first three components fetched from pos_tex are used as x, y, z and the
// first component fetched from q_tex is used as the charge-like scalar.
.global .texref pos_tex;
.global .texref q_tex;
// particle_map
// Each thread handles at most one particle. The thread fetches the particle
// from pos_tex / q_tex, scales its offset from (b_lo_x, b_lo_y, b_lo_z) by
// (delxinv, delyinv, delzinv), truncates the result to integer cell
// coordinates, atomically reserves a slot in counts[cell], and writes one
// 16-byte record {dx, dy, dz, w} to ans.
// Values written to *error:
//   1 = a scaled coordinate is negative or a cell index is >= nlocal_{x,y,z}
//   2 = the reserved slot index is >= max_atoms (the reservation is undone)
// NOTE(review): the x_ and q_ pointer parameters are never loaded in this
// kernel; all particle data comes from the texture references.
.entry particle_map (
.param .u64 __cudaparm_particle_map_x_,
.param .u64 __cudaparm_particle_map_q_,
.param .f32 __cudaparm_particle_map_delvolinv,
.param .s32 __cudaparm_particle_map_nlocal,
.param .u64 __cudaparm_particle_map_counts,
.param .u64 __cudaparm_particle_map_ans,
.param .f32 __cudaparm_particle_map_b_lo_x,
.param .f32 __cudaparm_particle_map_b_lo_y,
.param .f32 __cudaparm_particle_map_b_lo_z,
.param .f32 __cudaparm_particle_map_delxinv,
.param .f32 __cudaparm_particle_map_delyinv,
.param .f32 __cudaparm_particle_map_delzinv,
.param .s32 __cudaparm_particle_map_nlocal_x,
.param .s32 __cudaparm_particle_map_nlocal_y,
.param .s32 __cudaparm_particle_map_nlocal_z,
.param .s32 __cudaparm_particle_map_atom_stride,
.param .s32 __cudaparm_particle_map_max_atoms,
.param .u64 __cudaparm_particle_map_error)
{
.reg .u32 %r<50>;
.reg .u64 %rd<12>;
.reg .f32 %f<44>;
.reg .pred %p<11>;
.loc 17 50 0
$LDWbegin_particle_map:
// %r7 = ctaid.x * ntid.x + tid.x   (global thread index)
// %r5 = nctaid.x * ntid.x          (total number of threads in the grid)
// mul24 multiplies only the low 24 bits of each operand.
cvt.s32.u32 %r1, %ntid.x;
cvt.s32.u32 %r2, %ctaid.x;
mul24.lo.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %nctaid.x;
mul24.lo.s32 %r5, %r4, %r1;
mov.u32 %r6, %tid.x;
add.u32 %r7, %r3, %r6;
// Remap the thread index to a particle index:
//   %r9  = 64 * %r7
//   %r12 = %r9 - (%r9 / %r5) * (%r5 - 1)
// NOTE(review): presumably this spreads neighbouring threads over distant
// particles to reduce contention on the atomic counters below - confirm
// against the CUDA source (lal_pppm.cu, line 50 onwards per the .loc info).
sub.s32 %r8, %r5, 1;
mul.lo.s32 %r9, %r7, 64;
div.s32 %r10, %r9, %r5;
mul.lo.s32 %r11, %r8, %r10;
sub.s32 %r12, %r9, %r11;
// Exit when the particle index is not below nlocal.
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
setp.le.s32 %p1, %r13, %r12;
@%p1 bra $Lt_0_7426;
.loc 17 62 0
// Fetch the position: (%f5, %f6, %f7) = first three components of pos_tex[%r12].
mov.u32 %r14, %r12;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
mov.s32 %r17, 0;
mov.u32 %r18, %r17;
mov.s32 %r19, 0;
mov.u32 %r20, %r19;
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
mov.f32 %f5, %f1;
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
.loc 17 64 0
// Fetch the scalar from q_tex[%r12] and scale it: %f14 = delvolinv * q.
mov.u32 %r21, %r12;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
mov.f32 %f12, %f8;
ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];
mul.ftz.f32 %f14, %f13, %f12;
mov.f32 %f15, 0f00000000; // 0
// Nothing to do when the scaled scalar compares equal to zero.
setp.neu.ftz.f32 %p2, %f14, %f15;
@!%p2 bra $Lt_0_7426;
.loc 17 67 0
// Scaled coordinates; any negative value takes the error path ($Lt_0_8706):
//   %f19 = delxinv * (x - b_lo_x)
//   %f24 = delyinv * (y - b_lo_y)
//   %f29 = delzinv * (z - b_lo_z)
ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];
sub.ftz.f32 %f17, %f5, %f16;
ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];
mul.ftz.f32 %f19, %f18, %f17;
mov.f32 %f20, 0f00000000; // 0
setp.lt.ftz.f32 %p3, %f19, %f20;
@%p3 bra $Lt_0_8706;
ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];
sub.ftz.f32 %f22, %f6, %f21;
ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];
mul.ftz.f32 %f24, %f23, %f22;
mov.f32 %f25, 0f00000000; // 0
setp.lt.ftz.f32 %p4, %f24, %f25;
@%p4 bra $Lt_0_8706;
ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];
sub.ftz.f32 %f27, %f7, %f26;
ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];
mul.ftz.f32 %f29, %f28, %f27;
mov.f32 %f30, 0f00000000; // 0
setp.lt.ftz.f32 %p5, %f29, %f30;
@%p5 bra $Lt_0_8706;
// Integer cell indices by truncation toward zero (cvt.rzi):
//   %r28 = nx, %r30 = ny, %r32 = nz.
// Each must be strictly below nlocal_x / nlocal_y / nlocal_z, otherwise the
// error path is taken.
cvt.rzi.ftz.s32.f32 %r28, %f19;
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
setp.ge.s32 %p6, %r28, %r29;
@%p6 bra $Lt_0_8706;
cvt.rzi.ftz.s32.f32 %r30, %f24;
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
setp.ge.s32 %p7, %r30, %r31;
@%p7 bra $Lt_0_8706;
cvt.rzi.ftz.s32.f32 %r32, %f29;
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
setp.gt.s32 %p8, %r33, %r32;
@%p8 bra $L_0_4866;
$Lt_0_8706:
$L_0_5122:
.loc 17 76 0
// Error path: the particle is outside the local cell range -> *error = 1.
mov.s32 %r34, 1;
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
st.global.s32 [%rd1+0], %r34;
bra.uni $Lt_0_7426;
$L_0_4866:
.loc 17 83 0
// Linear cell index: %r38 = (nz * nlocal_y + ny) * nlocal_x + nx.
mul.lo.s32 %r35, %r32, %r31;
add.s32 %r36, %r30, %r35;
mul.lo.s32 %r37, %r36, %r29;
add.s32 %r38, %r28, %r37;
// Reserve a slot: %r41 = old value of counts[%r38], which is then incremented.
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
cvt.s64.s32 %rd3, %r38;
mul.wide.s32 %rd4, %r38, 4;
add.u64 %rd5, %rd2, %rd4;
mov.s32 %r39, 1;
atom.global.add.s32 %r40, [%rd5], %r39;
mov.s32 %r41, %r40;
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
setp.gt.s32 %p9, %r42, %r41;
@%p9 bra $Lt_0_7682;
.loc 17 85 0
// Slot index >= max_atoms: report *error = 2 and undo the increment.
mov.s32 %r43, 2;
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
st.global.s32 [%rd6+0], %r43;
.loc 16 118 0
mov.s32 %r44, -1;
atom.global.add.s32 %r45, [%rd5], %r44;
bra.uni $Lt_0_7426;
$Lt_0_7682:
.loc 17 88 0
// Destination record: ans + 16 * (atom_stride * slot + cell).
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
mul.lo.s32 %r47, %r46, %r41;
add.s32 %r48, %r38, %r47;
cvt.s64.s32 %rd8, %r48;
mul.wide.s32 %rd9, %r48, 16;
add.u64 %rd10, %rd7, %rd9;
// Per-axis offset from the cell centre: (n + 0.5) - scaled coordinate.
cvt.rn.f32.s32 %f31, %r28;
mov.f32 %f32, 0f3f000000; // 0.5
add.ftz.f32 %f33, %f31, %f32;
sub.ftz.f32 %f34, %f33, %f19;
cvt.rn.f32.s32 %f35, %r30;
mov.f32 %f36, 0f3f000000; // 0.5
add.ftz.f32 %f37, %f35, %f36;
sub.ftz.f32 %f38, %f37, %f24;
cvt.rn.f32.s32 %f39, %r32;
mov.f32 %f40, 0f3f000000; // 0.5
add.ftz.f32 %f41, %f39, %f40;
sub.ftz.f32 %f42, %f41, %f29;
// Store {dx, dy, dz, delvolinv * q} as one 16-byte vector.
st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};
$Lt_0_7426:
$L_0_4610:
$Lt_0_6914:
$Lt_0_6402:
.loc 17 92 0
exit;
$LDWend_particle_map:
} // particle_map
// make_rho
// Accumulates the 16-byte records reachable through counts/atoms into the
// f32 grid array brick, using polynomial coefficients copied from _rho_coeff.
//
// Shorthand used in the comments below:
//   tid = tid.x, pid = tid / 32, fid = tid % 32
//   bt  = ctaid.x * 2 + pid, nz = bt / npts_y, ny = bt % npts_y
//   nx  = fid + 32 * i for outer-loop iteration i
//
// Shared memory as indexed by this code:
//   rho_coeff : 256 bytes, f32 indexed by tid when tid < order2 + order
//   front     : 320 bytes, addressed as front[pid][0..39] (row = 40 f32)
//   ans       : 2048 bytes, addressed as ans[n][tid] with a 256-byte row
// NOTE(review): the ans row stride (64 f32) and the two front rows suggest a
// launch with ntid.x == 64 is assumed - confirm against the host code.
.entry make_rho (
.param .u64 __cudaparm_make_rho_counts,
.param .u64 __cudaparm_make_rho_atoms,
.param .u64 __cudaparm_make_rho_brick,
.param .u64 __cudaparm_make_rho__rho_coeff,
.param .s32 __cudaparm_make_rho_atom_stride,
.param .s32 __cudaparm_make_rho_npts_x,
.param .s32 __cudaparm_make_rho_npts_y,
.param .s32 __cudaparm_make_rho_npts_z,
.param .s32 __cudaparm_make_rho_nlocal_x,
.param .s32 __cudaparm_make_rho_nlocal_y,
.param .s32 __cudaparm_make_rho_nlocal_z,
.param .s32 __cudaparm_make_rho_order_m_1,
.param .s32 __cudaparm_make_rho_order,
.param .s32 __cudaparm_make_rho_order2)
{
.reg .u32 %r<119>;
.reg .u64 %rd<57>;
.reg .f32 %f<26>;
.reg .pred %p<27>;
.shared .align 4 .b8 __cuda___cuda_local_var_32578_33_non_const_rho_coeff168[256];
.shared .align 4 .b8 __cuda___cuda_local_var_32579_33_non_const_front424[320];
.shared .align 4 .b8 __cuda___cuda_local_var_32580_33_non_const_ans744[2048];
.loc 17 101 0
$LDWbegin_make_rho:
// %r1 = order2, %r2 = order, %r3 = order2 + order, %r4 = tid.
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
ld.param.s32 %r2, [__cudaparm_make_rho_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_1_16898;
.loc 17 108 0
// Threads with tid < order2 + order copy one coefficient:
// rho_coeff[tid] (shared) = _rho_coeff[tid] (global).
mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 4;
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_16898:
mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;
// Signed divide/remainder by 32 done with shifts:
//   %r9 = tid / 32 (pid), %r11 = tid - 32 * %r9 (fid).
shr.s32 %r5, %r4, 31;
mov.s32 %r6, 31;
and.b32 %r7, %r5, %r6;
add.s32 %r8, %r7, %r4;
shr.s32 %r9, %r8, 5;
mul.lo.s32 %r10, %r9, 32;
sub.s32 %r11, %r4, %r10;
// %p2 = (fid < order); reused after the barrier inside the main loop.
setp.lt.s32 %p2, %r11, %r2;
@!%p2 bra $Lt_1_17410;
.loc 17 114 0
// Clear front[pid][fid + 32]: byte offset 4 * (fid + 40 * pid) + 128.
mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;
mov.f32 %f2, 0f00000000; // 0
cvt.s64.s32 %rd8, %r11;
shr.s32 %r12, %r4, 31;
mov.s32 %r13, 31;
and.b32 %r14, %r12, %r13;
add.s32 %r15, %r14, %r4;
shr.s32 %r16, %r15, 5;
cvt.s64.s32 %rd9, %r16;
mul.wide.s32 %rd10, %r16, 40;
add.u64 %rd11, %rd8, %rd10;
mul.lo.u64 %rd12, %rd11, 4;
add.u64 %rd13, %rd7, %rd12;
st.shared.f32 [%rd13+128], %f2;
$Lt_1_17410:
mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;
.loc 17 116 0
// Make the rho_coeff copy and the front initialisation visible block-wide.
bar.sync 0;
// %r23 = npts_x / 32 + 1 = number of outer-loop iterations; exit if <= 0.
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
shr.s32 %r18, %r17, 31;
mov.s32 %r19, 31;
and.b32 %r20, %r18, %r19;
add.s32 %r21, %r20, %r17;
shr.s32 %r22, %r21, 5;
add.s32 %r23, %r22, 1;
mov.u32 %r24, 0;
setp.le.s32 %p3, %r23, %r24;
@%p3 bra $Lt_1_17922;
// %r29 = pid (recomputed).
shr.s32 %r25, %r4, 31;
mov.s32 %r26, 31;
and.b32 %r27, %r25, %r26;
add.s32 %r28, %r27, %r4;
shr.s32 %r29, %r28, 5;
// %r30 = fid + 32 (not referenced again in this kernel).
add.s32 %r30, %r11, 32;
// %r31 = nlocal_y, %r32 = nlocal_x, %r33 = nlocal_y * nlocal_x.
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
mul.lo.s32 %r33, %r31, %r32;
// %r36 = bt = ctaid.x * 2 + pid; %r38 = nz = bt / npts_y.
mov.u32 %r34, %ctaid.x;
mul.lo.u32 %r35, %r34, 2;
add.u32 %r36, %r29, %r35;
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
div.s32 %r38, %r36, %r37;
// z range of the stencil loop:
//   %r42 = z_start = (nz < order_m_1) ? order_m_1 - nz : 0
//   %r47 = z_stop  = (nz >= nlocal_z) ? nlocal_z - nz + order - 1 : order
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
setp.lt.s32 %p4, %r38, %r39;
sub.s32 %r40, %r39, %r38;
mov.s32 %r41, 0;
selp.s32 %r42, %r40, %r41, %p4;
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
setp.ge.s32 %p5, %r38, %r43;
sub.s32 %r44, %r43, %r38;
add.s32 %r45, %r44, %r2;
sub.s32 %r46, %r45, 1;
selp.s32 %r47, %r46, %r2, %p5;
// %r48 = ny = bt % npts_y, with the same clamping for the y range:
//   %r51 = y_start = (ny < order_m_1) ? order_m_1 - ny : 0
//   %r55 = y_stop  = (ny >= nlocal_y) ? nlocal_y - ny + order - 1 : order
rem.s32 %r48, %r36, %r37;
setp.lt.s32 %p6, %r48, %r39;
sub.s32 %r49, %r39, %r48;
mov.s32 %r50, 0;
selp.s32 %r51, %r49, %r50, %p6;
setp.ge.s32 %p7, %r48, %r31;
sub.s32 %r52, %r31, %r48;
add.s32 %r53, %r52, %r2;
sub.s32 %r54, %r53, 1;
selp.s32 %r55, %r54, %r2, %p7;
mov.s32 %r56, %r23;
// %p8 = (order > 0); guards every loop that runs "order" times.
mov.s32 %r57, 0;
setp.gt.s32 %p8, %r2, %r57;
// %r58 = x offset of the current outer iteration (0, 32, 64, ...);
// %r59 = 32 * iteration count = terminating value of %r58.
mov.s32 %r58, 0;
cvt.s64.s32 %rd14, %r11;
cvt.s64.s32 %rd15, %r29;
mul.lo.s32 %r59, %r23, 32;
mul.wide.s32 %rd16, %r29, 40;
add.u64 %rd17, %rd14, %rd16;
// %r61 = (nz < npts_z) ? 1 : 0.
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
setp.gt.s32 %p9, %r60, %r38;
mul.lo.u64 %rd18, %rd17, 4;
selp.s32 %r61, 1, 0, %p9;
// %rd19 = &front[pid][fid]; %rd20 = base address of ans.
add.u64 %rd19, %rd18, %rd7;
mov.u64 %rd20, __cuda___cuda_local_var_32580_33_non_const_ans744;
mov.s32 %r62, %r56;
$Lt_1_18434:
//<loop> Loop body line 116, nesting depth: 1, estimated iterations: unknown
// ---- Outer loop: one 32-wide segment along x per iteration ----
// Step 1: zero ans[n][tid] for n = 0 .. order-1.
@!%p8 bra $Lt_1_18690;
mov.s32 %r63, %r2;
cvt.s64.s32 %rd21, %r4;
mul.wide.s32 %rd22, %r4, 4;
add.u64 %rd23, %rd20, %rd22;
mov.s32 %r64, 0;
mov.s32 %r65, %r63;
$Lt_1_19202:
//<loop> Loop body line 116, nesting depth: 2, estimated iterations: unknown
.loc 17 140 0
mov.f32 %f3, 0f00000000; // 0
st.shared.f32 [%rd23+0], %f3;
add.s32 %r64, %r64, 1;
add.u64 %rd23, %rd23, 256;
setp.ne.s32 %p10, %r64, %r2;
@%p10 bra $Lt_1_19202;
$Lt_1_18690:
// Step 2: %r66 = nx = fid + %r58. Accumulate only when nx < nlocal_x and
// nz < npts_z (set.lt yields all-ones for true; neg turns that into 1).
add.s32 %r66, %r11, %r58;
set.lt.u32.s32 %r67, %r66, %r32;
neg.s32 %r68, %r67;
and.b32 %r69, %r61, %r68;
mov.u32 %r70, 0;
setp.eq.s32 %p11, %r69, %r70;
@%p11 bra $Lt_1_20226;
.loc 17 143 0
// z stencil loop: %r71 = m runs over [z_start, z_stop).
mov.s32 %r71, %r42;
setp.ge.s32 %p12, %r42, %r47;
@%p12 bra $Lt_1_20226;
sub.s32 %r72, %r47, %r42;
setp.lt.s32 %p13, %r51, %r55;
mov.s32 %r73, %r72;
$Lt_1_20738:
//<loop> Loop body line 143, nesting depth: 2, estimated iterations: unknown
.loc 17 145 0
// y stencil loop: %r74 = l runs over [y_start, y_stop).
mov.s32 %r74, %r51;
@!%p13 bra $Lt_1_20994;
sub.s32 %r75, %r55, %r51;
sub.s32 %r76, %r71, %r42;
add.s32 %r77, %r38, %r42;
add.s32 %r78, %r48, %r51;
sub.s32 %r79, %r77, %r39;
sub.s32 %r80, %r78, %r39;
add.s32 %r81, %r76, %r79;
// %r82 = (nz + m - order_m_1) * nlocal_y * nlocal_x  (z part of the cell index)
mul.lo.s32 %r82, %r33, %r81;
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
mov.s32 %r84, %r75;
$Lt_1_21506:
//<loop> Loop body line 145, nesting depth: 3, estimated iterations: unknown
.loc 17 147 0
// %r89 = cell = %r82 + (ny + l - order_m_1) * nlocal_x + nx
// %r91 = counts[cell] * atom_stride  (upper bound for the record index)
sub.s32 %r85, %r74, %r51;
add.s32 %r86, %r85, %r80;
mul.lo.s32 %r87, %r86, %r32;
add.s32 %r88, %r82, %r87;
add.s32 %r89, %r66, %r88;
cvt.s64.s32 %rd25, %r89;
mul.wide.s32 %rd26, %r89, 4;
add.u64 %rd27, %rd24, %rd26;
ld.global.s32 %r90, [%rd27+0];
mul.lo.s32 %r91, %r90, %r83;
.loc 17 148 0
// Record loop: %r92 starts at cell and advances by atom_stride while
// %r92 < %r91; %rd32 follows it as atoms + 16 * %r92.
mov.s32 %r92, %r89;
setp.ge.s32 %p14, %r89, %r91;
@%p14 bra $Lt_1_21762;
// %r93 = order2 + order - 1 (highest coefficient index); %p15 = (%r93 > -1).
sub.s32 %r93, %r3, 1;
cvt.s64.s32 %rd28, %r83;
mul.wide.s32 %rd29, %r83, 16;
mov.s32 %r94, -1;
setp.gt.s32 %p15, %r93, %r94;
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
mul.lo.u64 %rd31, %rd25, 16;
add.u64 %rd32, %rd30, %rd31;
$Lt_1_22274:
//<loop> Loop body line 148, nesting depth: 4, estimated iterations: unknown
.loc 17 149 0
// Record fields: %f4 = [+0], %f5 = [+4], %f6 = [+8], %f11 = [+12].
ld.global.f32 %f4, [%rd32+0];
@!%p15 bra $Lt_1_29954;
sub.s32 %r95, %r93, %r74;
mov.s32 %r96, -1;
sub.s32 %r97, %r96, %r74;
cvt.s64.s32 %rd33, %r2;
mul.wide.s32 %rd34, %r2, 4;
ld.global.f32 %f5, [%rd32+4];
ld.global.f32 %f6, [%rd32+8];
// Two Horner evaluations run in lock step, with k stepping down from
// order2 + order - 1 by "order" while k > -1:
//   %f8 = %f8 * %f5 + rho_coeff[k - l]   (%rd37 walks rho_coeff[k - l])
//   %f7 = %f7 * %f6 + rho_coeff[k - m]   (%rd40 walks rho_coeff[k - m])
cvt.s64.s32 %rd35, %r95;
mul.wide.s32 %rd36, %r95, 4;
add.u64 %rd37, %rd1, %rd36;
sub.s32 %r98, %r93, %r71;
cvt.s64.s32 %rd38, %r98;
mul.wide.s32 %rd39, %r98, 4;
add.u64 %rd40, %rd1, %rd39;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, 0f00000000; // 0
$Lt_1_23042:
//<loop> Loop body line 149, nesting depth: 5, estimated iterations: unknown
.loc 17 154 0
ld.shared.f32 %f9, [%rd37+0];
fma.rn.ftz.f32 %f8, %f8, %f5, %f9;
.loc 17 155 0
ld.shared.f32 %f10, [%rd40+0];
fma.rn.ftz.f32 %f7, %f7, %f6, %f10;
sub.u64 %rd40, %rd40, %rd34;
sub.s32 %r95, %r95, %r2;
sub.u64 %rd37, %rd37, %rd34;
setp.gt.s32 %p16, %r95, %r97;
@%p16 bra $Lt_1_23042;
bra.uni $Lt_1_22530;
$Lt_1_29954:
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, 0f00000000; // 0
$Lt_1_22530:
.loc 17 157 0
// %f13 = record[+12] * (%f7 * %f8): the record weight times the y and z factors.
ld.global.f32 %f11, [%rd32+12];
mul.ftz.f32 %f12, %f7, %f8;
mul.ftz.f32 %f13, %f11, %f12;
@!%p8 bra $Lt_1_23554;
mov.s32 %r99, %r2;
cvt.s64.s32 %rd41, %r4;
mul.wide.s32 %rd42, %r4, 4;
add.u64 %rd43, %rd20, %rd42;
mov.s32 %r100, 0;
mov.s32 %r101, %r99;
$Lt_1_24066:
//<loop> Loop body line 157, nesting depth: 5, estimated iterations: unknown
.loc 17 161 0
// For n = %r100 in 0 .. order-1: Horner evaluation in %f4 over
// rho_coeff[k], k = order2 + n stepping down by "order" while k >= n.
add.s32 %r102, %r100, %r1;
mov.s32 %r103, %r102;
setp.lt.s32 %p17, %r102, %r100;
@%p17 bra $Lt_1_30466;
cvt.s64.s32 %rd44, %r2;
mul.wide.s32 %rd34, %r2, 4;
cvt.s64.s32 %rd45, %r102;
mul.wide.s32 %rd46, %r102, 4;
add.u64 %rd47, %rd1, %rd46;
mov.f32 %f14, 0f00000000; // 0
$Lt_1_24834:
//<loop> Loop body line 161, nesting depth: 6, estimated iterations: unknown
.loc 17 162 0
ld.shared.f32 %f15, [%rd47+0];
fma.rn.ftz.f32 %f14, %f4, %f14, %f15;
sub.s32 %r103, %r103, %r2;
sub.u64 %rd47, %rd47, %rd34;
setp.ge.s32 %p18, %r103, %r100;
@%p18 bra $Lt_1_24834;
bra.uni $Lt_1_24322;
$Lt_1_30466:
mov.f32 %f14, 0f00000000; // 0
$Lt_1_24322:
.loc 17 163 0
// ans[n][tid] += %f14 * %f13.
ld.shared.f32 %f16, [%rd43+0];
fma.rn.ftz.f32 %f17, %f14, %f13, %f16;
st.shared.f32 [%rd43+0], %f17;
add.s32 %r100, %r100, 1;
add.u64 %rd43, %rd43, 256;
setp.ne.s32 %p19, %r100, %r2;
@%p19 bra $Lt_1_24066;
$Lt_1_23554:
// Next record of this cell.
add.s32 %r92, %r92, %r83;
add.u64 %rd32, %rd29, %rd32;
setp.gt.s32 %p20, %r91, %r92;
@%p20 bra $Lt_1_22274;
$Lt_1_21762:
// Next l (y stencil index).
add.s32 %r74, %r74, 1;
setp.ne.s32 %p21, %r55, %r74;
@%p21 bra $Lt_1_21506;
$Lt_1_20994:
// Next m (z stencil index).
add.s32 %r71, %r71, 1;
setp.ne.s32 %p22, %r47, %r71;
@%p22 bra $Lt_1_20738;
$Lt_1_20226:
$Lt_1_19714:
.loc 17 172 0
// Step 3: barrier, then rotate the per-pid row of front:
//   fid < order : front[pid][fid] = front[pid][fid + 32]; front[pid][fid + 32] = 0
//   otherwise   : front[pid][fid] = 0
bar.sync 0;
@!%p2 bra $Lt_1_26626;
.loc 17 174 0
ld.shared.f32 %f18, [%rd19+128];
st.shared.f32 [%rd19+0], %f18;
.loc 17 175 0
mov.f32 %f19, 0f00000000; // 0
st.shared.f32 [%rd19+128], %f19;
bra.uni $Lt_1_26370;
$Lt_1_26626:
.loc 17 177 0
mov.f32 %f20, 0f00000000; // 0
st.shared.f32 [%rd19+0], %f20;
$Lt_1_26370:
// Step 4: for n = 0 .. order-1: front[pid][fid + n] += ans[n][tid], with a
// block-wide barrier after every n (%rd51 advances 4 bytes, %rd50 256 bytes).
@!%p8 bra $Lt_1_26882;
mov.s32 %r104, %r2;
cvt.s64.s32 %rd48, %r4;
mov.s32 %r105, %r11;
add.s32 %r106, %r11, %r2;
mul.wide.s32 %rd49, %r4, 4;
add.u64 %rd50, %rd20, %rd49;
mov.s64 %rd51, %rd19;
mov.s32 %r107, %r104;
$Lt_1_27394:
//<loop> Loop body line 177, nesting depth: 2, estimated iterations: unknown
.loc 17 180 0
ld.shared.f32 %f21, [%rd50+0];
ld.shared.f32 %f22, [%rd51+0];
add.ftz.f32 %f23, %f21, %f22;
st.shared.f32 [%rd51+0], %f23;
.loc 17 181 0
bar.sync 0;
add.s32 %r105, %r105, 1;
add.u64 %rd51, %rd51, 4;
add.u64 %rd50, %rd50, 256;
setp.ne.s32 %p23, %r105, %r106;
@%p23 bra $Lt_1_27394;
$Lt_1_26882:
// Step 5: write back only when nx < npts_x and nz < npts_z.
set.lt.u32.s32 %r108, %r66, %r17;
neg.s32 %r109, %r108;
and.b32 %r110, %r61, %r109;
mov.u32 %r111, 0;
setp.eq.s32 %p24, %r110, %r111;
@%p24 bra $Lt_1_27906;
.loc 17 185 0
// brick[nz * npts_y * npts_x + ny * npts_x + nx] = front[pid][fid].
ld.shared.f32 %f24, [%rd19+0];
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
add.s32 %r112, %r11, %r58;
mul.lo.s32 %r113, %r37, %r17;
mul.lo.s32 %r114, %r38, %r113;
mul.lo.s32 %r115, %r48, %r17;
add.s32 %r116, %r114, %r115;
add.s32 %r117, %r112, %r116;
cvt.s64.s32 %rd53, %r117;
mul.wide.s32 %rd54, %r117, 4;
add.u64 %rd55, %rd52, %rd54;
st.global.f32 [%rd55+0], %f24;
$Lt_1_27906:
// Advance 32 points along x; stop after npts_x / 32 + 1 iterations.
add.s32 %r58, %r58, 32;
setp.ne.s32 %p25, %r58, %r59;
@%p25 bra $Lt_1_18434;
$Lt_1_17922:
.loc 17 189 0
exit;
$LDWend_make_rho:
} // make_rho
// interp
// Each thread handles at most one particle (index = ctaid.x * ntid.x + tid.x,
// active when index < nlocal). It fetches the particle from pos_tex / q_tex,
// builds per-axis polynomial weights from the coefficients in _rho_coeff,
// and accumulates a weighted sum of the first three f32 components of
// 16-byte brick elements over an order x order x order stencil. The three
// sums are stored as one 16-byte vector at ans[index].
//
// Shared memory as indexed by this code:
//   rho_coeff : 256 bytes, f32 indexed by tid when tid < order2 + order
//   rho1d_0   : 2048 bytes, addressed as [n][tid] with a 256-byte row (x weights)
//   rho1d_1   : 2048 bytes, same layout (y weights)
// NOTE(review): the 256-byte row stride (64 f32) suggests ntid.x <= 64 is
// assumed - confirm against the host code.
// NOTE(review): the x_ and q_ pointer parameters are never loaded in this
// kernel; all particle data comes from the texture references.
//
// Change relative to the compiler output: the fourth component of the stored
// vector used to be copied from %f67, a register that is never written, so an
// undefined value reached global memory. It is now an explicit 0. The same
// fix belongs in the CUDA source this file was generated from.
.entry interp (
.param .u64 __cudaparm_interp_x_,
.param .u64 __cudaparm_interp_q_,
.param .s32 __cudaparm_interp_nlocal,
.param .u64 __cudaparm_interp_brick,
.param .u64 __cudaparm_interp__rho_coeff,
.param .s32 __cudaparm_interp_npts_x,
.param .s32 __cudaparm_interp_npts_yx,
.param .f32 __cudaparm_interp_b_lo_x,
.param .f32 __cudaparm_interp_b_lo_y,
.param .f32 __cudaparm_interp_b_lo_z,
.param .f32 __cudaparm_interp_delxinv,
.param .f32 __cudaparm_interp_delyinv,
.param .f32 __cudaparm_interp_delzinv,
.param .s32 __cudaparm_interp_order,
.param .s32 __cudaparm_interp_order2,
.param .f32 __cudaparm_interp_qqrd2e_scale,
.param .u64 __cudaparm_interp_ans)
{
.reg .u32 %r<56>;
.reg .u64 %rd<37>;
.reg .f32 %f<69>;
.reg .pred %p<14>;
.shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888[256];
.shared .align 4 .b8 __cuda___cuda_local_var_32677_33_non_const_rho1d_03144[2048];
.shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_rho1d_15192[2048];
// __cuda_local_var_32694_12_non_const_ek = 16
.loc 17 199 0
$LDWbegin_interp:
// %r1 = order2, %r2 = order, %r3 = order2 + order, %r4 = tid.
ld.param.s32 %r1, [__cudaparm_interp_order2];
ld.param.s32 %r2, [__cudaparm_interp_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_2_8706;
.loc 17 206 0
// Threads with tid < order2 + order copy one coefficient:
// rho_coeff[tid] (shared) = _rho_coeff[tid] (global).
mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 4;
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_2_8706:
mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;
.loc 17 207 0
// All threads reach this barrier before any of them can exit below.
bar.sync 0;
// %r8 = particle index = ctaid.x * ntid.x + tid; exit when nlocal <= index.
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mul.lo.u32 %r7, %r5, %r6;
add.u32 %r8, %r4, %r7;
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
setp.le.s32 %p2, %r9, %r8;
@%p2 bra $Lt_2_9218;
.loc 17 215 0
// Fetch the position: (%f6, %f7, %f8) = first three components of pos_tex[index].
mov.u32 %r10, %r8;
mov.s32 %r11, 0;
mov.u32 %r12, %r11;
mov.s32 %r13, 0;
mov.u32 %r14, %r13;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
mov.f32 %f8, %f4;
.loc 17 216 0
// Fetch the scalar from q_tex[index] and scale it: %f15 = qqrd2e_scale * q.
mov.u32 %r17, %r8;
mov.s32 %r18, 0;
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];
mov.f32 %f13, %f9;
ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];
mul.ftz.f32 %f15, %f14, %f13;
mov.f32 %f16, 0f00000000; // 0
// A scaled scalar equal to zero skips all work and stores zeros.
setp.neu.ftz.f32 %p3, %f15, %f16;
@!%p3 bra $Lt_2_9986;
// %p4 = (order > 0); %f20 = delxinv * (x - b_lo_x).
mov.s32 %r24, 0;
setp.gt.s32 %p4, %r2, %r24;
ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];
sub.ftz.f32 %f18, %f6, %f17;
ld.param.f32 %f19, [__cudaparm_interp_delxinv];
mul.ftz.f32 %f20, %f19, %f18;
@!%p4 bra $Lt_2_16386;
mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;
mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;
// %r25 = nx = trunc(%f20); %f24 = dx = nx + 0.5 - %f20.
cvt.rzi.ftz.s32.f32 %r25, %f20;
cvt.rn.f32.s32 %f21, %r25;
mov.f32 %f22, 0f3f000000; // 0.5
add.ftz.f32 %f23, %f21, %f22;
sub.ftz.f32 %f24, %f23, %f20;
// %f28 = delyinv * (y - b_lo_y); %r26 = ny = trunc(%f28); %f32 = dy = ny + 0.5 - %f28.
ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];
sub.ftz.f32 %f26, %f7, %f25;
ld.param.f32 %f27, [__cudaparm_interp_delyinv];
mul.ftz.f32 %f28, %f27, %f26;
cvt.rzi.ftz.s32.f32 %r26, %f28;
cvt.rn.f32.s32 %f29, %r26;
mov.f32 %f30, 0f3f000000; // 0.5
add.ftz.f32 %f31, %f29, %f30;
sub.ftz.f32 %f32, %f31, %f28;
mov.s32 %r27, %r2;
cvt.s64.s32 %rd9, %r4;
mov.s32 %r28, %r1;
mul.wide.s32 %rd3, %r4, 4;
// %rd10 -> rho1d_0[n][tid], %rd11 -> rho1d_1[n][tid]; both step 256 bytes per n.
add.u64 %rd10, %rd3, %rd7;
add.u64 %rd11, %rd3, %rd8;
mov.s32 %r29, 0;
mov.s32 %r30, %r27;
$Lt_2_10754:
//<loop> Loop body line 216, nesting depth: 1, estimated iterations: unknown
.loc 17 235 0
// For n = %r29 in 0 .. order-1 (%r28 = order2 + n): clear both weights, then
// Horner-evaluate them over rho_coeff[k], k = order2 + n stepping down by
// "order" while k >= n. The running values are stored on every step.
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
st.shared.f32 [%rd10+0], %f34;
.loc 17 236 0
mov.f32 %f35, 0f00000000; // 0
mov.f32 %f36, 0f00000000; // 0
st.shared.f32 [%rd11+0], %f36;
.loc 17 237 0
mov.s32 %r31, %r28;
setp.lt.s32 %p5, %r28, %r29;
@%p5 bra $Lt_2_11010;
cvt.s64.s32 %rd12, %r2;
mul.wide.s32 %rd13, %r2, 4;
cvt.s64.s32 %rd14, %r28;
mul.wide.s32 %rd15, %r28, 4;
add.u64 %rd16, %rd1, %rd15;
$Lt_2_11522:
//<loop> Loop body line 237, nesting depth: 2, estimated iterations: unknown
.loc 17 238 0
// rho1d_0[n][tid] = rho1d_0[n][tid] * dx + rho_coeff[k]
ld.shared.f32 %f37, [%rd16+0];
fma.rn.ftz.f32 %f33, %f33, %f24, %f37;
st.shared.f32 [%rd10+0], %f33;
.loc 17 239 0
// rho1d_1[n][tid] = rho1d_1[n][tid] * dy + rho_coeff[k]
fma.rn.ftz.f32 %f35, %f35, %f32, %f37;
st.shared.f32 [%rd11+0], %f35;
sub.s32 %r31, %r31, %r2;
sub.u64 %rd16, %rd16, %rd13;
setp.ge.s32 %p6, %r31, %r29;
@%p6 bra $Lt_2_11522;
$Lt_2_11010:
add.s32 %r29, %r29, 1;
add.s32 %r28, %r28, 1;
add.u64 %rd11, %rd11, 256;
add.u64 %rd10, %rd10, 256;
setp.ne.s32 %p7, %r28, %r3;
@%p7 bra $Lt_2_10754;
bra.uni $Lt_2_10242;
$Lt_2_16386:
// order <= 0: only nx and the shared base addresses are needed at the join.
cvt.rzi.ftz.s32.f32 %r25, %f20;
mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;
mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;
$Lt_2_10242:
.loc 17 243 0
// %f41 = delzinv * (z - b_lo_z); %r32 = nz = trunc(%f41);
// %r35 = nx + nz * npts_yx.
ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];
sub.ftz.f32 %f39, %f8, %f38;
ld.param.f32 %f40, [__cudaparm_interp_delzinv];
mul.ftz.f32 %f41, %f40, %f39;
cvt.rzi.ftz.s32.f32 %r32, %f41;
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
mul.lo.s32 %r34, %r32, %r33;
add.s32 %r35, %r25, %r34;
@!%p4 bra $Lt_2_16898;
// %f45 = dz = nz + 0.5 - %f41.
cvt.rn.f32.s32 %f42, %r32;
mov.f32 %f43, 0f3f000000; // 0.5
add.ftz.f32 %f44, %f42, %f43;
sub.ftz.f32 %f45, %f44, %f41;
mov.s32 %r36, %r2;
// ny is recomputed here (%r37); %r40 = ny * npts_x + nx + nz * npts_yx is the
// brick index of the first stencil point.
ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];
sub.ftz.f32 %f47, %f7, %f46;
cvt.s64.s32 %rd17, %r4;
ld.param.f32 %f48, [__cudaparm_interp_delyinv];
mul.ftz.f32 %f49, %f48, %f47;
cvt.rzi.ftz.s32.f32 %r37, %f49;
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
mul.lo.s32 %r39, %r37, %r38;
mul.wide.s32 %rd3, %r4, 4;
add.s32 %r40, %r39, %r35;
// %rd18 = &rho1d_0[0][tid], %rd19 = &rho1d_1[0][tid];
// %rd21 = 16 * npts_x = byte step between consecutive y rows of brick.
add.u64 %rd18, %rd3, %rd7;
add.u64 %rd19, %rd3, %rd8;
cvt.s64.s32 %rd20, %r38;
mul.wide.s32 %rd21, %r38, 16;
mov.s32 %r41, %r40;
ld.param.u64 %rd22, [__cudaparm_interp_brick];
mov.s32 %r42, 0;
// Accumulators: %f52 / %f51 / %f50 become components 0 / 1 / 2 of the result.
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
mov.s32 %r43, %r36;
$Lt_2_12802:
//<loop> Loop body line 243, nesting depth: 1, estimated iterations: unknown
.loc 17 246 0
// z loop, n = %r42 in 0 .. order-1: Horner-evaluate the z weight %f53 in dz
// over rho_coeff[k], k = order2 + n stepping down by "order" while k >= n.
add.s32 %r44, %r42, %r1;
mov.s32 %r45, %r44;
setp.lt.s32 %p8, %r44, %r42;
@%p8 bra $Lt_2_17154;
cvt.s64.s32 %rd23, %r2;
mul.wide.s32 %rd13, %r2, 4;
cvt.s64.s32 %rd24, %r44;
mul.wide.s32 %rd25, %r44, 4;
add.u64 %rd26, %rd1, %rd25;
mov.f32 %f53, 0f00000000; // 0
$Lt_2_13570:
//<loop> Loop body line 246, nesting depth: 2, estimated iterations: unknown
.loc 17 247 0
ld.shared.f32 %f54, [%rd26+0];
fma.rn.ftz.f32 %f53, %f45, %f53, %f54;
sub.s32 %r45, %r45, %r2;
sub.u64 %rd26, %rd26, %rd13;
setp.ge.s32 %p9, %r45, %r42;
@%p9 bra $Lt_2_13570;
bra.uni $Lt_2_13058;
$Lt_2_17154:
mov.f32 %f53, 0f00000000; // 0
$Lt_2_13058:
.loc 17 249 0
// %f55 = (qqrd2e_scale * q) * z weight. %r46 = brick index at the start of
// this z plane; %rd29 = its byte offset (16 bytes per brick element).
mov.s32 %r46, %r41;
mov.s32 %r47, %r2;
mul.ftz.f32 %f55, %f15, %f53;
mov.s32 %r48, %r46;
mov.s64 %rd27, %rd19;
cvt.s64.s32 %rd28, %r46;
mul.wide.s32 %rd29, %r46, 16;
mov.s32 %r49, 0;
mov.s32 %r50, %r47;
$Lt_2_14594:
//<loop> Loop body line 249, nesting depth: 2, estimated iterations: unknown
// y loop, m = %r49 in 0 .. order-1: %f57 = %f55 * rho1d_1[m][tid];
// %rd31 = address of the first brick element of this row.
mov.s32 %r51, %r2;
mov.s32 %r52, %r48;
add.s32 %r53, %r48, %r2;
mov.s64 %rd30, %rd18;
ld.shared.f32 %f56, [%rd27+0];
add.u64 %rd31, %rd29, %rd22;
mul.ftz.f32 %f57, %f55, %f56;
mov.s32 %r54, %r51;
$Lt_2_15362:
//<loop> Loop body line 249, nesting depth: 3, estimated iterations: unknown
.loc 17 253 0
// x loop over "order" consecutive brick elements:
// %f59 = rho1d_0[l][tid] * %f57 is the full weight of this stencil point.
// The fourth component of the brick element is discarded.
ld.shared.f32 %f58, [%rd30+0];
mul.ftz.f32 %f59, %f58, %f57;
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];
.loc 17 255 0
mul.ftz.f32 %f63, %f59, %f60;
sub.ftz.f32 %f52, %f52, %f63;
.loc 17 256 0
mul.ftz.f32 %f64, %f59, %f61;
sub.ftz.f32 %f51, %f51, %f64;
.loc 17 257 0
mul.ftz.f32 %f65, %f59, %f62;
sub.ftz.f32 %f50, %f50, %f65;
add.s32 %r52, %r52, 1;
add.u64 %rd31, %rd31, 16;
add.u64 %rd30, %rd30, 256;
setp.ne.s32 %p10, %r52, %r53;
@%p10 bra $Lt_2_15362;
// Next y row: brick index += npts_x, rho1d_1 row += 1.
add.s32 %r49, %r49, 1;
add.s32 %r48, %r48, %r38;
add.u64 %rd29, %rd29, %rd21;
add.u64 %rd27, %rd27, 256;
setp.ne.s32 %p11, %r49, %r2;
@%p11 bra $Lt_2_14594;
// Next z plane: brick index += npts_yx.
add.s32 %r42, %r42, 1;
add.s32 %r41, %r46, %r33;
setp.ne.s32 %p12, %r42, %r2;
@%p12 bra $Lt_2_12802;
bra.uni $Lt_2_9730;
$Lt_2_16898:
// order <= 0: nothing accumulated.
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
bra.uni $Lt_2_9730;
$Lt_2_9986:
// Scaled scalar was zero: nothing accumulated.
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
$Lt_2_9730:
.loc 17 264 0
// Store the result vector at ans + 16 * index.
ld.param.u64 %rd32, [__cudaparm_interp_ans];
cvt.s64.s32 %rd33, %r8;
mul.wide.s32 %rd34, %r8, 16;
add.u64 %rd35, %rd32, %rd34;
// Fourth component: no value is computed for it in this kernel. Write an
// explicit 0 rather than copying the never-written register %f67, so the
// bytes stored to global memory are well defined.
mov.f32 %f66, 0f00000000; // 0
st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};
$Lt_2_9218:
.loc 17 266 0
exit;
$LDWend_interp:
} // interp
Event Timeline
Log In to Comment