Page MenuHomec4science

ellipsoid_nbor.ptx
No OneTemporary

File Metadata

Created
Thu, Dec 26, 19:50

ellipsoid_nbor.ptx

// PTX module header: this module is written against PTX ISA version 2.3,
// targets the sm_20 architecture, and uses 64-bit addresses.
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009ad9_00000000-9_lal_ellipsoid_nbor.cpp3.i (/home/sjplimp/ccBI#.7CLzz0)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
// Source-file table. Each .file directive binds an index to a path; the
// ".loc <file> <line> <column>" directives in this module refer to these
// indices. Index 16 is lal_ellipsoid_nbor.cu, the file every .loc in the
// kernels points at.
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009ad9_00000000-8_lal_ellipsoid_nbor.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_ellipsoid_nbor.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
//-----------------------------------------------------------------------
// kernel_nbor
//
// One thread handles one entry ii = start + ctaid.x*ntid.x + tid.x and
// does nothing when ii >= inum. For that entry it walks a strided list of
// 32-bit indices in dev_ij, keeps the ones that pass a "form" range test
// and a squared-distance cutoff test, and writes the survivors plus their
// count into dev_nbor.
//
// Parameters (as used by the instructions below):
//   x_          base of an array of 16-byte records read as 4 x f32
//   cut_form    base of an array of 8-byte records read as 2 x f32:
//               offset 0 is compared against the squared distance,
//               offset 4 is compared against form_low / form_high
//   ntypes      row stride used to index cut_form
//   dev_nbor    output array of s32
//   nbor_pitch  element stride (in s32 units) between consecutive list
//               entries, both in dev_ij and in dev_nbor
//   start       offset added to the global thread index
//   inum        exclusive upper bound on ii
//   dev_ij      input array of s32
//   form_low    inclusive lower bound for the offset-4 value of cut_form
//   form_high   inclusive upper bound for the offset-4 value of cut_form
//
// Memory layout read/written per thread (s32 element indices):
//   dev_ij[ii]                         -> i      (index into x_)
//   dev_ij[ii + nbor_pitch]            -> numj   (list length)
//   dev_ij[ii + (2+k)*nbor_pitch]      -> k-th list entry, k = 0..numj-1
//   dev_nbor[ii + nbor_pitch]          <- number of entries kept
//   dev_nbor[ii + (2+m)*nbor_pitch]    <- m-th kept entry
//-----------------------------------------------------------------------
.entry kernel_nbor (
.param .u64 __cudaparm_kernel_nbor_x_,
.param .u64 __cudaparm_kernel_nbor_cut_form,
.param .s32 __cudaparm_kernel_nbor_ntypes,
.param .u64 __cudaparm_kernel_nbor_dev_nbor,
.param .s32 __cudaparm_kernel_nbor_nbor_pitch,
.param .s32 __cudaparm_kernel_nbor_start,
.param .s32 __cudaparm_kernel_nbor_inum,
.param .u64 __cudaparm_kernel_nbor_dev_ij,
.param .s32 __cudaparm_kernel_nbor_form_low,
.param .s32 __cudaparm_kernel_nbor_form_high)
{
.reg .u32 %r<26>;
.reg .u64 %rd<33>;
.reg .f32 %f<20>;
.reg .pred %p<8>;
.loc 16 29 0
$LDWbegin_kernel_nbor:
// %r7 = ii = start + ctaid.x * ntid.x + tid.x.
// The block offset uses mul24 (24-bit operands, low 32 bits of the product).
cvt.s32.u32 %r1, %ctaid.x;
cvt.s32.u32 %r2, %ntid.x;
mul24.lo.s32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r3, %r4;
ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];
add.u32 %r7, %r6, %r5;
// Threads with inum <= ii have no work and jump straight to exit.
ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];
setp.le.s32 %p1, %r8, %r7;
@%p1 bra $Lt_0_4354;
.loc 16 36 0
// %rd4 = &dev_ij[ii]; %r9 = i = dev_ij[ii].
cvt.s64.s32 %rd1, %r7;
ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];
mul.wide.s32 %rd3, %r7, 4;
add.u64 %rd4, %rd2, %rd3;
ld.global.s32 %r9, [%rd4+0];
.loc 16 38 0
// %rd6 = nbor_pitch * 4 = byte stride between list entries.
// %r11 = numj = dev_ij[ii + nbor_pitch].
ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];
cvt.s64.s32 %rd5, %r10;
mul.wide.s32 %rd6, %r10, 4;
add.u64 %rd7, %rd6, %rd4;
ld.global.s32 %r11, [%rd7+0];
.loc 16 39 0
// %rd8 = &dev_ij[ii + 2*nbor_pitch] (first list entry);
// %rd9 is the read cursor that the loop advances.
add.u64 %rd8, %rd6, %rd7;
mov.s64 %rd9, %rd8;
.loc 16 41 0
// %rd14 = &dev_nbor[ii + 2*nbor_pitch]: write cursor for kept entries.
ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];
add.u64 %rd11, %rd1, %rd5;
add.u64 %rd12, %rd5, %rd11;
mul.lo.u64 %rd13, %rd12, 4;
add.u64 %rd14, %rd10, %rd13;
.loc 16 43 0
// {%f1,%f2,%f3,%f4} = x_[i], loaded as one 16-byte vector.
ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];
cvt.s64.s32 %rd16, %r9;
mul.wide.s32 %rd17, %r9, 16;
add.u64 %rd18, %rd15, %rd17;
ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];
// %rd21 = list end = %rd8 + numj * nbor_pitch * 4 bytes.
// The numj * nbor_pitch product is formed in 32 bits before widening.
cvt.s32.s64 %r12, %rd5;
mul.lo.s32 %r13, %r12, %r11;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
add.u64 %rd21, %rd8, %rd20;
// Empty list (start >= end, unsigned compare): skip the loop, count = 0.
setp.ge.u64 %p2, %rd8, %rd21;
@%p2 bra $Lt_0_6402;
// Loop-invariant setup:
//   %r14 = (int) x_[i].w, rounded toward zero
//          (presumably the type of particle i - confirm against the .cu source)
//   %f5  = (float) form_low
//   %r17 = ntypes * %r14  (row offset into cut_form)
//   %rd22 = cut_form base, %r18 = kept-entry counter
cvt.rzi.ftz.s32.f32 %r14, %f4;
ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];
cvt.rn.f32.s32 %f5, %r15;
ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];
mul.lo.s32 %r17, %r16, %r14;
ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];
mov.s32 %r18, 0;
$Lt_0_5378:
//<loop> Loop body line 43, nesting depth: 1, estimated iterations: unknown
.loc 16 49 0
// %r20 = j = list entry with only the low 30 bits kept
// (1073741823 = 0x3FFFFFFF); the two high bits are discarded here.
ld.global.s32 %r19, [%rd9+0];
and.b32 %r20, %r19, 1073741823;
.loc 16 50 0
// {%f6,%f7,%f8,%f9} = x_[j].
cvt.s64.s32 %rd23, %r20;
mul.wide.s32 %rd24, %r20, 16;
add.u64 %rd25, %rd15, %rd24;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];
.loc 16 53 0
// %r22 = ntypes * (int)x_[i].w + (int)x_[j].w; %rd28 = &cut_form[%r22].
// %f10 = second float (byte offset 4) of that 8-byte record.
cvt.rzi.ftz.s32.f32 %r21, %f9;
add.s32 %r22, %r21, %r17;
cvt.s64.s32 %rd26, %r22;
mul.wide.s32 %rd27, %r22, 8;
add.u64 %rd28, %rd22, %rd27;
ld.global.f32 %f10, [%rd28+4];
.loc 16 48 0
// Range test done in floating point: keep only form_low <= %f10 <= form_high.
// The branches are taken on the negated predicate, so an unordered (NaN)
// comparison also skips the entry.
setp.le.ftz.f32 %p3, %f5, %f10;
@!%p3 bra $Lt_0_6658;
ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];
cvt.rn.f32.s32 %f11, %r23;
setp.ge.ftz.f32 %p4, %f11, %f10;
@!%p4 bra $Lt_0_6658;
// %f18 = dx*dx + dy*dy + dz*dz between x_[j] and x_[i] (xyz components only).
// %f15 = first float (byte offset 0) of the cut_form record.
sub.ftz.f32 %f12, %f6, %f1;
sub.ftz.f32 %f13, %f7, %f2;
sub.ftz.f32 %f14, %f8, %f3;
ld.global.f32 %f15, [%rd28+0];
mul.ftz.f32 %f16, %f12, %f12;
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
fma.rn.ftz.f32 %f18, %f14, %f14, %f17;
// Keep j only when the squared distance is strictly below %f15.
setp.gt.ftz.f32 %p5, %f15, %f18;
@!%p5 bra $Lt_0_6658;
.loc 16 64 0
// Store the masked index j at the write cursor ...
st.global.s32 [%rd14+0], %r20;
.loc 16 65 0
// ... advance the write cursor by one pitch ...
add.u64 %rd14, %rd6, %rd14;
.loc 16 66 0
// ... and count the kept entry.
add.s32 %r18, %r18, 1;
$Lt_0_6658:
$L_0_3842:
.loc 16 47 0
// Advance the read cursor by one pitch; loop while it is below the list end.
add.u64 %rd9, %rd6, %rd9;
setp.gt.u64 %p6, %rd21, %rd9;
@%p6 bra $Lt_0_5378;
bra.uni $Lt_0_4866;
$Lt_0_6402:
// Empty-list path: nothing was kept.
mov.s32 %r18, 0;
$Lt_0_4866:
.loc 16 70 0
// dev_nbor[ii + nbor_pitch] = number of kept entries.
add.s32 %r24, %r12, %r7;
cvt.s64.s32 %rd29, %r24;
mul.wide.s32 %rd30, %r24, 4;
add.u64 %rd31, %rd10, %rd30;
st.global.s32 [%rd31+0], %r18;
$Lt_0_4354:
.loc 16 72 0
exit;
$LDWend_kernel_nbor:
} // kernel_nbor
//-----------------------------------------------------------------------
// kernel_nbor_fast
//
// Same filtering as kernel_nbor, but the cut_form table is first copied
// into two shared-memory arrays and then indexed with a fixed row stride
// of 11 instead of a run-time ntypes parameter. The form range test is
// done on integers here (the form value is converted to s32 when staged).
//
// Parameters (as used by the instructions below):
//   x_          base of an array of 16-byte records read as 4 x f32
//   cut_form    base of an array of 8-byte records read as 2 x f32
//   dev_nbor    output array of s32
//   nbor_pitch  element stride (in s32 units) between list entries
//   start       offset added to the global thread index
//   inum        exclusive upper bound on ii
//   dev_ij      input array of s32
//   form_low    inclusive lower bound for the staged form value
//   form_high   inclusive upper bound for the staged form value
//
// Shared storage: two 484-byte arrays, i.e. 121 four-byte slots each
// (121 = 11 * 11, matching the stride of 11 used for indexing).
//
// NOTE(review): each thread with tid.x in 0..120 stages exactly one table
// slot and there is no loop, so this code appears to rely on the block
// having at least 121 threads - confirm against the launch configuration.
// NOTE(review): the shared-table index 11*(int)x_[i].w + (int)x_[j].w is
// not range-checked in this code.
//-----------------------------------------------------------------------
.entry kernel_nbor_fast (
.param .u64 __cudaparm_kernel_nbor_fast_x_,
.param .u64 __cudaparm_kernel_nbor_fast_cut_form,
.param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,
.param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_nbor_fast_start,
.param .s32 __cudaparm_kernel_nbor_fast_inum,
.param .u64 __cudaparm_kernel_nbor_fast_dev_ij,
.param .s32 __cudaparm_kernel_nbor_fast_form_low,
.param .s32 __cudaparm_kernel_nbor_fast_form_high)
{
.reg .u32 %r<28>;
.reg .u64 %rd<42>;
.reg .f32 %f<19>;
.reg .pred %p<9>;
.shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];
.shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];
.loc 16 84 0
$LDWbegin_kernel_nbor_fast:
// Staging phase: only threads with tid.x <= 120 copy a table slot.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 120;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_5122;
.loc 16 90 0
// {%f1,%f2} = cut_form[tid.x]; cutsq_shared[tid.x] = %f1.
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
cvt.s64.s32 %rd3, %r1;
mul.wide.s32 %rd4, %r1, 4;
ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];
mul.wide.s32 %rd6, %r1, 8;
add.u64 %rd7, %rd5, %rd6;
ld.global.v2.f32 {%f1,%f2}, [%rd7+0];
add.u64 %rd8, %rd4, %rd2;
st.shared.f32 [%rd8+0], %f1;
.loc 16 91 0
// form_shared[tid.x] = (int) %f2, rounded toward zero.
cvt.rzi.ftz.s32.f32 %r3, %f2;
add.u64 %rd9, %rd4, %rd1;
st.shared.s32 [%rd9+0], %r3;
$Lt_1_5122:
// The shared base addresses are set again here because threads that
// skipped the staging branch arrive without %rd1/%rd2 initialised.
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
.loc 16 94 0
// Barrier 0: executed by every thread, before the ii < inum test, so the
// staged table is complete before any thread reads it.
bar.sync 0;
// %r9 = ii = start + ctaid.x * ntid.x + tid.x (full 32-bit multiply).
cvt.s32.u32 %r4, %ctaid.x;
cvt.s32.u32 %r5, %ntid.x;
mul.lo.s32 %r6, %r4, %r5;
ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];
add.s32 %r8, %r7, %r6;
add.s32 %r9, %r8, %r1;
// Threads with inum <= ii have no further work.
ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];
setp.le.s32 %p2, %r10, %r9;
@%p2 bra $Lt_1_5634;
.loc 16 98 0
// %rd13 = &dev_ij[ii]; %r11 = i = dev_ij[ii].
cvt.s64.s32 %rd10, %r9;
ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];
mul.wide.s32 %rd12, %r9, 4;
add.u64 %rd13, %rd11, %rd12;
ld.global.s32 %r11, [%rd13+0];
.loc 16 100 0
// %rd15 = nbor_pitch * 4 = byte stride between list entries.
// %r13 = numj = dev_ij[ii + nbor_pitch].
ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];
cvt.s64.s32 %rd14, %r12;
mul.wide.s32 %rd15, %r12, 4;
add.u64 %rd16, %rd15, %rd13;
ld.global.s32 %r13, [%rd16+0];
.loc 16 101 0
// %rd17 = &dev_ij[ii + 2*nbor_pitch] (first list entry);
// %rd18 is the read cursor that the loop advances.
add.u64 %rd17, %rd15, %rd16;
mov.s64 %rd18, %rd17;
.loc 16 103 0
// %rd23 = &dev_nbor[ii + 2*nbor_pitch]: write cursor for kept entries.
ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];
add.u64 %rd20, %rd10, %rd14;
add.u64 %rd21, %rd14, %rd20;
mul.lo.u64 %rd22, %rd21, 4;
add.u64 %rd23, %rd19, %rd22;
.loc 16 105 0
// {%f3,%f4,%f5,%f6} = x_[i], loaded as one 16-byte vector.
ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 16;
add.u64 %rd27, %rd24, %rd26;
ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];
// %rd30 = list end = %rd17 + numj * nbor_pitch * 4 bytes.
// The numj * nbor_pitch product is formed in 32 bits before widening.
cvt.s32.s64 %r14, %rd14;
mul.lo.s32 %r15, %r14, %r13;
cvt.s64.s32 %rd28, %r15;
mul.wide.s32 %rd29, %r15, 4;
add.u64 %rd30, %rd17, %rd29;
// Empty list (start >= end, unsigned compare): skip the loop, count = 0.
setp.ge.u64 %p3, %rd17, %rd30;
@%p3 bra $Lt_1_7682;
// Loop-invariant setup:
//   %r17 = 11 * (int) x_[i].w  (row offset into the shared tables)
//   %r18 = form_low, %r19 = kept-entry counter
cvt.rzi.ftz.s32.f32 %r16, %f6;
mul.lo.s32 %r17, %r16, 11;
ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];
mov.s32 %r19, 0;
$Lt_1_6658:
//<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown
.loc 16 112 0
// %r21 = j = list entry with only the low 30 bits kept
// (1073741823 = 0x3FFFFFFF); the two high bits are discarded here.
ld.global.s32 %r20, [%rd18+0];
and.b32 %r21, %r20, 1073741823;
.loc 16 113 0
// {%f7,%f8,%f9,%f10} = x_[j].
cvt.s64.s32 %rd31, %r21;
mul.wide.s32 %rd32, %r21, 16;
add.u64 %rd33, %rd24, %rd32;
ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];
.loc 16 111 0
// %r23 = 11 * (int)x_[i].w + (int)x_[j].w; %rd35 = byte offset %r23 * 4,
// shared by both tables. %r24 = form_shared[%r23].
cvt.rzi.ftz.s32.f32 %r22, %f10;
add.s32 %r23, %r22, %r17;
cvt.s64.s32 %rd34, %r23;
mul.wide.s32 %rd35, %r23, 4;
add.u64 %rd36, %rd35, %rd1;
ld.shared.s32 %r24, [%rd36+0];
// Integer range test: skip unless form_low <= %r24 <= form_high.
setp.lt.s32 %p4, %r24, %r18;
@%p4 bra $Lt_1_7938;
ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];
setp.lt.s32 %p5, %r25, %r24;
@%p5 bra $Lt_1_7938;
// %f17 = dx*dx + dy*dy + dz*dz between x_[j] and x_[i] (xyz components only).
// %f14 = cutsq_shared[%r23].
sub.ftz.f32 %f11, %f7, %f3;
sub.ftz.f32 %f12, %f8, %f4;
sub.ftz.f32 %f13, %f9, %f5;
add.u64 %rd37, %rd35, %rd2;
ld.shared.f32 %f14, [%rd37+0];
mul.ftz.f32 %f15, %f11, %f11;
fma.rn.ftz.f32 %f16, %f12, %f12, %f15;
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
// Keep j only when the squared distance is strictly below %f14; the branch
// is on the negated predicate, so an unordered (NaN) compare also skips.
setp.gt.ftz.f32 %p6, %f14, %f17;
@!%p6 bra $Lt_1_7938;
.loc 16 127 0
// Store the masked index j at the write cursor ...
st.global.s32 [%rd23+0], %r21;
.loc 16 128 0
// ... advance the write cursor by one pitch ...
add.u64 %rd23, %rd15, %rd23;
.loc 16 129 0
// ... and count the kept entry.
add.s32 %r19, %r19, 1;
$Lt_1_7938:
$L_1_4610:
.loc 16 110 0
// Advance the read cursor by one pitch; loop while it is below the list end.
add.u64 %rd18, %rd15, %rd18;
setp.gt.u64 %p7, %rd30, %rd18;
@%p7 bra $Lt_1_6658;
bra.uni $Lt_1_6146;
$Lt_1_7682:
// Empty-list path: nothing was kept.
mov.s32 %r19, 0;
$Lt_1_6146:
.loc 16 133 0
// dev_nbor[ii + nbor_pitch] = number of kept entries.
add.s32 %r26, %r14, %r9;
cvt.s64.s32 %rd38, %r26;
mul.wide.s32 %rd39, %r26, 4;
add.u64 %rd40, %rd19, %rd39;
st.global.s32 [%rd40+0], %r19;
$Lt_1_5634:
.loc 16 135 0
exit;
$LDWend_kernel_nbor_fast:
} // kernel_nbor_fast

Event Timeline