Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F96443108
ellipsoid_nbor.ptx
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Dec 26, 19:43
Size
10 KB
Mime Type
text/x-asm
Expires
Sat, Dec 28, 19:43 (1 d, 9 h)
Engine
blob
Format
Raw Data
Handle
23184094
Attached To
rLAMMPS lammps
ellipsoid_nbor.ptx
View Options
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009ad9_00000000-9_lal_ellipsoid_nbor.cpp3.i (/home/sjplimp/ccBI#.7CLzz0)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009ad9_00000000-8_lal_ellipsoid_nbor.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_ellipsoid_nbor.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
//-----------------------------------------------------------
// kernel_nbor
//
// Per-thread filter of a strided integer list: reads a list from dev_ij,
// keeps the entries that pass a range test and a squared-distance test,
// and writes the kept entries plus their count to dev_nbor.
//
// Thread index:  ii = start + ctaid.x * ntid.x + tid.x
// Threads with ii >= inum exit without touching memory.
//
// Parameters:
//   x_          u64  global base; records are read as 4 x f32, 16-byte stride
//   cut_form    u64  global base; records have an 8-byte stride, f32 at +0 and +4
//   ntypes      s32  multiplier applied to the 4th component of x_[i]
//   dev_nbor    u64  global base of the s32 output array
//   nbor_pitch  s32  stride, in s32 elements, between successive list slots
//   start       s32  offset added to the global thread index
//   inum        s32  exclusive upper bound on ii
//   dev_ij      u64  global base of the s32 input array
//   form_low    s32  inclusive lower bound for the f32 at cut_form[..]+4
//   form_high   s32  inclusive upper bound for the f32 at cut_form[..]+4
//
// Input layout (s32 elements of dev_ij):
//   [ii]                    index i into x_
//   [ii + pitch]            number of list entries
//   [ii + 2*pitch + k*pitch] list entry k
// Output layout (s32 elements of dev_nbor):
//   [ii + pitch]            number of entries kept
//   [ii + 2*pitch + k*pitch] kept entry k (after masking)
//
// NOTE(review): the 4th component of each x_ record is presumably a particle
// type and cut_form a (cutoff^2, form) table indexed by type pair - confirm
// against the host code. ".loc 16 N 0" refers to line N of .file 16.
//-----------------------------------------------------------
.entry kernel_nbor (
.param .u64 __cudaparm_kernel_nbor_x_,
.param .u64 __cudaparm_kernel_nbor_cut_form,
.param .s32 __cudaparm_kernel_nbor_ntypes,
.param .u64 __cudaparm_kernel_nbor_dev_nbor,
.param .s32 __cudaparm_kernel_nbor_nbor_pitch,
.param .s32 __cudaparm_kernel_nbor_start,
.param .s32 __cudaparm_kernel_nbor_inum,
.param .u64 __cudaparm_kernel_nbor_dev_ij,
.param .s32 __cudaparm_kernel_nbor_form_low,
.param .s32 __cudaparm_kernel_nbor_form_high)
{
.reg .u32 %r<26>;
.reg .u64 %rd<33>;
.reg .f32 %f<20>;
.reg .pred %p<8>;
.loc 16 29 0
$LDWbegin_kernel_nbor:
// %r7 = ii = start + ctaid.x * ntid.x + tid.x.
// mul24 multiplies only the low 24 bits of each operand.
cvt.s32.u32 %r1, %ctaid.x;
cvt.s32.u32 %r2, %ntid.x;
mul24.lo.s32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r3, %r4;
ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];
add.u32 %r7, %r6, %r5;
// Out-of-range threads (inum <= ii) jump straight to exit.
ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];
setp.le.s32 %p1, %r8, %r7;
@%p1 bra $Lt_0_4354;
.loc 16 36 0
// %rd4 = &dev_ij[ii];  %r9 = i = dev_ij[ii].
cvt.s64.s32 %rd1, %r7;
ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];
mul.wide.s32 %rd3, %r7, 4;
add.u64 %rd4, %rd2, %rd3;
ld.global.s32 %r9, [%rd4+0];
.loc 16 38 0
// %rd6 = nbor_pitch * 4 (pitch in bytes);  %r11 = dev_ij[ii + pitch] = entry count.
ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];
cvt.s64.s32 %rd5, %r10;
mul.wide.s32 %rd6, %r10, 4;
add.u64 %rd7, %rd6, %rd4;
ld.global.s32 %r11, [%rd7+0];
.loc 16 39 0
// %rd9 = read cursor, starting at &dev_ij[ii + 2*pitch] (%rd8 keeps the start).
add.u64 %rd8, %rd6, %rd7;
mov.s64 %rd9, %rd8;
.loc 16 41 0
// %rd14 = write cursor, starting at &dev_nbor[ii + 2*pitch].
ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];
add.u64 %rd11, %rd1, %rd5;
add.u64 %rd12, %rd5, %rd11;
mul.lo.u64 %rd13, %rd12, 4;
add.u64 %rd14, %rd10, %rd13;
.loc 16 43 0
// {%f1,%f2,%f3,%f4} = x_[i] (one 16-byte vector load).
ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];
cvt.s64.s32 %rd16, %r9;
mul.wide.s32 %rd17, %r9, 16;
add.u64 %rd18, %rd15, %rd17;
ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];
// %rd21 = end of list = start + (pitch * count) * 4; the product pitch * count
// is formed in 32 bits before being widened.
cvt.s32.s64 %r12, %rd5;
mul.lo.s32 %r13, %r12, %r11;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
add.u64 %rd21, %rd8, %rd20;
// Empty list (start >= end, unsigned compare): skip the loop.
setp.ge.u64 %p2, %rd8, %rd21;
@%p2 bra $Lt_0_6402;
// Loop-invariant setup:
//   %r14 = (int) x_[i].w, truncated toward zero
//   %f5  = (float) form_low
//   %r17 = ntypes * %r14  (row offset into cut_form)
//   %rd22 = cut_form base,  %r18 = kept-entry counter = 0
cvt.rzi.ftz.s32.f32 %r14, %f4;
ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];
cvt.rn.f32.s32 %f5, %r15;
ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];
mul.lo.s32 %r17, %r16, %r14;
ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];
mov.s32 %r18, 0;
$Lt_0_5378:
//<loop> Loop body line 43, nesting depth: 1, estimated iterations: unknown
.loc 16 49 0
// %r20 = j = list entry with the top two bits cleared (mask 0x3FFFFFFF).
ld.global.s32 %r19, [%rd9+0];
and.b32 %r20, %r19, 1073741823;
.loc 16 50 0
// {%f6,%f7,%f8,%f9} = x_[j].
cvt.s64.s32 %rd23, %r20;
mul.wide.s32 %rd24, %r20, 16;
add.u64 %rd25, %rd15, %rd24;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];
.loc 16 53 0
// %r22 = (int) x_[j].w + ntypes * (int) x_[i].w;  %rd28 = &cut_form[%r22].
// %f10 = second f32 of that record (the value range-tested below).
cvt.rzi.ftz.s32.f32 %r21, %f9;
add.s32 %r22, %r21, %r17;
cvt.s64.s32 %rd26, %r22;
mul.wide.s32 %rd27, %r22, 8;
add.u64 %rd28, %rd22, %rd27;
ld.global.f32 %f10, [%rd28+4];
.loc 16 48 0
// Range test in floating point: keep only if form_low <= %f10 <= form_high.
setp.le.ftz.f32 %p3, %f5, %f10;
@!%p3 bra $Lt_0_6658;
// form_high is re-read and converted on every iteration that gets this far.
ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];
cvt.rn.f32.s32 %f11, %r23;
setp.ge.ftz.f32 %p4, %f11, %f10;
@!%p4 bra $Lt_0_6658;
// %f18 = dx*dx + dy*dy + dz*dz with d = x_[j].xyz - x_[i].xyz.
// %f15 = first f32 of the cut_form record; keep the entry only if %f15 > %f18.
sub.ftz.f32 %f12, %f6, %f1;
sub.ftz.f32 %f13, %f7, %f2;
sub.ftz.f32 %f14, %f8, %f3;
ld.global.f32 %f15, [%rd28+0];
mul.ftz.f32 %f16, %f12, %f12;
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
fma.rn.ftz.f32 %f18, %f14, %f14, %f17;
setp.gt.ftz.f32 %p5, %f15, %f18;
@!%p5 bra $Lt_0_6658;
.loc 16 64 0
// Entry accepted: store the masked index at the write cursor.
st.global.s32 [%rd14+0], %r20;
.loc 16 65 0
// Advance the write cursor by one pitch.
add.u64 %rd14, %rd6, %rd14;
.loc 16 66 0
// Count the kept entry.
add.s32 %r18, %r18, 1;
$Lt_0_6658:
$L_0_3842:
.loc 16 47 0
// Advance the read cursor by one pitch; loop while it is still below the end.
add.u64 %rd9, %rd6, %rd9;
setp.gt.u64 %p6, %rd21, %rd9;
@%p6 bra $Lt_0_5378;
bra.uni $Lt_0_4866;
$Lt_0_6402:
// Empty-list path: the counter was never initialised by the loop preamble.
mov.s32 %r18, 0;
$Lt_0_4866:
.loc 16 70 0
// dev_nbor[ii + pitch] = number of entries kept (index formed in 32 bits).
add.s32 %r24, %r12, %r7;
cvt.s64.s32 %rd29, %r24;
mul.wide.s32 %rd30, %r24, 4;
add.u64 %rd31, %rd10, %rd30;
st.global.s32 [%rd31+0], %r18;
$Lt_0_4354:
.loc 16 72 0
exit;
$LDWend_kernel_nbor:
} // kernel_nbor
//-----------------------------------------------------------
// kernel_nbor_fast
//
// Same list filter as kernel_nbor, but the cut_form table is first staged
// into two 121-entry shared-memory arrays (484 bytes each) and the row
// stride of that table is the constant 11 instead of an ntypes parameter.
//
// Phase 1: threads with tid.x <= 120 each copy cut_form[tid.x] into shared
//          memory; all threads then meet at bar.sync 0.
// Phase 2: thread ii = start + ctaid.x * ntid.x + tid.x filters its list;
//          threads with ii >= inum exit after the barrier.
//
// Parameters:
//   x_          u64  global base; records are read as 4 x f32, 16-byte stride
//   cut_form    u64  global base; records are read as 2 x f32, 8-byte stride
//   dev_nbor    u64  global base of the s32 output array
//   nbor_pitch  s32  stride, in s32 elements, between successive list slots
//   start       s32  offset added to the global thread index
//   inum        s32  exclusive upper bound on ii
//   dev_ij      u64  global base of the s32 input array
//   form_low    s32  inclusive lower bound for the staged integer value
//   form_high   s32  inclusive upper bound for the staged integer value
//
// Input/output layouts in dev_ij / dev_nbor match kernel_nbor:
//   [ii] = i, [ii + pitch] = count, [ii + 2*pitch + k*pitch] = entry k.
//
// NOTE(review): shared entries with index >= ntid.x are never written, so
// this presumably requires at least 121 threads per CTA - confirm against
// the launch configuration.
//-----------------------------------------------------------
.entry kernel_nbor_fast (
.param .u64 __cudaparm_kernel_nbor_fast_x_,
.param .u64 __cudaparm_kernel_nbor_fast_cut_form,
.param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,
.param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_nbor_fast_start,
.param .s32 __cudaparm_kernel_nbor_fast_inum,
.param .u64 __cudaparm_kernel_nbor_fast_dev_ij,
.param .s32 __cudaparm_kernel_nbor_fast_form_low,
.param .s32 __cudaparm_kernel_nbor_fast_form_high)
{
.reg .u32 %r<28>;
.reg .u64 %rd<42>;
.reg .f32 %f<19>;
.reg .pred %p<9>;
// Two shared tables of 484 bytes = 121 four-byte entries each:
// ..._form120 holds s32 values, ..._cutsq604 holds f32 values.
.shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];
.shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];
.loc 16 84 0
$LDWbegin_kernel_nbor_fast:
// Only threads with tid.x <= 120 take part in staging the table.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 120;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_5122;
.loc 16 90 0
// {%f1,%f2} = cut_form[tid.x];  cutsq604[tid.x] = %f1.
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
cvt.s64.s32 %rd3, %r1;
mul.wide.s32 %rd4, %r1, 4;
ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];
mul.wide.s32 %rd6, %r1, 8;
add.u64 %rd7, %rd5, %rd6;
ld.global.v2.f32 {%f1,%f2}, [%rd7+0];
add.u64 %rd8, %rd4, %rd2;
st.shared.f32 [%rd8+0], %f1;
.loc 16 91 0
// form120[tid.x] = (int) %f2, truncated toward zero.
cvt.rzi.ftz.s32.f32 %r3, %f2;
add.u64 %rd9, %rd4, %rd1;
st.shared.s32 [%rd9+0], %r3;
$Lt_1_5122:
// Re-materialise the shared-array addresses: threads that skipped the
// staging block arrive here without %rd1/%rd2 defined.
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
.loc 16 94 0
// Barrier before any thread reads the shared tables. It sits ahead of the
// ii >= inum early exit, so no thread leaves before reaching it.
bar.sync 0;
// %r9 = ii = start + ctaid.x * ntid.x + tid.x (full 32-bit multiply here).
cvt.s32.u32 %r4, %ctaid.x;
cvt.s32.u32 %r5, %ntid.x;
mul.lo.s32 %r6, %r4, %r5;
ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];
add.s32 %r8, %r7, %r6;
add.s32 %r9, %r8, %r1;
// Out-of-range threads (inum <= ii) jump straight to exit.
ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];
setp.le.s32 %p2, %r10, %r9;
@%p2 bra $Lt_1_5634;
.loc 16 98 0
// %rd13 = &dev_ij[ii];  %r11 = i = dev_ij[ii].
cvt.s64.s32 %rd10, %r9;
ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];
mul.wide.s32 %rd12, %r9, 4;
add.u64 %rd13, %rd11, %rd12;
ld.global.s32 %r11, [%rd13+0];
.loc 16 100 0
// %rd15 = nbor_pitch * 4 (pitch in bytes);  %r13 = dev_ij[ii + pitch] = entry count.
ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];
cvt.s64.s32 %rd14, %r12;
mul.wide.s32 %rd15, %r12, 4;
add.u64 %rd16, %rd15, %rd13;
ld.global.s32 %r13, [%rd16+0];
.loc 16 101 0
// %rd18 = read cursor, starting at &dev_ij[ii + 2*pitch] (%rd17 keeps the start).
add.u64 %rd17, %rd15, %rd16;
mov.s64 %rd18, %rd17;
.loc 16 103 0
// %rd23 = write cursor, starting at &dev_nbor[ii + 2*pitch].
ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];
add.u64 %rd20, %rd10, %rd14;
add.u64 %rd21, %rd14, %rd20;
mul.lo.u64 %rd22, %rd21, 4;
add.u64 %rd23, %rd19, %rd22;
.loc 16 105 0
// {%f3,%f4,%f5,%f6} = x_[i] (one 16-byte vector load).
ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 16;
add.u64 %rd27, %rd24, %rd26;
ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];
// %rd30 = end of list = start + (pitch * count) * 4; the product pitch * count
// is formed in 32 bits before being widened.
cvt.s32.s64 %r14, %rd14;
mul.lo.s32 %r15, %r14, %r13;
cvt.s64.s32 %rd28, %r15;
mul.wide.s32 %rd29, %r15, 4;
add.u64 %rd30, %rd17, %rd29;
// Empty list (start >= end, unsigned compare): skip the loop.
setp.ge.u64 %p3, %rd17, %rd30;
@%p3 bra $Lt_1_7682;
// Loop-invariant setup:
//   %r17 = 11 * (int) x_[i].w  (row offset into the 11 x 11 shared tables)
//   %r18 = form_low,  %r19 = kept-entry counter = 0
cvt.rzi.ftz.s32.f32 %r16, %f6;
mul.lo.s32 %r17, %r16, 11;
ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];
mov.s32 %r19, 0;
$Lt_1_6658:
//<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown
.loc 16 112 0
// %r21 = j = list entry with the top two bits cleared (mask 0x3FFFFFFF).
ld.global.s32 %r20, [%rd18+0];
and.b32 %r21, %r20, 1073741823;
.loc 16 113 0
// {%f7,%f8,%f9,%f10} = x_[j].
cvt.s64.s32 %rd31, %r21;
mul.wide.s32 %rd32, %r21, 16;
add.u64 %rd33, %rd24, %rd32;
ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];
.loc 16 111 0
// %r23 = (int) x_[j].w + 11 * (int) x_[i].w;  %rd35 = byte offset shared by
// both tables;  %r24 = form120[%r23].
// NOTE(review): no range check of %r23 against the 121 entries is visible
// here - presumably the caller guarantees the 4th components are in 0..10.
cvt.rzi.ftz.s32.f32 %r22, %f10;
add.s32 %r23, %r22, %r17;
cvt.s64.s32 %rd34, %r23;
mul.wide.s32 %rd35, %r23, 4;
add.u64 %rd36, %rd35, %rd1;
ld.shared.s32 %r24, [%rd36+0];
// Integer range test: skip unless form_low <= %r24 <= form_high.
setp.lt.s32 %p4, %r24, %r18;
@%p4 bra $Lt_1_7938;
// form_high is re-read on every iteration that gets this far.
ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];
setp.lt.s32 %p5, %r25, %r24;
@%p5 bra $Lt_1_7938;
// %f17 = dx*dx + dy*dy + dz*dz with d = x_[j].xyz - x_[i].xyz.
// %f14 = cutsq604[%r23]; keep the entry only if %f14 > %f17.
sub.ftz.f32 %f11, %f7, %f3;
sub.ftz.f32 %f12, %f8, %f4;
sub.ftz.f32 %f13, %f9, %f5;
add.u64 %rd37, %rd35, %rd2;
ld.shared.f32 %f14, [%rd37+0];
mul.ftz.f32 %f15, %f11, %f11;
fma.rn.ftz.f32 %f16, %f12, %f12, %f15;
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
setp.gt.ftz.f32 %p6, %f14, %f17;
@!%p6 bra $Lt_1_7938;
.loc 16 127 0
// Entry accepted: store the masked index at the write cursor.
st.global.s32 [%rd23+0], %r21;
.loc 16 128 0
// Advance the write cursor by one pitch.
add.u64 %rd23, %rd15, %rd23;
.loc 16 129 0
// Count the kept entry.
add.s32 %r19, %r19, 1;
$Lt_1_7938:
$L_1_4610:
.loc 16 110 0
// Advance the read cursor by one pitch; loop while it is still below the end.
add.u64 %rd18, %rd15, %rd18;
setp.gt.u64 %p7, %rd30, %rd18;
@%p7 bra $Lt_1_6658;
bra.uni $Lt_1_6146;
$Lt_1_7682:
// Empty-list path: the counter was never initialised by the loop preamble.
mov.s32 %r19, 0;
$Lt_1_6146:
.loc 16 133 0
// dev_nbor[ii + pitch] = number of entries kept (index formed in 32 bits).
add.s32 %r26, %r14, %r9;
cvt.s64.s32 %rd38, %r26;
mul.wide.s32 %rd39, %r26, 4;
add.u64 %rd40, %rd19, %rd39;
st.global.s32 [%rd40+0], %r19;
$Lt_1_5634:
.loc 16 135 0
exit;
$LDWend_kernel_nbor_fast:
} // kernel_nbor_fast
Event Timeline
Log In to Comment