/*
 * neighbor_gpu: PTX program text stored as a single C string (the adjacent
 * string literals below are concatenated by the C compiler).
 *
 * The module header declares PTX ISA version 2.3, target sm_20, 64-bit
 * addressing, and one global texture reference, neigh_tex, which the
 * calc_cell_id and calc_neigh_list_cell entries sample with tex.1d.
 *
 * Entry points defined in this string: calc_cell_id,
 * kernel_calc_cell_counts, transpose, calc_neigh_list_cell, kernel_special.
 *
 * NOTE(review): the `.loc` directives and $Lt_ labels suggest this text is
 * compiler output; presumably it is regenerated from a higher-level source
 * rather than edited by hand -- confirm against the build.
 */
const char * neighbor_gpu = 
"	.version 2.3\n"
"	.target sm_20\n"
"	.address_size 64\n"
"	.global .texref neigh_tex;\n"
/*
 * calc_cell_id
 *
 * One thread per index i = tid.x + ctaid.x * ntid.x; threads with
 * i >= nall exit without doing anything.
 *
 * Each remaining thread fetches the 4-float texel at index i from neigh_tex
 * and, for each of its first three components c (paired with
 * boxlo0/boxhi0, boxlo1/boxhi1, boxlo2/boxhi2 respectively), computes
 *
 *     d   = min((boxhi - boxlo) + cell_size, max(-cell_size, c - boxlo))
 *     idx = (u32) trunc((double)(d / cell_size) + 1.0)
 *
 * It then stores  idx0 + ncellx * idx1 + ncellx * ncelly * idx2  to
 * cell_id[i] and stores i to particle_id[i].
 *
 * NOTE(review): the `pos` parameter is declared but never loaded in this
 * kernel; the data is read through neigh_tex only. Presumably the host binds
 * that texture to the same buffer -- confirm against the caller.
 */
"	.entry calc_cell_id (\n"
"		.param .u64 __cudaparm_calc_cell_id_pos,\n"
"		.param .u64 __cudaparm_calc_cell_id_cell_id,\n"
"		.param .u64 __cudaparm_calc_cell_id_particle_id,\n"
"		.param .f32 __cudaparm_calc_cell_id_boxlo0,\n"
"		.param .f32 __cudaparm_calc_cell_id_boxlo1,\n"
"		.param .f32 __cudaparm_calc_cell_id_boxlo2,\n"
"		.param .f32 __cudaparm_calc_cell_id_boxhi0,\n"
"		.param .f32 __cudaparm_calc_cell_id_boxhi1,\n"
"		.param .f32 __cudaparm_calc_cell_id_boxhi2,\n"
"		.param .f32 __cudaparm_calc_cell_id_cell_size,\n"
"		.param .s32 __cudaparm_calc_cell_id_ncellx,\n"
"		.param .s32 __cudaparm_calc_cell_id_ncelly,\n"
"		.param .s32 __cudaparm_calc_cell_id_nall)\n"
"	{\n"
"	.reg .u32 %r<25>;\n"
"	.reg .u64 %rd<8>;\n"
"	.reg .f32 %f<35>;\n"
"	.reg .f64 %fd<11>;\n"
"	.reg .pred %p<3>;\n"
"	.loc	16	29	0\n"
"$LDWbegin_calc_cell_id:\n"
/* %r5 = i = tid.x + ctaid.x * ntid.x; branch to exit when nall <= i. */
"	mov.u32 	%r1, %tid.x;\n"
"	mov.u32 	%r2, %ctaid.x;\n"
"	mov.u32 	%r3, %ntid.x;\n"
"	mul.lo.u32 	%r4, %r2, %r3;\n"
"	add.u32 	%r5, %r1, %r4;\n"
"	ld.param.s32 	%r6, [__cudaparm_calc_cell_id_nall];\n"
"	setp.le.s32 	%p1, %r6, %r5;\n"
"	@%p1 bra 	$Lt_0_1026;\n"
"	.loc	16	33	0\n"
/* Texture fetch at coordinate (i,0,0,0); components 0..2 are kept in
 * %f5..%f7, the fourth component (%f4) is not used. */
"	mov.u32 	%r7, %r5;\n"
"	mov.s32 	%r8, 0;\n"
"	mov.u32 	%r9, %r8;\n"
"	mov.s32 	%r10, 0;\n"
"	mov.u32 	%r11, %r10;\n"
"	mov.s32 	%r12, 0;\n"
"	mov.u32 	%r13, %r12;\n"
"	tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}];\n"
"	mov.f32 	%f5, %f1;\n"
"	mov.f32 	%f6, %f2;\n"
"	mov.f32 	%f7, %f3;\n"
"	.loc	16	46	0\n"
/* Scalar parameters: %f8 = cell_size, %f9 = -cell_size (lower clamp bound),
 * %f10/%f12/%f11 = boxlo0/boxlo1/boxlo2, %r14 = ncellx, %r15 = ncelly. */
"	ld.param.f32 	%f8, [__cudaparm_calc_cell_id_cell_size];\n"
"	neg.ftz.f32 	%f9, %f8;\n"
"	ld.param.f32 	%f10, [__cudaparm_calc_cell_id_boxlo0];\n"
"	ld.param.f32 	%f11, [__cudaparm_calc_cell_id_boxlo2];\n"
"	ld.param.f32 	%f12, [__cudaparm_calc_cell_id_boxlo1];\n"
"	ld.param.s32 	%r14, [__cudaparm_calc_cell_id_ncellx];\n"
"	ld.param.s32 	%r15, [__cudaparm_calc_cell_id_ncelly];\n"
/* Component 2: clamp (c2 - boxlo2), divide by cell_size (approximate
 * divide), add 1.0 in f64 (0d3ff0000000000000), truncate to u32 -> %r16;
 * %r18 = ncelly * ncellx * %r16. */
"	ld.param.f32 	%f13, [__cudaparm_calc_cell_id_boxhi2];\n"
"	sub.ftz.f32 	%f14, %f13, %f11;\n"
"	add.ftz.f32 	%f15, %f8, %f14;\n"
"	sub.ftz.f32 	%f16, %f7, %f11;\n"
"	max.ftz.f32 	%f17, %f9, %f16;\n"
"	min.ftz.f32 	%f18, %f15, %f17;\n"
"	div.approx.ftz.f32 	%f19, %f18, %f8;\n"
"	cvt.ftz.f64.f32 	%fd1, %f19;\n"
"	mov.f64 	%fd2, 0d3ff0000000000000;	\n"
"	add.f64 	%fd3, %fd1, %fd2;\n"
"	cvt.rzi.u32.f64 	%r16, %fd3;\n"
"	mul.lo.u32 	%r17, %r14, %r16;\n"
"	mul.lo.u32 	%r18, %r15, %r17;\n"
/* Component 1: same clamp/divide/truncate -> %r19;
 * %r21 = %r18 + ncellx * %r19. */
"	ld.param.f32 	%f20, [__cudaparm_calc_cell_id_boxhi1];\n"
"	sub.ftz.f32 	%f21, %f20, %f12;\n"
"	add.ftz.f32 	%f22, %f8, %f21;\n"
"	sub.ftz.f32 	%f23, %f6, %f12;\n"
"	max.ftz.f32 	%f24, %f9, %f23;\n"
"	min.ftz.f32 	%f25, %f22, %f24;\n"
"	div.approx.ftz.f32 	%f26, %f25, %f8;\n"
"	cvt.ftz.f64.f32 	%fd4, %f26;\n"
"	mov.f64 	%fd5, 0d3ff0000000000000;	\n"
"	add.f64 	%fd6, %fd4, %fd5;\n"
"	cvt.rzi.u32.f64 	%r19, %fd6;\n"
"	mul.lo.u32 	%r20, %r14, %r19;\n"
"	add.u32 	%r21, %r18, %r20;\n"
/* Component 0: same clamp/divide/truncate -> %r22;
 * %r23 = %r21 + %r22 is the final value written to cell_id[i]. */
"	ld.param.f32 	%f27, [__cudaparm_calc_cell_id_boxhi0];\n"
"	sub.ftz.f32 	%f28, %f27, %f10;\n"
"	add.ftz.f32 	%f29, %f8, %f28;\n"
"	sub.ftz.f32 	%f30, %f5, %f10;\n"
"	max.ftz.f32 	%f31, %f9, %f30;\n"
"	min.ftz.f32 	%f32, %f29, %f31;\n"
"	div.approx.ftz.f32 	%f33, %f32, %f8;\n"
"	cvt.ftz.f64.f32 	%fd7, %f33;\n"
"	mov.f64 	%fd8, 0d3ff0000000000000;	\n"
"	add.f64 	%fd9, %fd7, %fd8;\n"
"	cvt.rzi.u32.f64 	%r22, %fd9;\n"
"	add.u32 	%r23, %r21, %r22;\n"
"	.loc	16	50	0\n"
/* cell_id[i] = %r23 (byte offset i * 4 is shared by both stores). */
"	cvt.s64.s32 	%rd1, %r5;\n"
"	mul.wide.s32 	%rd2, %r5, 4;\n"
"	ld.param.u64 	%rd3, [__cudaparm_calc_cell_id_cell_id];\n"
"	add.u64 	%rd4, %rd3, %rd2;\n"
"	st.global.u32 	[%rd4+0], %r23;\n"
"	.loc	16	51	0\n"
/* particle_id[i] = i. */
"	ld.param.u64 	%rd5, [__cudaparm_calc_cell_id_particle_id];\n"
"	add.u64 	%rd6, %rd5, %rd2;\n"
"	st.global.s32 	[%rd6+0], %r5;\n"
"$Lt_0_1026:\n"
"	.loc	16	53	0\n"
"	exit;\n"
"$LDWend_calc_cell_id:\n"
"	}\n"
/*
 * kernel_calc_cell_counts
 *
 * One thread per index i = tid.x + ctaid.x * ntid.x; threads with
 * i >= nall exit. With id = cell_id[i] the thread performs:
 *
 *   i == 0        : cell_counts[0 .. id]        = 0
 *                   (skipped when id + 1 <= 0)
 *   i == nall - 1 : cell_counts[id+1 .. ncell]  = nall
 *                   (skipped when id + 1 > ncell)
 *   i > 0         : prev = cell_id[i-1]; when prev != id,
 *                   cell_counts[prev+1 .. id]   = i
 *                   (skipped when prev + 1 > id)
 *
 * All ranges above are inclusive. The comparisons are signed (s32) even
 * though cell_id is loaded with ld.global.u32.
 *
 * NOTE(review): presumably cell_id is sorted ascending before this runs, so
 * that cell_counts ends up holding the first index of every cell -- verify
 * against the caller.
 */
"	.entry kernel_calc_cell_counts (\n"
"		.param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,\n"
"		.param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,\n"
"		.param .s32 __cudaparm_kernel_calc_cell_counts_nall,\n"
"		.param .s32 __cudaparm_kernel_calc_cell_counts_ncell)\n"
"	{\n"
"	.reg .u32 %r<33>;\n"
"	.reg .u64 %rd<15>;\n"
"	.reg .pred %p<13>;\n"
"	.loc	16	56	0\n"
"$LDWbegin_kernel_calc_cell_counts:\n"
/* %r5 = i; %p1 = (nall > i). Threads with %p1 false jump to exit. */
"	mov.u32 	%r1, %ctaid.x;\n"
"	mov.u32 	%r2, %ntid.x;\n"
"	mul.lo.u32 	%r3, %r1, %r2;\n"
"	mov.u32 	%r4, %tid.x;\n"
"	add.u32 	%r5, %r4, %r3;\n"
"	ld.param.s32 	%r6, [__cudaparm_kernel_calc_cell_counts_nall];\n"
"	setp.gt.s32 	%p1, %r6, %r5;\n"
"	@!%p1 bra 	$Lt_1_7426;\n"
"	.loc	16	59	0\n"
/* %rd4 = &cell_id[i]; %r7 = id = cell_id[i]. */
"	ld.param.u64 	%rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];\n"
"	cvt.s64.s32 	%rd2, %r5;\n"
"	mul.wide.s32 	%rd3, %r5, 4;\n"
"	add.u64 	%rd4, %rd1, %rd3;\n"
"	ld.global.u32 	%r7, [%rd4+0];\n"
/* Case i == 0: zero cell_counts[0 .. id]. %r9 = id + 1 is the trip count. */
"	mov.u32 	%r8, 0;\n"
"	setp.ne.s32 	%p2, %r5, %r8;\n"
"	@%p2 bra 	$Lt_1_7938;\n"
"	add.s32 	%r9, %r7, 1;\n"
"	mov.u32 	%r10, 0;\n"
"	setp.le.s32 	%p3, %r9, %r10;\n"
"	@%p3 bra 	$Lt_1_8450;\n"
"	mov.s32 	%r11, %r9;\n"
"	ld.param.u64 	%rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
"	mov.s32 	%r12, 0;\n"
"	mov.s32 	%r13, %r11;\n"
"$Lt_1_8962:\n"
"	.loc	16	64	0\n"
"	mov.s32 	%r14, 0;\n"
"	st.global.s32 	[%rd5+0], %r14;\n"
"	add.s32 	%r12, %r12, 1;\n"
"	add.u64 	%rd5, %rd5, 4;\n"
"	setp.ne.s32 	%p4, %r9, %r12;\n"
"	@%p4 bra 	$Lt_1_8962;\n"
"$Lt_1_8450:\n"
"$Lt_1_7938:\n"
/* Case i == nall - 1: write nall to cell_counts[id+1 .. ncell].
 * The loop counter %r16 runs from id + 1 until it equals ncell + 1 (%r19). */
"	sub.s32 	%r15, %r6, 1;\n"
"	setp.ne.s32 	%p5, %r5, %r15;\n"
"	@%p5 bra 	$Lt_1_9474;\n"
"	.loc	16	67	0\n"
"	add.s32 	%r9, %r7, 1;\n"
"	mov.s32 	%r16, %r9;\n"
"	ld.param.s32 	%r17, [__cudaparm_kernel_calc_cell_counts_ncell];\n"
"	setp.gt.s32 	%p6, %r9, %r17;\n"
"	@%p6 bra 	$Lt_1_9986;\n"
"	sub.s32 	%r18, %r17, %r7;\n"
"	add.s32 	%r19, %r17, 1;\n"
"	ld.param.u64 	%rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
"	cvt.s64.s32 	%rd7, %r9;\n"
"	mul.wide.s32 	%rd8, %r9, 4;\n"
"	add.u64 	%rd9, %rd6, %rd8;\n"
"	mov.s32 	%r20, %r18;\n"
"$Lt_1_10498:\n"
"	.loc	16	68	0\n"
"	st.global.s32 	[%rd9+0], %r6;\n"
"	add.s32 	%r16, %r16, 1;\n"
"	add.u64 	%rd9, %rd9, 4;\n"
"	setp.ne.s32 	%p7, %r19, %r16;\n"
"	@%p7 bra 	$Lt_1_10498;\n"
"$Lt_1_9986:\n"
"$Lt_1_9474:\n"
/* Case (i < nall) && (i > 0): %r25 is non-zero only when both hold
 * (%p1 is already known true on this path). */
"	selp.s32 	%r21, 1, 0, %p1;\n"
"	mov.s32 	%r22, 0;\n"
"	set.gt.u32.s32 	%r23, %r5, %r22;\n"
"	neg.s32 	%r24, %r23;\n"
"	and.b32 	%r25, %r21, %r24;\n"
"	mov.u32 	%r26, 0;\n"
"	setp.eq.s32 	%p8, %r25, %r26;\n"
"	@%p8 bra 	$Lt_1_11010;\n"
"	.loc	16	72	0\n"
/* %r27 = prev = cell_id[i-1]; nothing to do when prev == id. */
"	ld.global.u32 	%r27, [%rd4+-4];\n"
"	setp.eq.s32 	%p9, %r7, %r27;\n"
"	@%p9 bra 	$Lt_1_11522;\n"
"	.loc	16	74	0\n"
/* Write i to cell_counts[prev+1 .. id]; the counter %r29 runs from
 * prev + 1 until it equals id + 1 (%r9). */
"	add.s32 	%r28, %r27, 1;\n"
"	mov.s32 	%r29, %r28;\n"
"	setp.gt.s32 	%p10, %r28, %r7;\n"
"	@%p10 bra 	$Lt_1_12034;\n"
"	sub.s32 	%r30, %r7, %r27;\n"
"	add.s32 	%r9, %r7, 1;\n"
"	ld.param.u64 	%rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
"	cvt.s64.s32 	%rd11, %r28;\n"
"	mul.wide.s32 	%rd12, %r28, 4;\n"
"	add.u64 	%rd13, %rd10, %rd12;\n"
"	mov.s32 	%r31, %r30;\n"
"$Lt_1_12546:\n"
"	.loc	16	75	0\n"
"	st.global.s32 	[%rd13+0], %r5;\n"
"	add.s32 	%r29, %r29, 1;\n"
"	add.u64 	%rd13, %rd13, 4;\n"
"	setp.ne.s32 	%p11, %r9, %r29;\n"
"	@%p11 bra 	$Lt_1_12546;\n"
"$Lt_1_12034:\n"
"$Lt_1_11522:\n"
"$Lt_1_11010:\n"
"$Lt_1_7426:\n"
"	.loc	16	79	0\n"
"	exit;\n"
"$LDWend_kernel_calc_cell_counts:\n"
"	}\n"
/*
 * transpose
 *
 * Tiled transpose of 32-bit integers from `in` to `out` through shared
 * memory. Each block handles an 8 x 8 tile whose origin is
 * (ctaid.x * 8, ctaid.y * 8); the shared tile is 288 bytes, addressed as
 * rows of 9 words (word index = row * 9 + column).
 *
 *   Load : column = ctaid.x*8 + tid.x, row = ctaid.y*8 + tid.y.
 *          When row < rows_in and column < columns_in (unsigned compares),
 *          tile[tid.y][tid.x] = in[row * columns_in + column].
 *   bar.sync 0 (reached by every thread of the block).
 *   Store: c = ctaid.x*8 + tid.y, r = ctaid.y*8 + tid.x.
 *          When r < rows_in and c < columns_in,
 *          out[c * rows_in + r] = tile[tid.x][tid.y].
 *
 * Elements are staged in shared memory as raw s32 words. They must not be
 * converted through f32 on the way: f32 has a 24-bit significand, so any
 * integer with magnitude above 2^24 would be rounded by such a round trip.
 *
 * NOTE(review): this text appears to be compiler output (.loc directives);
 * presumably the generating source needs the equivalent change (integer
 * shared tile) or a regeneration will undo it -- confirm against the build.
 */
"	.entry transpose (\n"
"		.param .u64 __cudaparm_transpose_out,\n"
"		.param .u64 __cudaparm_transpose_in,\n"
"		.param .s32 __cudaparm_transpose_columns_in,\n"
"		.param .s32 __cudaparm_transpose_rows_in)\n"
"	{\n"
"	.reg .u32 %r<32>;\n"
"	.reg .u64 %rd<23>;\n"
"	.reg .pred %p<4>;\n"
"	.shared .align 4 .b8 __cuda___cuda_local_var_32571_32_non_const_block112[288];\n"
"	.loc	16	86	0\n"
"$LDWbegin_transpose:\n"
/* %r2 / %r4 = tile origin; %r6 = input column, %r8 = input row. */
"	mov.u32 	%r1, %ctaid.x;\n"
"	mul.lo.u32 	%r2, %r1, 8;\n"
"	mov.u32 	%r3, %ctaid.y;\n"
"	mul.lo.u32 	%r4, %r3, 8;\n"
"	mov.u32 	%r5, %tid.x;\n"
"	add.u32 	%r6, %r2, %r5;\n"
"	mov.u32 	%r7, %tid.y;\n"
"	add.u32 	%r8, %r4, %r7;\n"
"	ld.param.s32 	%r9, [__cudaparm_transpose_rows_in];\n"
"	ld.param.s32 	%r10, [__cudaparm_transpose_columns_in];\n"
/* Skip the load when the element lies outside the input array. */
"	set.gt.u32.u32 	%r11, %r9, %r8;\n"
"	neg.s32 	%r12, %r11;\n"
"	set.gt.u32.u32 	%r13, %r10, %r6;\n"
"	neg.s32 	%r14, %r13;\n"
"	and.b32 	%r15, %r12, %r14;\n"
"	mov.u32 	%r16, 0;\n"
"	setp.eq.s32 	%p1, %r15, %r16;\n"
"	@%p1 bra 	$Lt_2_2306;\n"
"	.loc	16	98	0\n"
/* tile[tid.y][tid.x] = in[row * columns_in + column], kept as s32. */
"	mov.u64 	%rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n"
"	ld.param.u64 	%rd2, [__cudaparm_transpose_in];\n"
"	mul.lo.u32 	%r17, %r10, %r8;\n"
"	add.u32 	%r18, %r6, %r17;\n"
"	cvt.u64.u32 	%rd3, %r18;\n"
"	mul.wide.u32 	%rd4, %r18, 4;\n"
"	add.u64 	%rd5, %rd2, %rd4;\n"
"	ld.global.s32 	%r19, [%rd5+0];\n"
"	cvt.u64.u32 	%rd6, %r5;\n"
"	cvt.u64.u32 	%rd7, %r7;\n"
"	mul.wide.u32 	%rd8, %r7, 9;\n"
"	add.u64 	%rd9, %rd6, %rd8;\n"
"	mul.lo.u64 	%rd10, %rd9, 4;\n"
"	add.u64 	%rd11, %rd1, %rd10;\n"
"	st.shared.s32 	[%rd11+0], %r19;\n"
"$Lt_2_2306:\n"
/* %rd1 is re-materialised here because the path above may be skipped. */
"	mov.u64 	%rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n"
"	.loc	16	100	0\n"
"	bar.sync 	0;\n"
/* Output coordinates use swapped thread indices: %r20 = ctaid.x*8 + tid.y,
 * %r21 = ctaid.y*8 + tid.x; skip the store when outside the array. */
"	add.u32 	%r20, %r2, %r7;\n"
"	add.u32 	%r21, %r4, %r5;\n"
"	set.gt.u32.u32 	%r22, %r9, %r21;\n"
"	neg.s32 	%r23, %r22;\n"
"	set.gt.u32.u32 	%r24, %r10, %r20;\n"
"	neg.s32 	%r25, %r24;\n"
"	and.b32 	%r26, %r23, %r25;\n"
"	mov.u32 	%r27, 0;\n"
"	setp.eq.s32 	%p2, %r26, %r27;\n"
"	@%p2 bra 	$Lt_2_2818;\n"
"	.loc	16	105	0\n"
/* out[%r20 * rows_in + %r21] = tile[tid.x][tid.y], read back as s32. */
"	cvt.u64.u32 	%rd12, %r7;\n"
"	cvt.u64.u32 	%rd13, %r5;\n"
"	mul.wide.u32 	%rd14, %r5, 9;\n"
"	add.u64 	%rd15, %rd12, %rd14;\n"
"	mul.lo.u64 	%rd16, %rd15, 4;\n"
"	add.u64 	%rd17, %rd1, %rd16;\n"
"	ld.shared.s32 	%r28, [%rd17+0];\n"
"	ld.param.u64 	%rd18, [__cudaparm_transpose_out];\n"
"	mul.lo.u32 	%r29, %r9, %r20;\n"
"	add.u32 	%r30, %r21, %r29;\n"
"	cvt.u64.u32 	%rd19, %r30;\n"
"	mul.wide.u32 	%rd20, %r30, 4;\n"
"	add.u64 	%rd21, %rd18, %rd20;\n"
"	st.global.s32 	[%rd21+0], %r28;\n"
"$Lt_2_2818:\n"
"	.loc	16	106	0\n"
"	exit;\n"
"$LDWend_transpose:\n"
"	}\n"
/*
 * calc_neigh_list_cell
 *
 * Block -> cell mapping:
 *   ix = ctaid.x, iy = ctaid.y % ncelly, iz = ctaid.y / ncelly,
 *   cell = ix + ncellx * iy + ncellx * ncelly * iz,
 *   begin = cell_counts[cell], end = cell_counts[cell + 1].
 *
 * The block makes ceil((end - begin) / ntid.x) passes (computed in f32 with
 * an approximate divide); it exits immediately if that is <= 0. In pass k,
 * thread t handles slot = begin + t + k * ntid.x:
 *
 *   pid = cell_particle_id[slot] when slot < end, otherwise pid = nall.
 *   When pid < nt, the texel at index pid is fetched from neigh_tex and its
 *   first three components are kept.
 *
 *   Output pointers (element = 4 bytes):
 *     pid <  inum : nbor_list[pid] = pid;
 *                   count slot   = &nbor_list[inum + pid];
 *                   entry cursor = &nbor_list[2 * inum + pid * t_per_atom];
 *                   stride       = inum * t_per_atom - t_per_atom elements.
 *     pid >= inum : count slot   = &host_numj[pid - inum];
 *                   entry cursor = &host_nbor_list[(pid - inum) * neigh_bin_size];
 *                   stride       = 0.
 *
 *   The thread block then visits every cell (x, y, z) with
 *     x in [max(ix-1,0), min(ix+1, ncellx-1)],
 *     y in [max(iy-1,0), min(iy+1, ncelly-1)],
 *     z in [max(iz-1,0), min(iz+1, ncellz-1)].
 *   The members of each visited cell are staged in shared memory in chunks
 *   of at most 128: thread t (t < chunk size) stores the member id in
 *   cell_list_sh[t] and the first three components of its texel in
 *   pos_sh[t] (16-byte slots). Between two bar.sync barriers, every thread
 *   with pid < nt computes the squared distance r2 to each staged entry and
 *   accepts it when  cell_size * cell_size > r2  and
 *   (double) r2 > 0d3ee4f8b588e368f1 (approximately 1e-5). For an accepted
 *   entry the running count n is incremented; if n <= neigh_bin_size the
 *   staged id is written at the entry cursor, which then advances by one
 *   element, plus `stride` elements when (n & (t_per_atom - 1)) == 0.
 *
 *   Finally, when pid < nt, n is written to the count slot.
 *
 * NOTE(review): the x_ parameter is declared but never loaded; coordinates
 * come from neigh_tex. The cursor logic masks with t_per_atom - 1, which
 * presumably assumes t_per_atom is a power of two -- confirm with the host.
 * The loop bounds that enclose the barriers depend only on ctaid and
 * cell_counts values, i.e. they are the same for all threads of a block.
 */
"	.entry calc_neigh_list_cell (\n"
"		.param .u64 __cudaparm_calc_neigh_list_cell_x_,\n"
"		.param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,\n"
"		.param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,\n"
"		.param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,\n"
"		.param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,\n"
"		.param .u64 __cudaparm_calc_neigh_list_cell_host_numj,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,\n"
"		.param .f32 __cudaparm_calc_neigh_list_cell_cell_size,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_ncellx,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_ncelly,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_ncellz,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_inum,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_nt,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_nall,\n"
"		.param .s32 __cudaparm_calc_neigh_list_cell_t_per_atom)\n"
"	{\n"
"	.reg .u32 %r<118>;\n"
"	.reg .u64 %rd<52>;\n"
"	.reg .f32 %f<41>;\n"
"	.reg .f64 %fd<4>;\n"
"	.reg .pred %p<23>;\n"
/* Shared staging buffers: 2048 bytes (128 slots of 16 bytes) and
 * 512 bytes (128 slots of 4 bytes). */
"	.shared .align 16 .b8 __cuda___cuda_local_var_32609_34_non_const_pos_sh496[2048];\n"
"	.shared .align 4 .b8 __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544[512];\n"
"	.loc	16	116	0\n"
"$LDWbegin_calc_neigh_list_cell:\n"
"	.loc	16	128	0\n"
/* %r3 = iy, %r4 = iz, %r9 = ix; %r11 = linear cell index;
 * %r12 = begin = cell_counts[cell], %r13 = end = cell_counts[cell + 1]. */
"	ld.param.s32 	%r1, [__cudaparm_calc_neigh_list_cell_ncelly];\n"
"	mov.u32 	%r2, %ctaid.y;\n"
"	rem.u32 	%r3, %r2, %r1;\n"
"	div.u32 	%r4, %r2, %r1;\n"
"	ld.param.s32 	%r5, [__cudaparm_calc_neigh_list_cell_ncellx];\n"
"	mul.lo.s32 	%r6, %r5, %r3;\n"
"	mul.lo.s32 	%r7, %r5, %r4;\n"
"	mul.lo.s32 	%r8, %r7, %r1;\n"
"	cvt.s32.u32 	%r9, %ctaid.x;\n"
"	ld.param.u64 	%rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];\n"
"	add.s32 	%r10, %r6, %r8;\n"
"	add.s32 	%r11, %r9, %r10;\n"
"	cvt.s64.s32 	%rd2, %r11;\n"
"	mul.wide.s32 	%rd3, %r11, 4;\n"
"	add.u64 	%rd4, %rd1, %rd3;\n"
"	ldu.global.s32 	%r12, [%rd4+0];\n"
"	.loc	16	129	0\n"
"	ldu.global.s32 	%r13, [%rd4+4];\n"
"	.loc	16	137	0\n"
/* %r16 = number of passes = ceil((end - begin) / ntid.x); exit if <= 0. */
"	sub.s32 	%r14, %r13, %r12;\n"
"	mov.u32 	%r15, %ntid.x;\n"
"	cvt.rn.f32.u32 	%f1, %r15;\n"
"	cvt.rn.f32.s32 	%f2, %r14;\n"
"	div.approx.ftz.f32 	%f3, %f2, %f1;\n"
"	cvt.rpi.ftz.f32.f32 	%f4, %f3;\n"
"	cvt.rzi.ftz.s32.f32 	%r16, %f4;\n"
"	mov.u32 	%r17, 0;\n"
"	setp.le.s32 	%p1, %r16, %r17;\n"
"	@%p1 bra 	$Lt_3_14082;\n"
/* Clamped neighbour-cell ranges:
 *   y: [%r20, %r23], z: [%r42, %r27], x: [%r30, %r33].
 * %r35 = tid.x, %r36 = begin + tid.x, %r37 / %r40 = per-pass offsets,
 * %r38 = inum, %r43 = nt, %r44 = nall, %r45 = pass counter,
 * %rd6 / %rd7 = addresses of the two shared buffers. */
"	sub.s32 	%r18, %r3, 1;\n"
"	mov.s32 	%r19, 0;\n"
"	max.s32 	%r20, %r18, %r19;\n"
"	sub.s32 	%r21, %r1, 1;\n"
"	add.s32 	%r22, %r3, 1;\n"
"	min.s32 	%r23, %r21, %r22;\n"
"	ld.param.s32 	%r24, [__cudaparm_calc_neigh_list_cell_ncellz];\n"
"	sub.s32 	%r25, %r24, 1;\n"
"	add.s32 	%r26, %r4, 1;\n"
"	min.s32 	%r27, %r25, %r26;\n"
"	sub.s32 	%r28, %r9, 1;\n"
"	mov.s32 	%r29, 0;\n"
"	max.s32 	%r30, %r28, %r29;\n"
"	add.s32 	%r31, %r9, 1;\n"
"	sub.s32 	%r32, %r5, 1;\n"
"	min.s32 	%r33, %r31, %r32;\n"
"	mov.s32 	%r34, %r16;\n"
"	cvt.s32.u32 	%r35, %tid.x;\n"
"	add.s32 	%r36, %r12, %r35;\n"
"	mov.u32 	%r37, 0;\n"
"	ld.param.s32 	%r38, [__cudaparm_calc_neigh_list_cell_inum];\n"
"	cvt.s64.s32 	%rd5, %r38;\n"
"	sub.s32 	%r39, %r4, 1;\n"
"	mov.s32 	%r40, %r36;\n"
"	mov.s32 	%r41, 0;\n"
"	max.s32 	%r42, %r39, %r41;\n"
"	setp.ge.s32 	%p2, %r27, %r42;\n"
"	ld.param.s32 	%r43, [__cudaparm_calc_neigh_list_cell_nt];\n"
"	ld.param.s32 	%r44, [__cudaparm_calc_neigh_list_cell_nall];\n"
"	mov.s32 	%r45, 0;\n"
"	mov.u64 	%rd6, __cuda___cuda_local_var_32609_34_non_const_pos_sh496;\n"
"	mov.u64 	%rd7, __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544;\n"
"	mov.s32 	%r46, %r34;\n"
/* ---- Pass loop (one iteration per pass) ---- */
"$Lt_3_14594:\n"
"	.loc	16	140	0\n"
/* %r47 = pid: defaults to nall, replaced by cell_particle_id[slot] when
 * the slot (%r40) is below end. */
"	mov.s32 	%r47, %r44;\n"
"	setp.ge.s32 	%p3, %r40, %r13;\n"
"	@%p3 bra 	$Lt_3_14850;\n"
"	.loc	16	146	0\n"
"	ld.param.u64 	%rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
"	add.u32 	%r48, %r36, %r37;\n"
"	cvt.s64.s32 	%rd9, %r48;\n"
"	mul.wide.s32 	%rd10, %r48, 4;\n"
"	add.u64 	%rd11, %rd8, %rd10;\n"
"	ld.global.s32 	%r47, [%rd11+0];\n"
"$Lt_3_14850:\n"
/* %p4 = (pid < nt). Only then is the texel for pid fetched; its first
 * three components end up in %f12, %f13, %f14. */
"	setp.lt.s32 	%p4, %r47, %r43;\n"
"	@!%p4 bra 	$Lt_3_15362;\n"
"	.loc	16	149	0\n"
"	mov.u32 	%r49, %r47;\n"
"	mov.s32 	%r50, 0;\n"
"	mov.u32 	%r51, %r50;\n"
"	mov.s32 	%r52, 0;\n"
"	mov.u32 	%r53, %r52;\n"
"	mov.s32 	%r54, 0;\n"
"	mov.u32 	%r55, %r54;\n"
"	tex.1d.v4.f32.s32 {%f5,%f6,%f7,%f8},[neigh_tex,{%r49,%r51,%r53,%r55}];\n"
"	mov.f32 	%f9, %f5;\n"
"	mov.f32 	%f10, %f6;\n"
"	mov.f32 	%f11, %f7;\n"
"	mov.f32 	%f12, %f9;\n"
"	mov.f32 	%f13, %f10;\n"
"	mov.f32 	%f14, %f11;\n"
"$Lt_3_15362:\n"
/* Select output pointers: %rd18 = count slot, %rd22 = entry cursor,
 * %r60 = stride in elements. */
"	cvt.s64.s32 	%rd12, %r47;\n"
"	mul.wide.s32 	%rd13, %r47, 4;\n"
"	setp.ge.s32 	%p5, %r47, %r38;\n"
"	@%p5 bra 	$Lt_3_16130;\n"
"	.loc	16	153	0\n"
/* pid < inum: pointers into nbor_list. */
"	ld.param.u64 	%rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];\n"
"	add.u64 	%rd15, %rd12, %rd5;\n"
"	mul.lo.u64 	%rd16, %rd15, 4;\n"
"	add.u64 	%rd17, %rd14, %rd16;\n"
"	mov.s64 	%rd18, %rd17;\n"
"	.loc	16	154	0\n"
"	ld.param.s32 	%r56, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n"
"	sub.s32 	%r57, %r56, 1;\n"
"	mul.lo.s32 	%r58, %r47, %r57;\n"
"	cvt.s64.s32 	%rd19, %r58;\n"
"	add.u64 	%rd20, %rd19, %rd5;\n"
"	mul.lo.u64 	%rd21, %rd20, 4;\n"
"	add.u64 	%rd22, %rd17, %rd21;\n"
"	.loc	16	155	0\n"
"	mul.lo.s32 	%r59, %r56, %r38;\n"
"	sub.s32 	%r60, %r59, %r56;\n"
"	.loc	16	156	0\n"
/* nbor_list[pid] = pid. */
"	add.u64 	%rd23, %rd13, %rd14;\n"
"	st.global.s32 	[%rd23+0], %r47;\n"
"	bra.uni 	$Lt_3_15874;\n"
"$Lt_3_16130:\n"
"	.loc	16	159	0\n"
/* pid >= inum: pointers into host_numj / host_nbor_list, stride 0. */
"	ld.param.u64 	%rd24, [__cudaparm_calc_neigh_list_cell_host_numj];\n"
"	add.u64 	%rd25, %rd24, %rd13;\n"
"	mul.lo.u64 	%rd26, %rd5, 4;\n"
"	sub.u64 	%rd18, %rd25, %rd26;\n"
"	.loc	16	160	0\n"
"	ld.param.u64 	%rd27, [__cudaparm_calc_neigh_list_cell_host_nbor_list];\n"
"	ld.param.s32 	%r61, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
"	sub.s32 	%r62, %r47, %r38;\n"
"	mul.lo.s32 	%r63, %r61, %r62;\n"
"	cvt.s64.s32 	%rd28, %r63;\n"
"	mul.wide.s32 	%rd29, %r63, 4;\n"
"	add.u64 	%rd22, %rd27, %rd29;\n"
"	mov.s32 	%r60, 0;\n"
"$Lt_3_15874:\n"
"	.loc	16	165	0\n"
/* ---- z loop: %r64 from %r42 to %r27 inclusive; %r68 = accepted count n,
 * reset to 0 for this pass. ---- */
"	mov.s32 	%r64, %r42;\n"
"	@!%p2 bra 	$Lt_3_24066;\n"
"	sub.s32 	%r65, %r27, %r42;\n"
"	add.s32 	%r66, %r65, 1;\n"
"	setp.le.s32 	%p6, %r20, %r23;\n"
"	add.s32 	%r67, %r27, 1;\n"
"	mov.s32 	%r68, 0;\n"
"	mov.s32 	%r69, %r66;\n"
"$Lt_3_16898:\n"
"	.loc	16	166	0\n"
/* ---- y loop: %r70 from %r20 to %r23 inclusive ---- */
"	mov.s32 	%r70, %r20;\n"
"	@!%p6 bra 	$Lt_3_17154;\n"
"	sub.s32 	%r71, %r23, %r20;\n"
"	add.s32 	%r72, %r71, 1;\n"
"	setp.ge.s32 	%p7, %r33, %r30;\n"
"	add.s32 	%r73, %r23, 1;\n"
"	mov.s32 	%r74, %r72;\n"
"$Lt_3_17666:\n"
/* ---- x loop: linear cell index %r82 runs from the cell at x = %r30 up to
 * (but excluding) %r83, the cell at x = %r33 + 1; %rd32 = &cell_counts[%r82]. */
"	@!%p7 bra 	$Lt_3_17922;\n"
"	sub.s32 	%r75, %r33, %r30;\n"
"	add.s32 	%r76, %r75, 1;\n"
"	mul.lo.s32 	%r77, %r70, %r5;\n"
"	mul.lo.s32 	%r78, %r64, %r5;\n"
"	mul.lo.s32 	%r79, %r78, %r1;\n"
"	add.s32 	%r80, %r33, 1;\n"
"	add.s32 	%r81, %r77, %r79;\n"
"	add.s32 	%r82, %r81, %r30;\n"
"	add.s32 	%r83, %r80, %r81;\n"
"	cvt.s64.s32 	%rd30, %r82;\n"
"	mul.wide.s32 	%rd31, %r82, 4;\n"
"	add.u64 	%rd32, %rd1, %rd31;\n"
"	mov.s32 	%r84, %r76;\n"
"$Lt_3_18434:\n"
"	.loc	16	171	0\n"
/* %r85 / %r86 = first / one-past-last member index of the visited cell;
 * %r87 = member count; %r88 = ceil(count / 128.0) chunks (0f43000000 is
 * 128.0f). Skip the cell when there are no chunks. */
"	ld.global.s32 	%r85, [%rd32+0];\n"
"	.loc	16	172	0\n"
"	ld.global.s32 	%r86, [%rd32+4];\n"
"	.loc	16	176	0\n"
"	sub.s32 	%r87, %r86, %r85;\n"
"	cvt.rn.f32.s32 	%f15, %r87;\n"
"	mov.f32 	%f16, 0f43000000;    	\n"
"	div.approx.ftz.f32 	%f17, %f15, %f16;\n"
"	cvt.rpi.ftz.f32.f32 	%f18, %f17;\n"
"	cvt.rzi.ftz.s32.f32 	%r88, %f18;\n"
"	mov.u32 	%r89, 0;\n"
"	setp.le.s32 	%p8, %r88, %r89;\n"
"	@%p8 bra 	$Lt_3_18690;\n"
"	mov.s32 	%r90, %r88;\n"
"	mov.s32 	%r91, 0;\n"
"	setp.lt.s32 	%p9, %r47, %r43;\n"
"	mul.lo.s32 	%r92, %r88, 128;\n"
"	mov.s32 	%r93, %r90;\n"
/* ---- Chunk loop: %r91 = chunk offset (0, 128, ...),
 * %r96 = min(count - offset, 128) = entries in this chunk. ---- */
"$Lt_3_19202:\n"
"	sub.s32 	%r94, %r87, %r91;\n"
"	mov.s32 	%r95, 128;\n"
"	min.s32 	%r96, %r94, %r95;\n"
"	setp.le.s32 	%p10, %r96, %r35;\n"
"	@%p10 bra 	$Lt_3_19458;\n"
"	.loc	16	183	0\n"
/* Threads with tid.x < chunk size stage one member each:
 * %r99 = cell_particle_id[first + offset + tid.x]. */
"	ld.param.u64 	%rd33, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
"	add.s32 	%r97, %r91, %r35;\n"
"	add.s32 	%r98, %r85, %r97;\n"
"	cvt.s64.s32 	%rd34, %r98;\n"
"	mul.wide.s32 	%rd35, %r98, 4;\n"
"	add.u64 	%rd36, %rd33, %rd35;\n"
"	ld.global.s32 	%r99, [%rd36+0];\n"
"	.loc	16	184	0\n"
/* cell_list_sh[tid.x] = member id. */
"	cvt.s64.s32 	%rd37, %r35;\n"
"	mul.wide.s32 	%rd38, %r35, 4;\n"
"	add.u64 	%rd39, %rd7, %rd38;\n"
"	st.shared.s32 	[%rd39+0], %r99;\n"
"	.loc	16	185	0\n"
/* Fetch the member's texel and store its first three components in the
 * 16-byte slot pos_sh[tid.x] (bytes 0..11; bytes 12..15 are not written). */
"	mov.u32 	%r100, %r99;\n"
"	mov.s32 	%r101, 0;\n"
"	mov.u32 	%r102, %r101;\n"
"	mov.s32 	%r103, 0;\n"
"	mov.u32 	%r104, %r103;\n"
"	mov.s32 	%r105, 0;\n"
"	mov.u32 	%r106, %r105;\n"
"	tex.1d.v4.f32.s32 {%f19,%f20,%f21,%f22},[neigh_tex,{%r100,%r102,%r104,%r106}];\n"
"	mov.f32 	%f23, %f19;\n"
"	mov.f32 	%f24, %f20;\n"
"	mov.f32 	%f25, %f21;\n"
"	.loc	16	186	0\n"
"	mul.lo.u64 	%rd40, %rd37, 16;\n"
"	add.u64 	%rd41, %rd6, %rd40;\n"
"	st.shared.v2.f32 	[%rd41+0], {%f23,%f24};\n"
"	.loc	16	188	0\n"
"	st.shared.f32 	[%rd41+8], %f25;\n"
"$Lt_3_19458:\n"
"	.loc	16	190	0\n"
/* Barrier: staged data is complete before any thread reads it. */
"	bar.sync 	0;\n"
/* Only threads with pid < nt (%p9) scan the chunk, and only if it is
 * non-empty. %f27 = cell_size squared; %rd42 / %rd43 walk cell_list_sh
 * index and pos_sh address; %f30, %f29, %f28 = this thread's components. */
"	@!%p9 bra 	$Lt_3_20482;\n"
"	mov.u32 	%r107, 0;\n"
"	setp.le.s32 	%p11, %r96, %r107;\n"
"	@%p11 bra 	$Lt_3_20482;\n"
"	mov.s32 	%r108, %r96;\n"
"	mov.s64 	%rd42, 0;\n"
"	ld.param.f32 	%f26, [__cudaparm_calc_neigh_list_cell_cell_size];\n"
"	mul.ftz.f32 	%f27, %f26, %f26;\n"
"	mov.s64 	%rd43, %rd6;\n"
"	mov.f32 	%f28, %f14;\n"
"	mov.f32 	%f29, %f13;\n"
"	mov.f32 	%f30, %f12;\n"
"	mov.s32 	%r109, 0;\n"
"	mov.s32 	%r110, %r108;\n"
/* ---- Scan loop over the staged entries of this chunk ---- */
"$Lt_3_20994:\n"
/* Load one staged 16-byte slot (fourth component discarded) and compute
 * %f39 = r2 = dx*dx + dy*dy + dz*dz. */
"	ld.shared.v4.f32 	{%f31,%f32,%f33,_}, [%rd43+0];\n"
"	.loc	16	196	0\n"
"	sub.ftz.f32 	%f34, %f30, %f31;\n"
"	.loc	16	197	0\n"
"	sub.ftz.f32 	%f35, %f29, %f32;\n"
"	.loc	16	198	0\n"
"	sub.ftz.f32 	%f36, %f28, %f33;\n"
"	.loc	16	195	0\n"
"	mul.ftz.f32 	%f37, %f35, %f35;\n"
"	fma.rn.ftz.f32 	%f38, %f34, %f34, %f37;\n"
"	fma.rn.ftz.f32 	%f39, %f36, %f36, %f38;\n"
/* Reject unless cell_size^2 > r2 ... */
"	setp.gt.ftz.f32 	%p12, %f27, %f39;\n"
"	@!%p12 bra 	$Lt_3_25346;\n"
/* ... and (double) r2 > 0d3ee4f8b588e368f1 (approximately 1e-5). */
"	cvt.ftz.f64.f32 	%fd1, %f39;\n"
"	mov.f64 	%fd2, 0d3ee4f8b588e368f1;	\n"
"	setp.gt.f64 	%p13, %fd1, %fd2;\n"
"	@!%p13 bra 	$Lt_3_25346;\n"
"	.loc	16	202	0\n"
/* Accepted: n++ (counted even when the entry cannot be stored); the store
 * is skipped when neigh_bin_size < n. */
"	add.s32 	%r68, %r68, 1;\n"
"	ld.param.s32 	%r111, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
"	setp.lt.s32 	%p14, %r111, %r68;\n"
"	@%p14 bra 	$Lt_3_25346;\n"
"	.loc	16	204	0\n"
/* Write the staged id at the entry cursor, then advance the cursor:
 * +1 element normally, +1 + stride elements when
 * (n & (t_per_atom - 1)) == 0. */
"	mul.lo.u64 	%rd44, %rd42, 4;\n"
"	add.u64 	%rd45, %rd7, %rd44;\n"
"	ld.shared.s32 	%r112, [%rd45+0];\n"
"	st.global.s32 	[%rd22+0], %r112;\n"
"	cvt.s64.s32 	%rd46, %r60;\n"
"	mul.wide.s32 	%rd47, %r60, 4;\n"
"	add.u64 	%rd48, %rd22, %rd47;\n"
"	add.u64 	%rd49, %rd48, 4;\n"
"	add.u64 	%rd50, %rd22, 4;\n"
"	ld.param.s32 	%r113, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n"
"	sub.s32 	%r114, %r113, 1;\n"
"	and.b32 	%r115, %r68, %r114;\n"
"	mov.s32 	%r116, 0;\n"
"	setp.eq.s32 	%p15, %r115, %r116;\n"
"	selp.u64 	%rd22, %rd49, %rd50, %p15;\n"
"$Lt_3_25346:\n"
"$L_3_13570:\n"
"	.loc	16	202	0\n"
/* Next staged entry; loop until all %r96 entries were tested. */
"	add.s32 	%r109, %r109, 1;\n"
"	add.s64 	%rd42, %rd42, 1;\n"
"	add.u64 	%rd43, %rd43, 16;\n"
"	setp.ne.s32 	%p16, %r96, %r109;\n"
"	@%p16 bra 	$Lt_3_20994;\n"
"$Lt_3_20482:\n"
"$Lt_3_19970:\n"
"	.loc	16	212	0\n"
/* Barrier: all reads of the staged chunk finish before it is overwritten.
 * Then advance to the next chunk of 128. */
"	bar.sync 	0;\n"
"	add.s32 	%r91, %r91, 128;\n"
"	setp.ne.s32 	%p17, %r91, %r92;\n"
"	@%p17 bra 	$Lt_3_19202;\n"
"$Lt_3_18690:\n"
/* x loop tail: next cell along x. */
"	add.s32 	%r82, %r82, 1;\n"
"	add.u64 	%rd32, %rd32, 4;\n"
"	setp.ne.s32 	%p18, %r82, %r83;\n"
"	@%p18 bra 	$Lt_3_18434;\n"
"$Lt_3_17922:\n"
/* y loop tail. */
"	add.s32 	%r70, %r70, 1;\n"
"	setp.ne.s32 	%p19, %r73, %r70;\n"
"	@%p19 bra 	$Lt_3_17666;\n"
"$Lt_3_17154:\n"
/* z loop tail. */
"	add.s32 	%r64, %r64, 1;\n"
"	setp.ne.s32 	%p20, %r67, %r64;\n"
"	@%p20 bra 	$Lt_3_16898;\n"
"	bra.uni 	$Lt_3_16386;\n"
"$Lt_3_24066:\n"
/* Empty z range: nothing visited, so n = 0. */
"	mov.s32 	%r68, 0;\n"
"$Lt_3_16386:\n"
/* Store n to the count slot, only for threads with pid < nt. */
"	@!%p4 bra 	$Lt_3_23042;\n"
"	.loc	16	218	0\n"
"	st.global.s32 	[%rd18+0], %r68;\n"
"$Lt_3_23042:\n"
/* Pass loop tail: advance the slot offsets by ntid.x. */
"	add.s32 	%r45, %r45, 1;\n"
"	add.u32 	%r37, %r37, %r15;\n"
"	add.s32 	%r40, %r40, %r15;\n"
"	setp.ne.s32 	%p21, %r16, %r45;\n"
"	@%p21 bra 	$Lt_3_14594;\n"
"$Lt_3_14082:\n"
"	.loc	16	220	0\n"
"	exit;\n"
"$LDWend_calc_neigh_list_cell:\n"
"	}\n"
/*
 * kernel_special
 *
 * Index handled by a thread:
 *   ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
 * (signed divides). Threads with ii >= nt exit.
 *
 * n1, n2, n3 = nspecial[3*ii + 0], nspecial[3*ii + 1], nspecial[3*ii + 2].
 *
 * List range walked by the thread (element = 4 bytes):
 *   ii <  inum : numj  = dev_nbor[inum + ii]
 *                base  = &dev_nbor[2*inum + ii * t_per_atom]
 *                start = base + (tid.x & (t_per_atom - 1))
 *                end   = base + (numj & (t_per_atom - 1))
 *                             + (numj / t_per_atom) * inum * t_per_atom
 *                step  = inum * t_per_atom
 *   ii >= inum : base  = &host_nbor_list[(ii - inum) * max_nbors]
 *                start = base
 *                end   = base + host_numj[ii - inum]
 *                step  = 1
 *
 * For every list entry nbor in [start, end) with the given step, the thread
 * loads t = tag[nbor] and, when n3 > 0, compares t against
 * special[ii + k * nt] for k = 0 .. n3-1. On a match the entry is XORed with
 * (w << 30) and written back, where
 *   w = 1 when k < n1 and k < n2,
 *   w = 2 when exactly one of (k >= n1), (k >= n2) holds,
 *   w = 3 when k >= n1 and k >= n2.
 * The k loop does not stop at the first match.
 *
 * NOTE(review): nbor is used directly as an index into tag, and the masks
 * with t_per_atom - 1 presumably assume t_per_atom is a power of two --
 * confirm both assumptions against the host code.
 */
"	.entry kernel_special (\n"
"		.param .u64 __cudaparm_kernel_special_dev_nbor,\n"
"		.param .u64 __cudaparm_kernel_special_host_nbor_list,\n"
"		.param .u64 __cudaparm_kernel_special_host_numj,\n"
"		.param .u64 __cudaparm_kernel_special_tag,\n"
"		.param .u64 __cudaparm_kernel_special_nspecial,\n"
"		.param .u64 __cudaparm_kernel_special_special,\n"
"		.param .s32 __cudaparm_kernel_special_inum,\n"
"		.param .s32 __cudaparm_kernel_special_nt,\n"
"		.param .s32 __cudaparm_kernel_special_max_nbors,\n"
"		.param .s32 __cudaparm_kernel_special_t_per_atom)\n"
"	{\n"
"	.reg .u32 %r<45>;\n"
"	.reg .u64 %rd<45>;\n"
"	.reg .pred %p<11>;\n"
"	.loc	16	226	0\n"
"$LDWbegin_kernel_special:\n"
/* %r1 = t_per_atom, %r2 = tid.x, %r8 = ii; exit when ii >= nt (%r9). */
"	ld.param.s32 	%r1, [__cudaparm_kernel_special_t_per_atom];\n"
"	cvt.s32.u32 	%r2, %tid.x;\n"
"	div.s32 	%r3, %r2, %r1;\n"
"	cvt.s32.u32 	%r4, %ntid.x;\n"
"	div.s32 	%r5, %r4, %r1;\n"
"	cvt.s32.u32 	%r6, %ctaid.x;\n"
"	mul.lo.s32 	%r7, %r6, %r5;\n"
"	add.s32 	%r8, %r3, %r7;\n"
"	ld.param.s32 	%r9, [__cudaparm_kernel_special_nt];\n"
"	setp.ge.s32 	%p1, %r8, %r9;\n"
"	@%p1 bra 	$Lt_4_6146;\n"
"	.loc	16	236	0\n"
/* %r11, %r12, %r13 = n1, n2, n3 = nspecial[3*ii + 0 .. 2]. */
"	ld.param.u64 	%rd1, [__cudaparm_kernel_special_nspecial];\n"
"	mul.lo.s32 	%r10, %r8, 3;\n"
"	cvt.s64.s32 	%rd2, %r10;\n"
"	mul.wide.s32 	%rd3, %r10, 4;\n"
"	add.u64 	%rd4, %rd1, %rd3;\n"
"	ld.global.s32 	%r11, [%rd4+0];\n"
"	.loc	16	237	0\n"
"	ld.global.s32 	%r12, [%rd4+4];\n"
"	.loc	16	238	0\n"
"	ld.global.s32 	%r13, [%rd4+8];\n"
"	ld.param.s32 	%r14, [__cudaparm_kernel_special_inum];\n"
"	setp.ge.s32 	%p2, %r8, %r14;\n"
"	@%p2 bra 	$Lt_4_6914;\n"
"	.loc	16	244	0\n"
/* ii < inum: list lives in dev_nbor. %r15 = numj = dev_nbor[inum + ii]. */
"	ld.param.u64 	%rd5, [__cudaparm_kernel_special_dev_nbor];\n"
"	cvt.s64.s32 	%rd6, %r8;\n"
"	cvt.s64.s32 	%rd7, %r14;\n"
"	add.u64 	%rd8, %rd6, %rd7;\n"
"	mul.lo.u64 	%rd9, %rd8, 4;\n"
"	add.u64 	%rd10, %rd5, %rd9;\n"
"	ld.global.s32 	%r15, [%rd10+0];\n"
"	.loc	16	246	0\n"
/* %r17 = step = inum * t_per_atom (elements). */
"	mul.lo.s32 	%r16, %r14, %r1;\n"
"	mov.s32 	%r17, %r16;\n"
"	.loc	16	248	0\n"
/* %rd13 = base = &dev_nbor[2*inum + ii*t_per_atom];
 * %rd18 = end = base + (numj & (t-1)) + (numj / t) * step. */
"	sub.s32 	%r18, %r1, 1;\n"
"	mul.lo.s32 	%r19, %r18, %r8;\n"
"	add.s32 	%r20, %r14, %r19;\n"
"	cvt.s64.s32 	%rd11, %r20;\n"
"	mul.wide.s32 	%rd12, %r20, 4;\n"
"	add.u64 	%rd13, %rd10, %rd12;\n"
"	and.b32 	%r21, %r18, %r15;\n"
"	cvt.s64.s32 	%rd14, %r21;\n"
"	div.s32 	%r22, %r15, %r1;\n"
"	mul.lo.s32 	%r23, %r16, %r22;\n"
"	cvt.s64.s32 	%rd15, %r23;\n"
"	add.u64 	%rd16, %rd14, %rd15;\n"
"	mul.lo.u64 	%rd17, %rd16, 4;\n"
"	add.u64 	%rd18, %rd13, %rd17;\n"
"	.loc	16	249	0\n"
/* %rd21 = start = base + (tid.x & (t_per_atom - 1)). */
"	and.b32 	%r24, %r18, %r2;\n"
"	cvt.s64.s32 	%rd19, %r24;\n"
"	mul.wide.s32 	%rd20, %r24, 4;\n"
"	add.u64 	%rd21, %rd13, %rd20;\n"
"	bra.uni 	$Lt_4_6658;\n"
"$Lt_4_6914:\n"
"	.loc	16	252	0\n"
/* ii >= inum: list lives in host_nbor_list, contiguous (step 1), with
 * its length taken from host_numj[ii - inum]. */
"	sub.s32 	%r25, %r8, %r14;\n"
"	ld.param.u64 	%rd22, [__cudaparm_kernel_special_host_nbor_list];\n"
"	ld.param.s32 	%r26, [__cudaparm_kernel_special_max_nbors];\n"
"	mul.lo.s32 	%r27, %r26, %r25;\n"
"	cvt.s64.s32 	%rd23, %r27;\n"
"	mul.wide.s32 	%rd24, %r27, 4;\n"
"	add.u64 	%rd25, %rd22, %rd24;\n"
"	mov.s64 	%rd21, %rd25;\n"
"	.loc	16	254	0\n"
"	ld.param.u64 	%rd26, [__cudaparm_kernel_special_host_numj];\n"
"	cvt.s64.s32 	%rd27, %r25;\n"
"	mul.wide.s32 	%rd28, %r25, 4;\n"
"	add.u64 	%rd29, %rd26, %rd28;\n"
"	ld.global.s32 	%r28, [%rd29+0];\n"
"	cvt.s64.s32 	%rd30, %r28;\n"
"	mul.wide.s32 	%rd31, %r28, 4;\n"
"	add.u64 	%rd18, %rd25, %rd31;\n"
"	mov.s32 	%r17, 1;\n"
"$Lt_4_6658:\n"
/* Nothing to do when start >= end. %p4 = (n3 > 0); %rd32 = step. */
"	setp.ge.u64 	%p3, %rd21, %rd18;\n"
"	@%p3 bra 	$Lt_4_7170;\n"
"	mov.s32 	%r29, 0;\n"
"	setp.gt.s32 	%p4, %r13, %r29;\n"
"	cvt.s64.s32 	%rd32, %r17;\n"
"	ld.param.u64 	%rd33, [__cudaparm_kernel_special_tag];\n"
/* ---- Loop over list entries ---- */
"$Lt_4_7682:\n"
"	.loc	16	258	0\n"
/* %r30 = nbor = current list entry; %r31 = tag[nbor]. */
"	ld.global.s32 	%r30, [%rd21+0];\n"
"	.loc	16	259	0\n"
"	cvt.s64.s32 	%rd34, %r30;\n"
"	mul.wide.s32 	%rd35, %r30, 4;\n"
"	add.u64 	%rd36, %rd33, %rd35;\n"
"	ld.global.s32 	%r31, [%rd36+0];\n"
"	@!%p4 bra 	$Lt_4_7938;\n"
/* %rd42 = &special[ii]; it advances by nt elements (%rd39 bytes) per k. */
"	mov.s32 	%r32, %r13;\n"
"	cvt.s64.s32 	%rd37, %r8;\n"
"	cvt.s64.s32 	%rd38, %r9;\n"
"	mul.wide.s32 	%rd39, %r9, 4;\n"
"	ld.param.u64 	%rd40, [__cudaparm_kernel_special_special];\n"
"	mul.wide.s32 	%rd41, %r8, 4;\n"
"	add.u64 	%rd42, %rd40, %rd41;\n"
"	mov.s32 	%r33, 0;\n"
"	mov.s32 	%r34, %r32;\n"
/* ---- Loop over k = 0 .. n3-1 (%r33 = k) ---- */
"$Lt_4_8450:\n"
"	ld.global.s32 	%r35, [%rd42+0];\n"
"	setp.ne.s32 	%p5, %r35, %r31;\n"
"	@%p5 bra 	$Lt_4_8706;\n"
"	.loc	16	269	0\n"
/* Match: %r42 = w, chosen from {1, 2, 3} by (n1 <= k) and (n2 <= k);
 * the entry is XORed with w << 30 and stored back in place. */
"	setp.le.s32 	%p6, %r11, %r33;\n"
"	mov.s32 	%r36, 3;\n"
"	mov.s32 	%r37, 2;\n"
"	selp.s32 	%r38, %r36, %r37, %p6;\n"
"	mov.s32 	%r39, 2;\n"
"	mov.s32 	%r40, 1;\n"
"	selp.s32 	%r41, %r39, %r40, %p6;\n"
"	setp.le.s32 	%p7, %r12, %r33;\n"
"	selp.s32 	%r42, %r38, %r41, %p7;\n"
"	shl.b32 	%r43, %r42, 30;\n"
"	xor.b32 	%r30, %r30, %r43;\n"
"	.loc	16	270	0\n"
"	st.global.s32 	[%rd21+0], %r30;\n"
"$Lt_4_8706:\n"
"	add.s32 	%r33, %r33, 1;\n"
"	add.u64 	%rd42, %rd39, %rd42;\n"
"	setp.ne.s32 	%p8, %r13, %r33;\n"
"	@%p8 bra 	$Lt_4_8450;\n"
"$Lt_4_7938:\n"
"	.loc	16	257	0\n"
/* Advance to the next list entry (step elements) while below end. */
"	mul.lo.u64 	%rd43, %rd32, 4;\n"
"	add.u64 	%rd21, %rd21, %rd43;\n"
"	setp.lt.u64 	%p9, %rd21, %rd18;\n"
"	@%p9 bra 	$Lt_4_7682;\n"
"$Lt_4_7170:\n"
"$Lt_4_6146:\n"
"	.loc	16	276	0\n"
"	exit;\n"
"$LDWend_kernel_special:\n"
"	}\n"
;