Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F83008201
lal_neighbor.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Sep 14, 17:17
Size
19 KB
Mime Type
text/x-c
Expires
Mon, Sep 16, 17:17 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
20785008
Attached To
rLAMMPS lammps
lal_neighbor.cpp
View Options
/***************************************************************************
neighbor.cpp
-------------------
W. Michael Brown (ORNL)
Peng Wang (Nvidia)
Class for handling neighbor lists
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov, penwang@nvidia.com
***************************************************************************/
#include "lal_precision.h"
#include "lal_neighbor.h"
#include "lal_device.h"
#include "math.h"
using
namespace
LAMMPS_AL
;
// Return the per-atom device storage requirement, in bytes, for a
// neighbor list that can hold max_nbors neighbors under the current
// build mode (_gpu_nbor / _use_packing).
int Neighbor::bytes_per_atom(const int max_nbors) const {
  // Number of ints stored per atom depends on how the list is built.
  int ints_per_atom;
  if (_gpu_nbor==1)
    ints_per_atom=max_nbors+2;        // full GPU build: neighbors + 2 header ints
  else if (_gpu_nbor==2)
    ints_per_atom=max_nbors+3;        // CPU-binned GPU build: one extra header int
  else if (_use_packing)
    ints_per_atom=(max_nbors+2)*2;    // CPU build kept in both packed and unpacked form
  else
    ints_per_atom=max_nbors+3;        // CPU build, single (packed-only) copy
  return ints_per_atom*static_cast<int>(sizeof(int));
}
// Configure the neighbor builder for a device and build mode, and perform
// the initial allocations.
//
// shared           - per-device shared kernel state (compiled here unless packing)
// inum             - number of particles whose neighbors are built on the device
// host_inum        - number of particles whose neighbor lists also live on the host
// max_nbors        - initial per-atom neighbor capacity (rounded up below)
// maxspecial       - maximum special (bonded) neighbors per atom; only used
//                    for device-side builds (gpu_nbor != 0)
// devi             - device to allocate on
// gpu_nbor         - 0: neighboring performed on host (see get_host),
//                    1: binning + build on device, 2: binning on host,
//                    build on device (see build_nbor_list)
// gpu_host         - 0: no host lists, 1: keep host lists; any other value
//                    is not implemented (asserts)
// pre_cut          - if true, the packed list is built before cutoff tests
//                    and must be writable by kernels
// block_cell_2d, block_cell_id, block_nbor_build - kernel block sizes
// threads_per_atom - cooperative threads per atom in build kernels
// warp_size        - device warp/wavefront size
// time_device      - whether device-side timers are accumulated
// compile_flags    - extra flags passed to kernel compilation
//
// Returns false if any device allocation failed.
bool
Neighbor
::
init
(
NeighborShared
*
shared
,
const
int
inum
,
const
int
host_inum
,
const
int
max_nbors
,
const
int
maxspecial
,
UCL_Device
&
devi
,
const
int
gpu_nbor
,
const
int
gpu_host
,
const
bool
pre_cut
,
const
int
block_cell_2d
,
const
int
block_cell_id
,
const
int
block_nbor_build
,
const
int
threads_per_atom
,
const
int
warp_size
,
const
bool
time_device
,
const
std
::
string
compile_flags
)
{
// Release any storage from a previous configuration first.
clear
();
_threads_per_atom
=
threads_per_atom
;
_block_cell_2d
=
block_cell_2d
;
_block_cell_id
=
block_cell_id
;
_max_block_nbor_build
=
block_nbor_build
;
_block_nbor_build
=
block_nbor_build
;
_warp_size
=
warp_size
;
_shared
=
shared
;
dev
=&
devi
;
_gpu_nbor
=
gpu_nbor
;
_time_device
=
time_device
;
if
(
gpu_host
==
0
)
_gpu_host
=
false
;
else
if
(
gpu_host
==
1
)
_gpu_host
=
true
;
else
// Not yet implemented
assert
(
0
==
1
);
// The packed list is needed when the list is pre-cutoff or built on the host.
if
(
pre_cut
||
gpu_nbor
==
0
)
_alloc_packed
=
true
;
else
_alloc_packed
=
false
;
// Pre-cutoff lists are trimmed by kernels, so they must be writable.
if
(
pre_cut
)
_packed_permissions
=
UCL_READ_WRITE
;
else
_packed_permissions
=
UCL_READ_ONLY
;
bool
success
=
true
;
// Initialize timers for the selected GPU
_nbor_time_avail
=
false
;
time_nbor
.
init
(
*
dev
);
time_kernel
.
init
(
*
dev
);
time_hybrid1
.
init
(
*
dev
);
time_hybrid2
.
init
(
*
dev
);
time_transpose
.
init
(
*
dev
);
time_nbor
.
zero
();
time_kernel
.
zero
();
time_hybrid1
.
zero
();
time_hybrid2
.
zero
();
time_transpose
.
zero
();
// Capacities carry 10% headroom so small growth avoids reallocation.
_max_atoms
=
static_cast
<
int
>
(
static_cast
<
double
>
(
inum
)
*
1.10
);
if
(
_max_atoms
==
0
)
_max_atoms
=
1000
;
_max_host
=
static_cast
<
int
>
(
static_cast
<
double
>
(
host_inum
)
*
1.10
);
// Round capacity up to a multiple of threads_per_atom for the kernels.
_max_nbors
=
(
max_nbors
/
threads_per_atom
+
1
)
*
threads_per_atom
;
_maxspecial
=
maxspecial
;
// Special-neighbor handling is only done for device-side builds.
if
(
gpu_nbor
==
0
)
_maxspecial
=
0
;
// Host-built lists are staged through a double buffer of IJ_SIZE ints each
// (see the copy_count%2 logic in get_host/get_host3).
if
(
gpu_nbor
==
0
)
success
=
success
&&
(
host_packed
.
alloc
(
2
*
IJ_SIZE
,
*
dev
,
UCL_WRITE_ONLY
)
==
UCL_SUCCESS
);
alloc
(
success
);
if
(
!
success
)
return
false
;
if
(
_use_packing
==
false
)
_shared
->
compile_kernels
(
devi
,
gpu_nbor
,
compile_flags
);
return
success
;
}
// Allocate (or reallocate) all neighbor-list storage based on the
// capacities set in init(). On any allocation failure, success is set
// false; some early-exit paths return with success==false before all
// containers are populated.
void
Neighbor
::
alloc
(
bool
&
success
)
{
dev_nbor
.
clear
();
host_acc
.
clear
();
// Total particle count: device-built plus host-built.
int
nt
=
_max_atoms
+
_max_host
;
// Full list layout ((capacity+2 ints) per atom) is needed unless the
// host supplies packed lists and only a small accumulator is kept here.
if
(
_use_packing
==
false
||
_gpu_nbor
>
0
)
success
=
success
&&
(
dev_nbor
.
alloc
((
_max_nbors
+
2
)
*
_max_atoms
,
*
dev
)
==
UCL_SUCCESS
);
else
success
=
success
&&
(
dev_nbor
.
alloc
(
3
*
_max_atoms
,
*
dev
,
UCL_READ_ONLY
)
==
UCL_SUCCESS
);
// Host staging for per-atom counts and offsets (2 ints per particle).
success
=
success
&&
(
host_acc
.
alloc
(
nt
*
2
,
*
dev
,
UCL_READ_WRITE
)
==
UCL_SUCCESS
);
_c_bytes
=
dev_nbor
.
row_bytes
();
if
(
_alloc_packed
)
{
dev_packed
.
clear
();
success
=
success
&&
(
dev_packed
.
alloc
((
_max_nbors
+
2
)
*
_max_atoms
,
*
dev
,
_packed_permissions
)
==
UCL_SUCCESS
);
dev_acc
.
clear
();
success
=
success
&&
(
dev_acc
.
alloc
(
_max_atoms
,
*
dev
,
UCL_READ_WRITE
)
==
UCL_SUCCESS
);
_c_bytes
+=
dev_packed
.
row_bytes
()
+
dev_acc
.
row_bytes
();
}
if
(
_max_host
>
0
)
{
// Storage for neighbors of particles whose lists are kept on the host.
nbor_host
.
clear
();
dev_numj_host
.
clear
();
host_ilist
.
clear
();
host_jlist
.
clear
();
success
=
(
nbor_host
.
alloc
(
_max_nbors
*
_max_host
,
*
dev
,
UCL_READ_WRITE
,
UCL_READ_WRITE
)
==
UCL_SUCCESS
)
&&
success
;
success
=
success
&&
(
dev_numj_host
.
alloc
(
_max_host
,
*
dev
,
UCL_READ_WRITE
)
==
UCL_SUCCESS
);
success
=
success
&&
(
host_ilist
.
alloc
(
nt
,
*
dev
,
UCL_NOT_PINNED
)
==
UCL_SUCCESS
);
if
(
!
success
)
return
;
// Identity map: host_ilist[i] = i.
for
(
int
i
=
0
;
i
<
nt
;
i
++
)
host_ilist
[
i
]
=
i
;
success
=
success
&&
(
host_jlist
.
alloc
(
_max_host
,
*
dev
,
UCL_NOT_PINNED
)
==
UCL_SUCCESS
);
if
(
!
success
)
return
;
// host_jlist[i] points into nbor_host's host buffer, one fixed-size
// row of _max_nbors ints per host particle.
int
*
ptr
=
nbor_host
.
host
.
begin
();
for
(
int
i
=
0
;
i
<
_max_host
;
i
++
)
{
host_jlist
[
i
]
=
ptr
;
ptr
+=
_max_nbors
;
}
_c_bytes
+=
nbor_host
.
device
.
row_bytes
()
+
dev_numj_host
.
row_bytes
();
}
else
{
// Some OpenCL implementations return errors for NULL pointers as args
// so alias the (unused) host containers onto dev_nbor instead.
nbor_host
.
device
.
view
(
dev_nbor
);
dev_numj_host
.
view
(
dev_nbor
);
}
if
(
_maxspecial
>
0
)
{
// Special-neighbor data: counts (3 per atom), the list, and its transpose.
dev_nspecial
.
clear
();
dev_special
.
clear
();
dev_special_t
.
clear
();
int
at
=
_max_atoms
+
_max_host
;
success
=
success
&&
(
dev_nspecial
.
alloc
(
3
*
at
,
*
dev
,
UCL_READ_ONLY
)
==
UCL_SUCCESS
);
success
=
success
&&
(
dev_special
.
alloc
(
_maxspecial
*
at
,
*
dev
,
UCL_READ_WRITE
)
==
UCL_SUCCESS
);
success
=
success
&&
(
dev_special_t
.
alloc
(
_maxspecial
*
at
,
*
dev
,
UCL_READ_ONLY
)
==
UCL_SUCCESS
);
_gpu_bytes
+=
dev_nspecial
.
row_bytes
()
+
dev_special
.
row_bytes
()
+
dev_special_t
.
row_bytes
();
}
_allocated
=
true
;
}
// Release all neighbor-list storage and timers and reset byte/time
// counters. Safe to call when nothing is allocated.
void
Neighbor
::
clear
()
{
_gpu_bytes
=
0.0
;
_cell_bytes
=
0.0
;
_c_bytes
=
0.0
;
_bin_time
=
0.0
;
if
(
_ncells
>
0
)
{
_ncells
=
0
;
cell_counts
.
clear
();
// cell_iter is only heap-allocated for the CPU-binning path
// (_gpu_nbor==2, see build_nbor_list).
if
(
_gpu_nbor
==
2
)
delete
[]
cell_iter
;
}
if
(
_allocated
)
{
_allocated
=
false
;
_nbor_time_avail
=
false
;
host_packed
.
clear
();
host_acc
.
clear
();
dev_acc
.
clear
();
dev_nbor
.
clear
();
nbor_host
.
clear
();
dev_packed
.
clear
();
dev_numj_host
.
clear
();
host_ilist
.
clear
();
host_jlist
.
clear
();
dev_nspecial
.
clear
();
dev_special
.
clear
();
dev_special_t
.
clear
();
time_kernel
.
clear
();
time_nbor
.
clear
();
time_hybrid1
.
clear
();
time_hybrid2
.
clear
();
time_transpose
.
clear
();
}
}
// Report the host-side memory footprint of this neighbor builder in bytes.
double Neighbor::host_memory_usage() const {
  if (_gpu_nbor>0) {
    // Device-side build: host storage exists only when host lists are kept.
    if (!_gpu_host)
      return 0;
    double bytes=nbor_host.device.row_bytes()*nbor_host.rows();
    bytes+=host_ilist.row_bytes();
    bytes+=host_jlist.row_bytes();
    return bytes;
  }
  // Host-side build: staging buffer, accumulator, and the object itself.
  return host_packed.row_bytes()*host_packed.rows()+
         host_acc.row_bytes()+sizeof(Neighbor);
}
// Copy a host-built (LAMMPS-style) neighbor list to the device.
//
// inum       - number of atoms with neighbor lists
// ilist      - atom index for each list (need not be the identity)
// numj       - neighbor count per atom, indexed by atom id
// firstneigh - per-atom pointers to the host neighbor arrays
// block_size - block size for the unpack kernel
//
// Neighbor indices are streamed through host_packed, which holds two
// IJ_SIZE halves used as a double buffer: while one half is being copied
// asynchronously to dev_packed, the other is being filled.
void
Neighbor
::
get_host
(
const
int
inum
,
int
*
ilist
,
int
*
numj
,
int
**
firstneigh
,
const
int
block_size
)
{
_nbor_time_avail
=
true
;
time_nbor
.
start
();
// Copy the atom-index list into the first inum slots of dev_nbor.
UCL_H_Vec
<
int
>
ilist_view
;
ilist_view
.
view
(
ilist
,
inum
,
*
dev
);
ucl_copy
(
dev_nbor
,
ilist_view
,
false
);
UCL_D_Vec
<
int
>
nbor_offset
;
UCL_H_Vec
<
int
>
host_offset
;
int
copy_count
=
0
;
// completed IJ_SIZE transfers (selects buffer half)
int
ij_count
=
0
;
// indices accumulated in the current half
int
acc_count
=
0
;
// running prefix sum of neighbor counts
int
dev_count
=
0
;
// write offset into dev_packed
int
*
h_ptr
=
host_packed
.
begin
();
_nbor_pitch
=
inum
;
for
(
int
ii
=
0
;
ii
<
inum
;
ii
++
)
{
int
i
=
ilist
[
ii
];
int
nj
=
numj
[
i
];
// host_acc layout: [0,inum) = counts, [inum,2*inum) = start offsets.
host_acc
[
ii
]
=
nj
;
host_acc
[
ii
+
inum
]
=
acc_count
;
acc_count
+=
nj
;
int
*
jlist
=
firstneigh
[
i
];
for
(
int
jj
=
0
;
jj
<
nj
;
jj
++
)
{
*
h_ptr
=
jlist
[
jj
];
h_ptr
++
;
ij_count
++
;
if
(
ij_count
==
IJ_SIZE
)
{
// Current half is full: wait for the previous async copy, then
// ship this half and switch to the other one.
dev_nbor
.
sync
();
host_offset
.
view_offset
(
IJ_SIZE
*
(
copy_count
%
2
),
host_packed
,
IJ_SIZE
);
nbor_offset
.
view_offset
(
dev_count
,
dev_packed
,
IJ_SIZE
);
ucl_copy
(
nbor_offset
,
host_offset
,
true
);
copy_count
++
;
ij_count
=
0
;
dev_count
+=
IJ_SIZE
;
h_ptr
=
host_packed
.
begin
()
+
(
IJ_SIZE
*
(
copy_count
%
2
));
}
}
}
// Flush the final, partially filled buffer half.
if
(
ij_count
!=
0
)
{
dev_nbor
.
sync
();
host_offset
.
view_offset
(
IJ_SIZE
*
(
copy_count
%
2
),
host_packed
,
ij_count
);
nbor_offset
.
view_offset
(
dev_count
,
dev_packed
,
ij_count
);
ucl_copy
(
nbor_offset
,
host_offset
,
true
);
}
// Copy counts + offsets into dev_nbor just after the atom indices.
UCL_D_Vec
<
int
>
acc_view
;
acc_view
.
view_offset
(
inum
,
dev_nbor
,
inum
*
2
);
ucl_copy
(
acc_view
,
host_acc
,
true
);
// Build the inverse map (atom id -> list position) and copy to dev_acc.
// NOTE(review): host_view is a fresh pinned allocation on every call —
// presumably acceptable at neighbor-rebuild frequency; verify.
UCL_H_Vec
<
int
>
host_view
;
host_view
.
alloc
(
_max_atoms
,
*
dev
,
UCL_READ_WRITE
);
for
(
int
ii
=
0
;
ii
<
inum
;
ii
++
)
{
int
i
=
ilist
[
ii
];
host_view
[
i
]
=
ii
;
}
ucl_copy
(
dev_acc
,
host_view
,
true
);
time_nbor
.
stop
();
// Unless the packed form is used directly, run the kernel that expands
// dev_packed into the dev_nbor layout.
if
(
_use_packing
==
false
)
{
time_kernel
.
start
();
int
GX
=
static_cast
<
int
>
(
ceil
(
static_cast
<
double
>
(
inum
)
*
_threads_per_atom
/
block_size
));
_shared
->
k_nbor
.
set_size
(
GX
,
block_size
);
_shared
->
k_nbor
.
run
(
&
dev_nbor
,
&
dev_packed
,
&
inum
,
&
_threads_per_atom
);
time_kernel
.
stop
();
}
}
// This is the same as get_host, but the requirement that ilist[i]=i and
// inum=nlocal is forced to be true to allow direct indexing of neighbors of
// neighbors
// Copy a host-built neighbor list to the device when ilist[i]=i and
// inum=nlocal, so lists can be indexed directly by atom id (enables
// neighbors-of-neighbors access). Same double-buffered streaming scheme
// as get_host; counts/offsets are stored by atom id rather than by list
// position.
//
// inum       - number of local atoms (equals the list length assumption)
// nlist      - number of entries in ilist actually carrying neighbors
// ilist, numj, firstneigh, block_size - as in get_host
void
Neighbor
::
get_host3
(
const
int
inum
,
const
int
nlist
,
int
*
ilist
,
int
*
numj
,
int
**
firstneigh
,
const
int
block_size
)
{
_nbor_time_avail
=
true
;
time_nbor
.
start
();
UCL_H_Vec
<
int
>
ilist_view
;
ilist_view
.
view
(
ilist
,
inum
,
*
dev
);
ucl_copy
(
dev_nbor
,
ilist_view
,
false
);
UCL_D_Vec
<
int
>
nbor_offset
;
UCL_H_Vec
<
int
>
host_offset
;
int
copy_count
=
0
;
// completed IJ_SIZE transfers (selects buffer half)
int
ij_count
=
0
;
// indices accumulated in the current half
int
acc_count
=
0
;
// running prefix sum of neighbor counts
int
dev_count
=
0
;
// write offset into dev_packed
int
*
h_ptr
=
host_packed
.
begin
();
_nbor_pitch
=
inum
;
// If only part of the atoms appear in ilist, zero the counts first so
// absent atoms read as having no neighbors.
if
(
nlist
!=
inum
)
host_acc
.
zero
(
inum
);
// First pass: record counts and start offsets, indexed by atom id.
for
(
int
ii
=
0
;
ii
<
nlist
;
ii
++
)
{
int
i
=
ilist
[
ii
];
int
nj
=
numj
[
i
];
host_acc
[
i
]
=
nj
;
host_acc
[
i
+
inum
]
=
acc_count
;
acc_count
+=
nj
;
}
// Second pass: stream the neighbor indices in atom-id order.
for
(
int
i
=
0
;
i
<
inum
;
i
++
)
{
int
nj
=
host_acc
[
i
];
int
*
jlist
=
firstneigh
[
i
];
for
(
int
jj
=
0
;
jj
<
nj
;
jj
++
)
{
*
h_ptr
=
jlist
[
jj
];
h_ptr
++
;
ij_count
++
;
if
(
ij_count
==
IJ_SIZE
)
{
// Current half is full: wait for the previous async copy, then
// ship this half and switch to the other one.
dev_nbor
.
sync
();
host_offset
.
view_offset
(
IJ_SIZE
*
(
copy_count
%
2
),
host_packed
,
IJ_SIZE
);
nbor_offset
.
view_offset
(
dev_count
,
dev_packed
,
IJ_SIZE
);
ucl_copy
(
nbor_offset
,
host_offset
,
true
);
copy_count
++
;
ij_count
=
0
;
dev_count
+=
IJ_SIZE
;
h_ptr
=
host_packed
.
begin
()
+
(
IJ_SIZE
*
(
copy_count
%
2
));
}
}
}
// Flush the final, partially filled buffer half.
if
(
ij_count
!=
0
)
{
dev_nbor
.
sync
();
host_offset
.
view_offset
(
IJ_SIZE
*
(
copy_count
%
2
),
host_packed
,
ij_count
);
nbor_offset
.
view_offset
(
dev_count
,
dev_packed
,
ij_count
);
ucl_copy
(
nbor_offset
,
host_offset
,
true
);
}
// Copy counts + offsets into dev_nbor just after the atom indices.
UCL_D_Vec
<
int
>
acc_view
;
acc_view
.
view_offset
(
inum
,
dev_nbor
,
inum
*
2
);
ucl_copy
(
acc_view
,
host_acc
,
true
);
time_nbor
.
stop
();
// Unless the packed form is used directly, run the kernel that expands
// dev_packed into the dev_nbor layout.
if
(
_use_packing
==
false
)
{
time_kernel
.
start
();
int
GX
=
static_cast
<
int
>
(
ceil
(
static_cast
<
double
>
(
inum
)
*
_threads_per_atom
/
block_size
));
_shared
->
k_nbor
.
set_size
(
GX
,
block_size
);
_shared
->
k_nbor
.
run
(
&
dev_nbor
,
&
dev_packed
,
&
inum
,
&
_threads_per_atom
);
time_kernel
.
stop
();
}
}
// Grow the per-atom neighbor capacity to hold at least maxn neighbors.
// No-op if the current capacity already suffices. The new capacity gets
// 10% headroom and is rounded up to a multiple of _threads_per_atom.
// On failure, success is set false.
template
<
class
numtyp
,
class
acctyp
>
void
Neighbor
::
resize_max_neighbors
(
const
int
maxn
,
bool
&
success
)
{
if
(
maxn
>
_max_nbors
)
{
int
mn
=
static_cast
<
int
>
(
static_cast
<
double
>
(
maxn
)
*
1.10
);
mn
=
(
mn
/
_threads_per_atom
+
1
)
*
_threads_per_atom
;
success
=
success
&&
(
dev_nbor
.
resize
((
mn
+
1
)
*
_max_atoms
)
==
UCL_SUCCESS
);
_gpu_bytes
=
dev_nbor
.
row_bytes
();
if
(
_max_host
>
0
)
{
success
=
success
&&
(
nbor_host
.
resize
(
mn
*
_max_host
)
==
UCL_SUCCESS
);
// Resizing may move the host buffer, so rebuild the row pointers
// (same scheme as in alloc(), but with the new row length mn).
int
*
ptr
=
nbor_host
.
host
.
begin
();
for
(
int
i
=
0
;
i
<
_max_host
;
i
++
)
{
host_jlist
[
i
]
=
ptr
;
ptr
+=
mn
;
}
_gpu_bytes
+=
nbor_host
.
row_bytes
();
}
else
{
// Re-establish the NULL-pointer-workaround aliases after the resize
// (see the matching comment in alloc()).
nbor_host
.
device
.
view
(
dev_nbor
);
dev_numj_host
.
view
(
dev_nbor
);
}
if
(
_alloc_packed
)
{
success
=
success
&&
(
dev_packed
.
resize
((
mn
+
2
)
*
_max_atoms
)
==
UCL_SUCCESS
);
_gpu_bytes
+=
dev_packed
.
row_bytes
();
}
_max_nbors
=
mn
;
}
}
// Build the neighbor list on the device using cell (bin) decomposition.
//
// x                 - host particle coordinates
// inum              - particles whose lists live on the device
// host_inum         - particles whose lists are also needed on the host
// nall              - total particles including ghosts
// atom              - device atom storage (positions, ids, tags)
// sublo/subhi       - local subdomain bounds
// tag/nspecial/special - topology data for special-neighbor masking
//                     (only used when _maxspecial > 0)
// success (out)     - set false on allocation failure
// mn (out)          - maximum neighbors found for any particle
//
// Binning happens on the host when _gpu_nbor==2 and on the device when
// _gpu_nbor==1. If the build overflows the current capacity, storage is
// grown and the build is restarted recursively.
template
<
class
numtyp
,
class
acctyp
>
void
Neighbor
::
build_nbor_list
(
double
**
x
,
const
int
inum
,
const
int
host_inum
,
const
int
nall
,
Atom
<
numtyp
,
acctyp
>
&
atom
,
double
*
sublo
,
double
*
subhi
,
tagint
*
tag
,
int
**
nspecial
,
tagint
**
special
,
bool
&
success
,
int
&
mn
)
{
_nbor_time_avail
=
true
;
const
int
nt
=
inum
+
host_inum
;
// Calculate number of cells and allocate storage for binning as necessary
int
ncellx
,
ncelly
,
ncellz
,
ncell_3d
;
// Ghost-cell layers on both sides of the subdomain in each dimension.
int
ghost_cells
=
2
*
_cells_in_cutoff
;
ncellx
=
static_cast
<
int
>
(
ceil
((
subhi
[
0
]
-
sublo
[
0
])
/
_cell_size
))
+
ghost_cells
;
ncelly
=
static_cast
<
int
>
(
ceil
((
subhi
[
1
]
-
sublo
[
1
])
/
_cell_size
))
+
ghost_cells
;
ncellz
=
static_cast
<
int
>
(
ceil
((
subhi
[
2
]
-
sublo
[
2
])
/
_cell_size
))
+
ghost_cells
;
ncell_3d
=
ncellx
*
ncelly
*
ncellz
;
// Grow cell-count storage if the grid got bigger (never shrinks).
if
(
ncell_3d
+
1
>
_ncells
)
{
cell_counts
.
clear
();
if
(
_gpu_nbor
==
2
)
{
// CPU binning also needs a host-side iterator array per cell.
if
(
_ncells
>
0
)
delete
[]
cell_iter
;
cell_iter
=
new
int
[
ncell_3d
+
1
];
cell_counts
.
alloc
(
ncell_3d
+
1
,
dev_nbor
,
UCL_READ_WRITE
,
UCL_READ_ONLY
);
}
else
{
cell_counts
.
device
.
clear
();
cell_counts
.
device
.
alloc
(
ncell_3d
+
1
,
dev_nbor
);
}
_ncells
=
ncell_3d
+
1
;
_cell_bytes
=
cell_counts
.
device
.
row_bytes
();
}
const
numtyp
cutoff_cast
=
static_cast
<
numtyp
>
(
_cutoff
);
if
(
_maxspecial
>
0
)
{
// Stage topology data (special-neighbor counts/lists and tags) to the
// device, then transpose the special list for coalesced kernel access.
time_nbor
.
start
();
UCL_H_Vec
<
int
>
view_nspecial
;
UCL_H_Vec
<
tagint
>
view_special
,
view_tag
;
view_nspecial
.
view
(
nspecial
[
0
],
nt
*
3
,
*
dev
);
view_special
.
view
(
special
[
0
],
nt
*
_maxspecial
,
*
dev
);
view_tag
.
view
(
tag
,
nall
,
*
dev
);
ucl_copy
(
dev_nspecial
,
view_nspecial
,
nt
*
3
,
false
);
ucl_copy
(
dev_special_t
,
view_special
,
nt
*
_maxspecial
,
false
);
ucl_copy
(
atom
.
dev_tag
,
view_tag
,
nall
,
false
);
time_nbor
.
stop
();
if
(
_time_device
)
time_nbor
.
add_to_total
();
time_transpose
.
start
();
const
int
b2x
=
_block_cell_2d
;
const
int
b2y
=
_block_cell_2d
;
const
int
g2x
=
static_cast
<
int
>
(
ceil
(
static_cast
<
double
>
(
_maxspecial
)
/
b2x
));
const
int
g2y
=
static_cast
<
int
>
(
ceil
(
static_cast
<
double
>
(
nt
)
/
b2y
));
_shared
->
k_transpose
.
set_size
(
g2x
,
g2y
,
b2x
,
b2y
);
_shared
->
k_transpose
.
run
(
&
dev_special
,
&
dev_special_t
,
&
_maxspecial
,
&
nt
);
time_transpose
.
stop
();
}
// If binning on CPU, do this now
if
(
_gpu_nbor
==
2
)
{
double
stime
=
MPI_Wtime
();
int
*
cell_id
=
atom
.
host_cell_id
.
begin
();
int
*
particle_id
=
atom
.
host_particle_id
.
begin
();
// Build cell list on CPU
cell_counts
.
host
.
zero
();
double
i_cell_size
=
1.0
/
_cell_size
;
int
offset_hi
=
_cells_in_cutoff
+
1
;
// Owned particles: clamp into the interior (non-ghost) cell range.
// Counts are accumulated at id+1 so the in-place prefix sum below
// yields per-cell start offsets.
for
(
int
i
=
0
;
i
<
nt
;
i
++
)
{
double
px
,
py
,
pz
;
px
=
x
[
i
][
0
]
-
sublo
[
0
];
py
=
x
[
i
][
1
]
-
sublo
[
1
];
pz
=
x
[
i
][
2
]
-
sublo
[
2
];
int
ix
=
static_cast
<
int
>
(
px
*
i_cell_size
+
1
);
ix
=
std
::
max
(
ix
,
_cells_in_cutoff
);
ix
=
std
::
min
(
ix
,
ncellx
-
offset_hi
);
int
iy
=
static_cast
<
int
>
(
py
*
i_cell_size
+
1
);
iy
=
std
::
max
(
iy
,
_cells_in_cutoff
);
iy
=
std
::
min
(
iy
,
ncelly
-
offset_hi
);
int
iz
=
static_cast
<
int
>
(
pz
*
i_cell_size
+
1
);
iz
=
std
::
max
(
iz
,
_cells_in_cutoff
);
iz
=
std
::
min
(
iz
,
ncellz
-
offset_hi
);
int
id
=
ix
+
iy
*
ncellx
+
iz
*
ncellx
*
ncelly
;
cell_id
[
i
]
=
id
;
cell_counts
[
id
+
1
]
++
;
}
// Ghost particles: clamp to the full grid including ghost cells.
for
(
int
i
=
nt
;
i
<
nall
;
i
++
)
{
double
px
,
py
,
pz
;
px
=
x
[
i
][
0
]
-
sublo
[
0
];
py
=
x
[
i
][
1
]
-
sublo
[
1
];
pz
=
x
[
i
][
2
]
-
sublo
[
2
];
int
ix
=
static_cast
<
int
>
(
px
*
i_cell_size
+
1
);
ix
=
std
::
max
(
ix
,
0
);
ix
=
std
::
min
(
ix
,
ncellx
-
1
);
int
iy
=
static_cast
<
int
>
(
py
*
i_cell_size
+
1
);
iy
=
std
::
max
(
iy
,
0
);
iy
=
std
::
min
(
iy
,
ncelly
-
1
);
int
iz
=
static_cast
<
int
>
(
pz
*
i_cell_size
+
1
);
iz
=
std
::
max
(
iz
,
0
);
iz
=
std
::
min
(
iz
,
ncellz
-
1
);
int
id
=
ix
+
iy
*
ncellx
+
iz
*
ncellx
*
ncelly
;
cell_id
[
i
]
=
id
;
cell_counts
[
id
+
1
]
++
;
}
// Estimate the max neighbor count from the fullest cell.
// NOTE(review): the *8 factor is a heuristic headroom multiplier —
// presumably covering the 27-cell stencil occupancy; confirm.
mn
=
0
;
for
(
int
i
=
0
;
i
<
_ncells
;
i
++
)
mn
=
std
::
max
(
mn
,
cell_counts
[
i
]);
mn
*=
8
;
set_nbor_block_size
(
mn
/
2
);
resize_max_neighbors
<
numtyp
,
acctyp
>
(
mn
,
success
);
if
(
!
success
)
return
;
_total_atoms
=
nt
;
// In-place exclusive prefix sum over cell counts; cell_iter tracks the
// next free slot per cell while scattering particle ids below.
cell_iter
[
0
]
=
0
;
for
(
int
i
=
1
;
i
<
_ncells
;
i
++
)
{
cell_counts
[
i
]
+=
cell_counts
[
i
-
1
];
cell_iter
[
i
]
=
cell_counts
[
i
];
}
time_hybrid1
.
start
();
cell_counts
.
update_device
(
true
);
time_hybrid1
.
stop
();
// Scatter particle ids into cell order.
for
(
int
i
=
0
;
i
<
nall
;
i
++
)
{
int
celli
=
cell_id
[
i
];
int
ploc
=
cell_iter
[
celli
];
cell_iter
[
celli
]
++
;
particle_id
[
ploc
]
=
i
;
}
time_hybrid2
.
start
();
ucl_copy
(
atom
.
dev_particle_id
,
atom
.
host_particle_id
,
true
);
time_hybrid2
.
stop
();
_bin_time
+=
MPI_Wtime
()
-
stime
;
}
time_kernel
.
start
();
_nbor_pitch
=
inum
;
_shared
->
neigh_tex
.
bind_float
(
atom
.
x
,
4
);
// If binning on GPU, do this now
if
(
_gpu_nbor
==
1
)
{
const
numtyp
i_cell_size
=
static_cast
<
numtyp
>
(
1.0
/
_cell_size
);
const
int
neigh_block
=
_block_cell_id
;
const
int
GX
=
(
int
)
ceil
((
float
)
nall
/
neigh_block
);
const
numtyp
sublo0
=
static_cast
<
numtyp
>
(
sublo
[
0
]);
const
numtyp
sublo1
=
static_cast
<
numtyp
>
(
sublo
[
1
]);
const
numtyp
sublo2
=
static_cast
<
numtyp
>
(
sublo
[
2
]);
// Assign each particle a cell id, then sort particles by cell.
_shared
->
k_cell_id
.
set_size
(
GX
,
neigh_block
);
_shared
->
k_cell_id
.
run
(
&
atom
.
x
,
&
atom
.
dev_cell_id
,
&
atom
.
dev_particle_id
,
&
sublo0
,
&
sublo1
,
&
sublo2
,
&
i_cell_size
,
&
ncellx
,
&
ncelly
,
&
ncellz
,
&
nt
,
&
nall
,
&
_cells_in_cutoff
);
atom
.
sort_neighbor
(
nall
);
/* calculate cell count */
_shared
->
k_cell_counts
.
set_size
(
GX
,
neigh_block
);
_shared
->
k_cell_counts
.
run
(
&
atom
.
dev_cell_id
,
&
cell_counts
,
&
nall
,
&
ncell_3d
);
}
/* build the neighbor list */
const
int
cell_block
=
_block_nbor_build
;
// One work-group row per interior (non-ghost) cell.
_shared
->
k_build_nbor
.
set_size
(
ncellx
-
ghost_cells
,(
ncelly
-
ghost_cells
)
*
(
ncellz
-
ghost_cells
),
cell_block
,
1
);
_shared
->
k_build_nbor
.
run
(
&
atom
.
x
,
&
atom
.
dev_particle_id
,
&
cell_counts
,
&
dev_nbor
,
&
nbor_host
,
&
dev_numj_host
,
&
_max_nbors
,
&
cutoff_cast
,
&
ncellx
,
&
ncelly
,
&
ncellz
,
&
inum
,
&
nt
,
&
nall
,
&
_threads_per_atom
,
&
_cells_in_cutoff
);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec
<
int
>
numj
;
numj
.
view_offset
(
inum
,
dev_nbor
,
inum
);
ucl_copy
(
host_acc
,
numj
,
inum
,
true
);
if
(
nt
>
inum
)
{
UCL_H_Vec
<
int
>
host_offset
;
host_offset
.
view_offset
(
inum
,
host_acc
,
nt
-
inum
);
ucl_copy
(
host_offset
,
dev_numj_host
,
nt
-
inum
,
true
);
}
// For GPU binning the capacity check must come from the actual counts;
// for CPU binning (_gpu_nbor==2) capacity was already sized above.
if
(
_gpu_nbor
!=
2
)
{
host_acc
.
sync
();
mn
=
host_acc
[
0
];
for
(
int
i
=
1
;
i
<
nt
;
i
++
)
mn
=
std
::
max
(
mn
,
host_acc
[
i
]);
set_nbor_block_size
(
mn
);
if
(
mn
>
_max_nbors
)
{
// Overflow: grow storage and rebuild the whole list from scratch.
resize_max_neighbors
<
numtyp
,
acctyp
>
(
mn
,
success
);
if
(
!
success
)
return
;
time_kernel
.
stop
();
if
(
_time_device
)
time_kernel
.
add_to_total
();
build_nbor_list
(
x
,
inum
,
host_inum
,
nall
,
atom
,
sublo
,
subhi
,
tag
,
nspecial
,
special
,
success
,
mn
);
return
;
}
}
if
(
_maxspecial
>
0
)
{
// Mask/mark special (bonded) neighbors in the built lists.
const
int
GX2
=
static_cast
<
int
>
(
ceil
(
static_cast
<
double
>
(
nt
*
_threads_per_atom
)
/
cell_block
));
_shared
->
k_special
.
set_size
(
GX2
,
cell_block
);
_shared
->
k_special
.
run
(
&
dev_nbor
,
&
nbor_host
,
&
dev_numj_host
,
&
atom
.
dev_tag
,
&
dev_nspecial
,
&
dev_special
,
&
inum
,
&
nt
,
&
_max_nbors
,
&
_threads_per_atom
);
}
time_kernel
.
stop
();
time_nbor
.
start
();
// If some lists are owned by the host, bring them back and wait.
if
(
inum
<
nt
)
{
nbor_host
.
update_host
(
true
);
nbor_host
.
sync
();
}
time_nbor
.
stop
();
}
// Explicit instantiation of build_nbor_list for the precision pair
// configured at build time (PRECISION/ACC_PRECISION from lal_precision.h).
template
void
Neighbor
::
build_nbor_list
<
PRECISION
,
ACC_PRECISION
>
(
double
**
x
,
const
int
inum
,
const
int
host_inum
,
const
int
nall
,
Atom
<
PRECISION
,
ACC_PRECISION
>
&
atom
,
double
*
sublo
,
double
*
subhi
,
tagint
*
,
int
**
,
tagint
**
,
bool
&
success
,
int
&
mn
);
Event Timeline
Log In to Comment