Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F73239916
fix_intel.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jul 19, 22:15
Size
37 KB
Mime Type
text/x-c
Expires
Sun, Jul 21, 22:15 (2 d)
Engine
blob
Format
Raw Data
Handle
19149573
Attached To
rLAMMPS lammps
fix_intel.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Anupama Kurpad (Intel) - Host Affinitization
------------------------------------------------------------------------- */
#include "comm.h"
#include "error.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_request.h"
#include "pair.h"
#include "pair_hybrid.h"
#include "pair_hybrid_overlay.h"
#include "timer.h"
#include "universe.h"
#include "update.h"
#include "fix_intel.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef _LMP_INTEL_OFFLOAD
#ifndef INTEL_OFFLOAD_NOAFFINITY
#include <unistd.h>
#endif
#endif
#include "suffix.h"
// Namespaces used unqualified throughout this file.
using namespace LAMMPS_NS;
using namespace FixConst;

// Compile-time notice: the compiler advertises offload capability
// (__INTEL_OFFLOAD) but the package was not configured to use it
// (_LMP_INTEL_OFFLOAD is not defined).
#if defined(__INTEL_OFFLOAD) && !defined(_LMP_INTEL_OFFLOAD)
#warning "Not building Intel package with Xeon Phi offload support."
#endif
// Values compared against neighbor->style (see FixIntel::setup, which
// requires BIN).  Numeric values are spelled out; they must stay 0, 1, 2.
enum { NSQ = 0, BIN = 1, MULTI = 2 };
/* ---------------------------------------------------------------------- */
// Build the fix from a "package intel" command.
//
//   arg[3]   integer ncops; values below 1 disable offload (CPU only)
//   arg[4+]  optional keywords:
//              omp <n>, mode single|mixed|double, balance <x>,
//              ghost yes|no, tpc <n>, tptask <n>, lrt yes|no, no_affinity
//            undocumented: offload_affinity_balanced, buffers <n>
//
// Every malformed argument raises "Illegal package intel command".
// All integer values are parsed with force->inumeric() so that non-numeric
// text is rejected instead of being silently read as 0 (as atoi() would).
FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
{
  if (narg < 4) error->all(FLERR,"Illegal package intel command");

  int ncops = force->inumeric(FLERR,arg[3]);

  // defaults

  _nbor_pack_width = 1;
  _three_body_neighbor = 0;
  _precision_mode = PREC_MODE_MIXED;
  _offload_balance = -1.0;
  _overflow_flag[LMP_OVERFLOW] = 0;
  _off_overflow_flag[LMP_OVERFLOW] = 0;

  _offload_affinity_balanced = 0;
  _offload_threads = 0;
  _offload_tpc = 4;

  _force_array_s = 0;
  _force_array_m = 0;
  _force_array_d = 0;
  _ev_array_s = 0;
  _ev_array_d = 0;

  #ifdef _LMP_INTEL_OFFLOAD
  if (ncops < 0) error->all(FLERR,"Illegal package intel command");
  _offload_affinity_set = 0;
  _off_force_array_s = 0;
  _off_force_array_m = 0;
  _off_force_array_d = 0;
  _off_ev_array_s = 0;
  _off_ev_array_d = 0;
  _balance_fixed = 0.0;
  _cop = 0;
  #endif

  // optional keywords

  int nomp = 0, no_affinity = 0;
  _allow_separate_buffers = 1;
  _offload_ghost = -1;
  _lrt = 0;

  int iarg = 4;
  while (iarg < narg) {
    const char *keyword = arg[iarg];

    // keywords that take no value

    if (strcmp(keyword,"no_affinity") == 0) {
      no_affinity = 1;
      iarg++;
      continue;
    }
    if (strcmp(keyword,"offload_affinity_balanced") == 0) {   // undocumented
      _offload_affinity_balanced = 1;
      iarg++;
      continue;
    }

    // every remaining keyword needs exactly one value; an unknown keyword
    // ends in the same error either here or in the final else below

    if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
    char *value = arg[iarg+1];

    if (strcmp(keyword,"omp") == 0) {
      nomp = force->inumeric(FLERR,value);
    } else if (strcmp(keyword,"mode") == 0) {
      if (strcmp(value,"single") == 0)
        _precision_mode = PREC_MODE_SINGLE;
      else if (strcmp(value,"mixed") == 0)
        _precision_mode = PREC_MODE_MIXED;
      else if (strcmp(value,"double") == 0)
        _precision_mode = PREC_MODE_DOUBLE;
      else
        error->all(FLERR,"Illegal package intel command");
    } else if (strcmp(keyword,"balance") == 0) {
      _offload_balance = force->numeric(FLERR,value);
    } else if (strcmp(keyword,"ghost") == 0) {
      if (strcmp(value,"yes") == 0) _offload_ghost = 1;
      else if (strcmp(value,"no") == 0) _offload_ghost = 0;
      else error->all(FLERR,"Illegal package intel command");
    } else if (strcmp(keyword,"tpc") == 0) {
      _offload_tpc = force->inumeric(FLERR,value);
    } else if (strcmp(keyword,"tptask") == 0) {
      _offload_threads = force->inumeric(FLERR,value);
    } else if (strcmp(keyword,"lrt") == 0) {
      if (strcmp(value,"yes") == 0) _lrt = 1;
      else if (strcmp(value,"no") == 0) _lrt = 0;
      else error->all(FLERR,"Illegal package intel command");
    } else if (strcmp(keyword,"buffers") == 0) {               // undocumented
      _allow_separate_buffers = force->inumeric(FLERR,value);
    } else {
      error->all(FLERR,"Illegal package intel command");
    }
    iarg += 2;
  }

  // if ncops is zero, just run on the cpu

  if (ncops < 1) {
    ncops = -1;
    _offload_balance = 0.0;
  }

  // if using LRT mode, create the integrate style

  if (_lrt) {
    char *str = (char *) "verlet/lrt/intel";
    update->create_integrate(1,&str,0);
  }

  // error check

  if (_offload_balance > 1.0 || _offload_threads < 0 ||
      _offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0)
    error->all(FLERR,"Illegal package intel command");

  #ifdef _LMP_INTEL_OFFLOAD
  _ncops = ncops;
  if (_offload_balance != 0.0) {
    _real_space_comm = MPI_COMM_WORLD;
    if (no_affinity == 0)
      if (set_host_affinity(nomp) != 0)
        error->all(FLERR,"Could not set host affinity for offload tasks");
  }

  int max_offload_threads = 0, offload_cores = 0;
  if (_offload_balance != 0.0) {
    // query core and thread counts on the coprocessor itself
    #pragma offload target(mic:_cop) mandatory \
      out(max_offload_threads,offload_cores)
    {
      offload_cores = omp_get_num_procs();
      omp_set_num_threads(offload_cores);
      max_offload_threads = omp_get_max_threads();
      #ifdef __AVX512F__
      if ( (offload_cores / 4) % 2 == 1) {
        offload_cores += 4;
        max_offload_threads += 4;
      }
      #endif
    }
    _max_offload_threads = max_offload_threads;
    _offload_cores = offload_cores;
    if (_offload_threads == 0) _offload_threads = offload_cores;
    if (_offload_cores > 244 && _offload_tpc > 2)
      _offload_tpc = 2;
  }
  #endif

  // set OpenMP threads
  // nomp is user setting, default = 0

  #if defined(_OPENMP)
  #if defined(__INTEL_COMPILER)
  kmp_set_blocktime(0);
  #endif
  if (nomp != 0) {
    omp_set_num_threads(nomp);
    comm->nthreads = nomp;
  } else {
    // no user setting: record the team size OpenMP actually provides
    int nthreads;
    #pragma omp parallel default(none) shared(nthreads)
    nthreads = omp_get_num_threads();
    comm->nthreads = nthreads;
  }
  #endif

  // set offload params

  #ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance < 0.0) {
    // negative balance: start from 0.9; add_off_results() adjusts it later
    _balance_neighbor = 0.9;
    _balance_pair = 0.9;
  } else {
    _balance_neighbor = _offload_balance;
    _balance_pair = _offload_balance;
  }

  _tscreen = screen;
  zero_timers();
  _setup_time_cleared = false;
  _timers_allocated = false;

  #else
  _offload_balance = 0.0;
  #endif

  // set precision: only the buffer object for the chosen mode is created

  if (_precision_mode == PREC_MODE_SINGLE)
    _single_buffers = new IntelBuffers<float,float>(lmp);
  else if (_precision_mode == PREC_MODE_MIXED)
    _mixed_buffers = new IntelBuffers<float,double>(lmp);
  else
    _double_buffers = new IntelBuffers<double,double>(lmp);
}
/* ---------------------------------------------------------------------- */
// Destructor: with offload support, print the timing report and release the
// coprocessor-side timer/overflow allocations; then delete the one buffer
// object that matches the precision mode.
FixIntel::~FixIntel()
{
  #ifdef _LMP_INTEL_OFFLOAD
  output_timing_data();
  if (_timers_allocated) {
    double *time1 = off_watch_pair();
    double *time2 = off_watch_neighbor();
    int *overflow = get_off_overflow_flag();
    const bool have_pointers =
      (time1 != NULL) && (time2 != NULL) && (overflow != NULL);
    if (_offload_balance != 0.0 && have_pointers) {
      // alloc_if(0) free_if(1): no data moves, the target copies are freed
      #pragma offload_transfer target(mic:_cop) \
        nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
    }
  }
  #endif

  if (_precision_mode == PREC_MODE_SINGLE) {
    delete _single_buffers;
  } else if (_precision_mode == PREC_MODE_MIXED) {
    delete _mixed_buffers;
  } else {
    delete _double_buffers;
  }
}
/* ---------------------------------------------------------------------- */
// Return the set of fix callbacks this fix takes part in: pre_reverse
// always (run and minimize variants), post_force only in offload builds.
int FixIntel::setmask()
{
  int mask = PRE_REVERSE | MIN_PRE_REVERSE;
  #ifdef _LMP_INTEL_OFFLOAD
  mask |= POST_FORCE | MIN_POST_FORCE;
  #endif
  return mask;
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
init
()
{
#ifdef _LMP_INTEL_OFFLOAD
output_timing_data
();
_sync_mode
=
0
;
if
(
offload_balance
()
!=
0.0
)
{
if
(
offload_noghost
()
||
force
->
newton_pair
==
0
)
_sync_mode
=
2
;
else
_sync_mode
=
1
;
if
(
update
->
whichflag
==
2
)
_sync_mode
=
1
;
}
#endif
int
nstyles
=
0
;
if
(
force
->
pair_match
(
"hybrid"
,
1
)
!=
NULL
)
{
PairHybrid
*
hybrid
=
(
PairHybrid
*
)
force
->
pair
;
for
(
int
i
=
0
;
i
<
hybrid
->
nstyles
;
i
++
)
if
(
strstr
(
hybrid
->
keywords
[
i
],
"/intel"
)
!=
NULL
)
nstyles
++
;
}
else
if
(
force
->
pair_match
(
"hybrid/overlay"
,
1
)
!=
NULL
)
{
PairHybridOverlay
*
hybrid
=
(
PairHybridOverlay
*
)
force
->
pair
;
for
(
int
i
=
0
;
i
<
hybrid
->
nstyles
;
i
++
)
if
(
strstr
(
hybrid
->
keywords
[
i
],
"/intel"
)
!=
NULL
)
nstyles
++
;
else
force
->
pair
->
no_virial_fdotr_compute
=
1
;
}
if
(
nstyles
>
1
)
error
->
all
(
FLERR
,
"Currently, cannot use more than one intel style with hybrid."
);
check_neighbor_intel
();
int
off_mode
=
0
;
if
(
_offload_balance
!=
0.0
)
off_mode
=
1
;
if
(
_precision_mode
==
PREC_MODE_SINGLE
)
{
_single_buffers
->
zero_ev
();
_single_buffers
->
grow_ncache
(
off_mode
,
_nthreads
);
}
else
if
(
_precision_mode
==
PREC_MODE_MIXED
)
{
_mixed_buffers
->
zero_ev
();
_mixed_buffers
->
grow_ncache
(
off_mode
,
_nthreads
);
}
else
{
_double_buffers
->
zero_ev
();
_double_buffers
->
grow_ncache
(
off_mode
,
_nthreads
);
}
_need_reduce
=
0
;
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
setup
(
int
vflag
)
{
if
(
neighbor
->
style
!=
BIN
)
error
->
all
(
FLERR
,
"Currently, neighbor style BIN must be used with Intel package."
);
if
(
neighbor
->
exclude_setting
()
!=
0
)
error
->
all
(
FLERR
,
"Currently, cannot use neigh_modify exclude with Intel package."
);
if
(
vflag_atom
)
error
->
all
(
FLERR
,
"Cannot currently get per-atom virials with Intel package."
);
#ifdef _LMP_INTEL_OFFLOAD
post_force
(
vflag
);
#endif
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
setup_pre_reverse
(
int
eflag
,
int
vflag
)
{
pre_reverse
(
eflag
,
vflag
);
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
pair_init_check
(
const
bool
cdmessage
)
{
#ifdef INTEL_VMASK
atom
->
sortfreq
=
1
;
#endif
_nbor_pack_width
=
1
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
_offload_balance
!=
0.0
)
atom
->
sortfreq
=
1
;
_offload_noghost
=
0
;
if
(
force
->
newton_pair
&&
_offload_ghost
==
0
)
_offload_noghost
=
1
;
set_offload_affinity
();
if
(
!
_timers_allocated
)
{
double
*
time1
=
off_watch_pair
();
double
*
time2
=
off_watch_neighbor
();
int
*
overflow
=
get_off_overflow_flag
();
if
(
_offload_balance
!=
0.0
&&
time1
!=
NULL
&&
time2
!=
NULL
&&
overflow
!=
NULL
)
{
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
in(overflow:length(5) alloc_if(1) free_if(0))
}
_timers_allocated
=
true
;
}
#endif
_nthreads
=
comm
->
nthreads
;
if
(
_offload_balance
!=
0.0
&&
comm
->
me
==
0
)
{
#ifndef __INTEL_COMPILER_BUILD_DATE
error
->
warning
(
FLERR
,
"Unknown Intel Compiler Version
\n
"
);
#else
if
(
__INTEL_COMPILER_BUILD_DATE
!=
20131008
&&
__INTEL_COMPILER_BUILD_DATE
<
20141023
)
error
->
warning
(
FLERR
,
"Unsupported Intel Compiler."
);
#endif
#if !defined(__INTEL_COMPILER)
error
->
warning
(
FLERR
,
"Unsupported Intel Compiler."
);
#endif
}
int
need_tag
=
0
;
if
(
atom
->
molecular
)
need_tag
=
1
;
// Clear buffers used for pair style
char
kmode
[
80
];
if
(
_precision_mode
==
PREC_MODE_SINGLE
)
{
strcpy
(
kmode
,
"single"
);
get_single_buffers
()
->
need_tag
(
need_tag
);
}
else
if
(
_precision_mode
==
PREC_MODE_MIXED
)
{
strcpy
(
kmode
,
"mixed"
);
get_mixed_buffers
()
->
need_tag
(
need_tag
);
}
else
{
strcpy
(
kmode
,
"double"
);
get_double_buffers
()
->
need_tag
(
need_tag
);
}
#ifdef _LMP_INTEL_OFFLOAD
set_offload_affinity
();
#endif
if
(
comm
->
me
==
0
)
{
if
(
screen
)
{
fprintf
(
screen
,
"----------------------------------------------------------
\n
"
);
if
(
_offload_balance
!=
0.0
)
{
fprintf
(
screen
,
"Using Intel Coprocessor with %d threads per core, "
,
_offload_tpc
);
fprintf
(
screen
,
"%d threads per task
\n
"
,
_offload_threads
);
}
else
{
fprintf
(
screen
,
"Using Intel Package without Coprocessor.
\n
"
);
}
fprintf
(
screen
,
"Precision: %s
\n
"
,
kmode
);
if
(
cdmessage
)
{
#ifdef LMP_USE_AVXCD
fprintf
(
screen
,
"AVX512 CD Optimizations: Enabled
\n
"
);
#else
fprintf
(
screen
,
"AVX512 CD Optimizations: Disabled
\n
"
);
#endif
}
fprintf
(
screen
,
"----------------------------------------------------------
\n
"
);
}
}
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
bond_init_check
()
{
if
(
_offload_balance
!=
0.0
&&
atom
->
molecular
&&
force
->
newton_pair
!=
force
->
newton_bond
)
error
->
all
(
FLERR
,
"USER-INTEL package requires same setting for newton bond and non-bond."
);
int
intel_pair
=
0
;
if
(
force
->
pair_match
(
"/intel"
,
0
)
!=
NULL
)
intel_pair
=
1
;
else
if
(
force
->
pair_match
(
"hybrid"
,
1
)
!=
NULL
)
{
PairHybrid
*
hybrid
=
(
PairHybrid
*
)
force
->
pair
;
for
(
int
i
=
0
;
i
<
hybrid
->
nstyles
;
i
++
)
if
(
strstr
(
hybrid
->
keywords
[
i
],
"/intel"
)
!=
NULL
)
intel_pair
=
1
;
}
else
if
(
force
->
pair_match
(
"hybrid/overlay"
,
1
)
!=
NULL
)
{
PairHybridOverlay
*
hybrid
=
(
PairHybridOverlay
*
)
force
->
pair
;
for
(
int
i
=
0
;
i
<
hybrid
->
nstyles
;
i
++
)
if
(
strstr
(
hybrid
->
keywords
[
i
],
"/intel"
)
!=
NULL
)
intel_pair
=
1
;
}
if
(
intel_pair
==
0
)
error
->
all
(
FLERR
,
"Intel styles for bond/angle/dihedral/improper "
"require intel pair style."
);
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
kspace_init_check
()
{
int
intel_pair
=
0
;
if
(
force
->
pair_match
(
"/intel"
,
0
)
!=
NULL
)
intel_pair
=
1
;
else
if
(
force
->
pair_match
(
"hybrid"
,
1
)
!=
NULL
)
{
PairHybrid
*
hybrid
=
(
PairHybrid
*
)
force
->
pair
;
for
(
int
i
=
0
;
i
<
hybrid
->
nstyles
;
i
++
)
if
(
strstr
(
hybrid
->
keywords
[
i
],
"/intel"
)
!=
NULL
)
intel_pair
=
1
;
}
else
if
(
force
->
pair_match
(
"hybrid/overlay"
,
1
)
!=
NULL
)
{
PairHybridOverlay
*
hybrid
=
(
PairHybridOverlay
*
)
force
->
pair
;
for
(
int
i
=
0
;
i
<
hybrid
->
nstyles
;
i
++
)
if
(
strstr
(
hybrid
->
keywords
[
i
],
"/intel"
)
!=
NULL
)
intel_pair
=
1
;
}
if
(
intel_pair
==
0
)
error
->
all
(
FLERR
,
"Intel styles for kspace require intel pair style."
);
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
check_neighbor_intel
()
{
#ifdef _LMP_INTEL_OFFLOAD
_full_host_list
=
0
;
#endif
const
int
nrequest
=
neighbor
->
nrequest
;
for
(
int
i
=
0
;
i
<
nrequest
;
++
i
)
{
#ifdef _LMP_INTEL_OFFLOAD
if
(
_offload_balance
!=
0.0
&&
neighbor
->
requests
[
i
]
->
intel
==
0
)
{
_full_host_list
=
1
;
_offload_noghost
=
0
;
}
#endif
if
(
neighbor
->
requests
[
i
]
->
skip
)
error
->
all
(
FLERR
,
"Cannot yet use hybrid styles with Intel package."
);
}
}
/* ---------------------------------------------------------------------- */
// Add pending host-side results into the LAMMPS arrays.  At most one of the
// mixed/double/single force pointers is handled (checked in that order):
// per-thread copies are reduced first when _need_reduce is set, the results
// are added, and the pointer is cleared.  In offload builds with
// _sync_mode == 1 the coprocessor results are collected afterwards.
void FixIntel::pre_reverse(int eflag, int vflag)
{
  if (_force_array_m != 0) {
    // mixed precision: double accumulators
    if (_need_reduce) {
      reduce_results(&(_force_array_m->x));
      _need_reduce = 0;
    }
    add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,
                0);
    _force_array_m = 0;
  } else if (_force_array_d != 0) {
    // double precision
    if (_need_reduce) {
      reduce_results(&(_force_array_d->x));
      _need_reduce = 0;
    }
    add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,
                0);
    _force_array_d = 0;
  } else if (_force_array_s != 0) {
    // single precision
    if (_need_reduce) {
      reduce_results(&(_force_array_s->x));
      _need_reduce = 0;
    }
    add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,
                0);
    _force_array_s = 0;
  }

  #ifdef _LMP_INTEL_OFFLOAD
  if (_sync_mode == 1) {
    sync_coprocessor();
  }
  #endif
}
/* ---------------------------------------------------------------------- */
template
<
class
acc_t
>
void
FixIntel
::
reduce_results
(
acc_t
*
_noalias
const
f_scalar
)
{
int
o_range
,
f_stride
;
if
(
force
->
newton_pair
)
o_range
=
atom
->
nlocal
+
atom
->
nghost
;
else
o_range
=
atom
->
nlocal
;
IP_PRE_get_stride
(
f_stride
,
o_range
,
(
sizeof
(
acc_t
)
*
4
),
lmp
->
atom
->
torque
);
o_range
*=
4
;
const
int
f_stride4
=
f_stride
*
4
;
if
(
_nthreads
<=
INTEL_HTHREADS
)
{
acc_t
*
f_scalar2
=
f_scalar
+
f_stride4
;
if
(
_nthreads
==
4
)
{
acc_t
*
f_scalar3
=
f_scalar2
+
f_stride4
;
acc_t
*
f_scalar4
=
f_scalar3
+
f_stride4
;
_use_simd_pragma
(
"vector aligned"
)
_use_simd_pragma
(
"simd"
)
for
(
int
n
=
0
;
n
<
o_range
;
n
++
)
f_scalar
[
n
]
+=
f_scalar2
[
n
]
+
f_scalar3
[
n
]
+
f_scalar4
[
n
];
}
else
if
(
_nthreads
==
2
)
{
_use_simd_pragma
(
"vector aligned"
)
_use_simd_pragma
(
"simd"
)
for
(
int
n
=
0
;
n
<
o_range
;
n
++
)
f_scalar
[
n
]
+=
f_scalar2
[
n
];
}
else
{
acc_t
*
f_scalar3
=
f_scalar2
+
f_stride4
;
_use_simd_pragma
(
"vector aligned"
)
_use_simd_pragma
(
"simd"
)
for
(
int
n
=
0
;
n
<
o_range
;
n
++
)
f_scalar
[
n
]
+=
f_scalar2
[
n
]
+
f_scalar3
[
n
];
}
}
else
{
#if defined(_OPENMP)
#pragma omp parallel
#endif
{
int
iifrom
,
iito
,
tid
;
IP_PRE_omp_range_id_align
(
iifrom
,
iito
,
tid
,
o_range
,
_nthreads
,
sizeof
(
acc_t
));
acc_t
*
f_scalar2
=
f_scalar
+
f_stride4
;
for
(
int
t
=
1
;
t
<
_nthreads
;
t
++
)
{
_use_simd_pragma
(
"vector aligned"
)
_use_simd_pragma
(
"simd"
)
for
(
int
n
=
iifrom
;
n
<
iito
;
n
++
)
f_scalar
[
n
]
+=
f_scalar2
[
n
];
f_scalar2
+=
f_stride4
;
}
}
}
}
/* ---------------------------------------------------------------------- */
// Offload builds only: when offload is active, add the pending coprocessor
// results for whichever precision has an outstanding force pointer
// (mixed, then double, then single) and clear that pointer.
void FixIntel::sync_coprocessor()
{
  #ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance == 0.0) return;

  if (_off_force_array_m != 0) {
    add_off_results(_off_force_array_m, _off_ev_array_d);
    _off_force_array_m = 0;
  } else if (_off_force_array_d != 0) {
    add_off_results(_off_force_array_d, _off_ev_array_d);
    _off_force_array_d = 0;
  } else if (_off_force_array_s != 0) {
    add_off_results(_off_force_array_s, _off_ev_array_s);
    _off_force_array_s = 0;
  }
  #endif
}
/* ---------------------------------------------------------------------- */
// Choose which atom range of f_in to add into the LAMMPS arrays and hand it
// to add_oresults().  The time spent is recorded under TIME_PACK.
//   f_in       packed force results
//   ev_global  global energy/virial accumulators
//   eatom      nonzero to also add per-atom energies
//   vatom      passed through to add_oresults()
//   offload    nonzero when f_in holds coprocessor results
template <class ft, class acc_t>
void FixIntel::add_results(const ft * _noalias const f_in,
                           const acc_t * _noalias const ev_global,
                           const int eatom, const int vatom,
                           const int offload)
{
  start_watch(TIME_PACK);

  #ifdef _LMP_INTEL_OFFLOAD
  if (_separate_buffers) {
    if (force->newton_pair) {
      // local and ghost results are added in two passes; the global
      // accumulators are only passed with the local pass
      const int local_first = offload ? 0 : _host_min_local;
      const int local_count = offload ? _offload_nlocal : _host_used_local;
      const int ghost_first = offload ? _offload_min_ghost : _host_min_ghost;
      const int ghost_count = offload ? _offload_nghost : _host_used_ghost;

      add_oresults(f_in, ev_global, eatom, vatom, local_first, local_count);

      const acc_t * _noalias const enull = 0;
      int offset = local_count;
      if (atom->torque) offset *= 2;   // two f_in entries per atom
      add_oresults(f_in + offset, enull, eatom, vatom,
                   ghost_first, ghost_count);
    } else if (offload) {
      add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
    } else {
      const int first = host_start_pair();
      add_oresults(f_in, ev_global, eatom, vatom, first,
                   atom->nlocal - first);
    }
    stop_watch(TIME_PACK);
    return;
  }

  // shared buffers: one contiguous range
  int first = 0;
  int count;
  if (force->newton_pair) {
    count = atom->nlocal;
    // ghosts are left out only for offload results computed without ghosts
    if (!offload || _offload_noghost == 0) count += atom->nghost;
  } else if (offload) {
    count = offload_end_pair();
  } else {
    first = host_start_pair();
    count = atom->nlocal - first;
  }
  add_oresults(f_in, ev_global, eatom, vatom, first, count);
  #else
  int count = atom->nlocal;
  if (force->newton_pair) count += atom->nghost;
  add_oresults(f_in, ev_global, eatom, vatom, 0, count);
  #endif

  stop_watch(TIME_PACK);
}
/* ---------------------------------------------------------------------- */
template
<
class
ft
,
class
acc_t
>
void
FixIntel
::
add_oresults
(
const
ft
*
_noalias
const
f_in
,
const
acc_t
*
_noalias
const
ev_global
,
const
int
eatom
,
const
int
vatom
,
const
int
out_offset
,
const
int
nall
)
{
lmp_ft
*
_noalias
const
f
=
(
lmp_ft
*
)
lmp
->
atom
->
f
[
0
]
+
out_offset
;
if
(
atom
->
torque
)
{
if
(
f_in
[
1
].
w
)
if
(
f_in
[
1
].
w
==
1
)
error
->
all
(
FLERR
,
"Bad matrix inversion in mldivide3"
);
else
error
->
all
(
FLERR
,
"Sphere particles not yet supported for gayberne/intel"
);
}
int
packthreads
;
if
(
_nthreads
>
INTEL_HTHREADS
)
packthreads
=
_nthreads
;
else
packthreads
=
1
;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
#if defined(_OPENMP)
const
int
tid
=
omp_get_thread_num
();
#else
const
int
tid
=
0
;
#endif
int
ifrom
,
ito
;
IP_PRE_omp_range_align
(
ifrom
,
ito
,
tid
,
nall
,
packthreads
,
sizeof
(
acc_t
));
if
(
atom
->
torque
)
{
int
ii
=
ifrom
*
2
;
lmp_ft
*
_noalias
const
tor
=
(
lmp_ft
*
)
lmp
->
atom
->
torque
[
0
]
+
out_offset
;
if
(
eatom
)
{
double
*
_noalias
const
lmp_eatom
=
force
->
pair
->
eatom
+
out_offset
;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
{
f
[
i
].
x
+=
f_in
[
ii
].
x
;
f
[
i
].
y
+=
f_in
[
ii
].
y
;
f
[
i
].
z
+=
f_in
[
ii
].
z
;
lmp_eatom
[
i
]
+=
f_in
[
ii
].
w
;
tor
[
i
].
x
+=
f_in
[
ii
+
1
].
x
;
tor
[
i
].
y
+=
f_in
[
ii
+
1
].
y
;
tor
[
i
].
z
+=
f_in
[
ii
+
1
].
z
;
ii
+=
2
;
}
}
else
{
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
{
f
[
i
].
x
+=
f_in
[
ii
].
x
;
f
[
i
].
y
+=
f_in
[
ii
].
y
;
f
[
i
].
z
+=
f_in
[
ii
].
z
;
tor
[
i
].
x
+=
f_in
[
ii
+
1
].
x
;
tor
[
i
].
y
+=
f_in
[
ii
+
1
].
y
;
tor
[
i
].
z
+=
f_in
[
ii
+
1
].
z
;
ii
+=
2
;
}
}
}
else
{
if
(
eatom
)
{
double
*
_noalias
const
lmp_eatom
=
force
->
pair
->
eatom
+
out_offset
;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
{
f
[
i
].
x
+=
f_in
[
i
].
x
;
f
[
i
].
y
+=
f_in
[
i
].
y
;
f
[
i
].
z
+=
f_in
[
i
].
z
;
lmp_eatom
[
i
]
+=
f_in
[
i
].
w
;
}
}
else
{
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
{
f
[
i
].
x
+=
f_in
[
i
].
x
;
f
[
i
].
y
+=
f_in
[
i
].
y
;
f
[
i
].
z
+=
f_in
[
i
].
z
;
}
}
}
}
if
(
ev_global
!=
NULL
)
{
force
->
pair
->
eng_vdwl
+=
ev_global
[
0
];
force
->
pair
->
eng_coul
+=
ev_global
[
1
];
force
->
pair
->
virial
[
0
]
+=
ev_global
[
2
];
force
->
pair
->
virial
[
1
]
+=
ev_global
[
3
];
force
->
pair
->
virial
[
2
]
+=
ev_global
[
4
];
force
->
pair
->
virial
[
3
]
+=
ev_global
[
5
];
force
->
pair
->
virial
[
4
]
+=
ev_global
[
6
];
force
->
pair
->
virial
[
5
]
+=
ev_global
[
7
];
}
}
/* ---------------------------------------------------------------------- */
// Report the memory usage of the buffer object for the active precision
// mode, queried with the current thread count.
double FixIntel::memory_usage()
{
  if (_precision_mode == PREC_MODE_SINGLE)
    return _single_buffers->memory_usage(_nthreads);
  if (_precision_mode == PREC_MODE_MIXED)
    return _mixed_buffers->memory_usage(_nthreads);
  return _double_buffers->memory_usage(_nthreads);
}
/* ---------------------------------------------------------------------- */
#ifdef _LMP_INTEL_OFFLOAD
/* ---------------------------------------------------------------------- */
// Collect coprocessor results here only when init() chose _sync_mode 2;
// vflag is not used.
void FixIntel::post_force(int vflag)
{
  if (_sync_mode != 2) return;
  sync_coprocessor();
}
/* ---------------------------------------------------------------------- */
template
<
class
ft
,
class
acc_t
>
void
FixIntel
::
add_off_results
(
const
ft
*
_noalias
const
f_in
,
const
acc_t
*
_noalias
const
ev_global
)
{
if
(
_offload_balance
<
0.0
)
_balance_other_time
=
MPI_Wtime
()
-
_balance_other_time
;
start_watch
(
TIME_OFFLOAD_WAIT
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
neighbor
->
ago
==
0
)
{
#pragma offload_wait target(mic:_cop) wait(atom->tag,f_in)
}
else
{
#pragma offload_wait target(mic:_cop) wait(f_in)
}
#endif
double
wait_time
=
stop_watch
(
TIME_OFFLOAD_WAIT
);
int
nlocal
=
atom
->
nlocal
;
if
(
neighbor
->
ago
==
0
)
{
if
(
_off_overflow_flag
[
LMP_OVERFLOW
])
error
->
one
(
FLERR
,
"Neighbor list overflow, boost neigh_modify one"
);
_offload_nlocal
=
_off_overflow_flag
[
LMP_LOCAL_MAX
]
+
1
;
_offload_min_ghost
=
_off_overflow_flag
[
LMP_GHOST_MIN
];
_offload_nghost
=
_off_overflow_flag
[
LMP_GHOST_MAX
]
+
1
-
_offload_min_ghost
;
if
(
_offload_nghost
<
0
)
_offload_nghost
=
0
;
_offload_nall
=
_offload_nlocal
+
_offload_nghost
;
_offload_nlocal
;
}
if
(
atom
->
torque
)
if
(
f_in
[
1
].
w
<
0.0
)
error
->
all
(
FLERR
,
"Bad matrix inversion in mldivide3"
);
add_results
(
f_in
,
ev_global
,
_off_results_eatom
,
_off_results_vatom
,
1
);
// Load balance?
if
(
_offload_balance
<
0.0
)
{
if
(
neighbor
->
ago
==
0
)
_balance_pair
=
_balance_neighbor
;
double
mic_time
;
mic_time
=
*
_stopwatch_offload_pair
;
if
(
_balance_pair_time
+
_balance_other_time
<
mic_time
)
{
double
ft
=
_balance_pair_time
+
_balance_other_time
+
wait_time
-
mic_time
;
_balance_fixed
=
(
1.0
-
INTEL_LB_MEAN_WEIGHT
)
*
_balance_fixed
+
INTEL_LB_MEAN_WEIGHT
*
ft
;
}
double
ctps
=
_balance_pair_time
/
(
1.0
-
_balance_pair
);
double
otps
=
mic_time
/
_balance_pair
;
double
new_balance
=
(
ctps
+
_balance_other_time
-
_balance_fixed
)
/
(
otps
+
ctps
);
_balance_neighbor
=
(
1.0
-
INTEL_LB_MEAN_WEIGHT
)
*
_balance_neighbor
+
INTEL_LB_MEAN_WEIGHT
*
new_balance
;
}
#ifdef TIME_BALANCE
start_watch
(
TIME_IMBALANCE
);
MPI_Barrier
(
_real_space_comm
);
stop_watch
(
TIME_IMBALANCE
);
#endif
acc_timers
();
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
output_timing_data
()
{
if
(
_im_real_space_task
==
0
||
_offload_affinity_set
==
0
)
return
;
double
timer_total
=
0.0
;
int
size
,
rank
;
double
timers
[
NUM_ITIMERS
];
MPI_Comm_size
(
_real_space_comm
,
&
size
);
MPI_Comm_rank
(
_real_space_comm
,
&
rank
);
MPI_Allreduce
(
&
_timers
,
&
timers
,
NUM_ITIMERS
,
MPI_DOUBLE
,
MPI_SUM
,
_real_space_comm
);
for
(
int
i
=
0
;
i
<
NUM_ITIMERS
;
i
++
)
{
timers
[
i
]
/=
size
;
timer_total
+=
timers
[
i
];
}
#ifdef TIME_BALANCE
double
timers_min
[
NUM_ITIMERS
],
timers_max
[
NUM_ITIMERS
];
MPI_Allreduce
(
&
_timers
,
&
timers_max
,
NUM_ITIMERS
,
MPI_DOUBLE
,
MPI_MAX
,
_real_space_comm
);
MPI_Allreduce
(
&
_timers
,
&
timers_min
,
NUM_ITIMERS
,
MPI_DOUBLE
,
MPI_MIN
,
_real_space_comm
);
#endif
if
(
timer_total
>
0.0
)
{
double
balance_out
[
2
],
balance_in
[
2
];
balance_out
[
0
]
=
_balance_pair
;
balance_out
[
1
]
=
_balance_neighbor
;
MPI_Reduce
(
balance_out
,
balance_in
,
2
,
MPI_DOUBLE
,
MPI_SUM
,
0
,
_real_space_comm
);
balance_in
[
0
]
/=
size
;
balance_in
[
1
]
/=
size
;
if
(
rank
==
0
&&
_tscreen
)
{
fprintf
(
_tscreen
,
"
\n
------------------------------------------------
\n
"
);
fprintf
(
_tscreen
,
" Offload Timing Data
\n
"
);
fprintf
(
_tscreen
,
"------------------------------------------------
\n
"
);
fprintf
(
_tscreen
,
" Data Pack/Cast Seconds %f
\n
"
,
timers
[
TIME_PACK
]);
if
(
_offload_balance
!=
0.0
)
{
fprintf
(
_tscreen
,
" Host Neighbor Seconds %f
\n
"
,
timers
[
TIME_HOST_NEIGHBOR
]);
fprintf
(
_tscreen
,
" Host Pair Seconds %f
\n
"
,
timers
[
TIME_HOST_PAIR
]);
fprintf
(
_tscreen
,
" Offload Neighbor Seconds %f
\n
"
,
timers
[
TIME_OFFLOAD_NEIGHBOR
]);
fprintf
(
_tscreen
,
" Offload Pair Seconds %f
\n
"
,
timers
[
TIME_OFFLOAD_PAIR
]);
fprintf
(
_tscreen
,
" Offload Wait Seconds %f
\n
"
,
timers
[
TIME_OFFLOAD_WAIT
]);
fprintf
(
_tscreen
,
" Offload Latency Seconds %f
\n
"
,
timers
[
TIME_OFFLOAD_LATENCY
]);
fprintf
(
_tscreen
,
" Offload Neighbor Balance %f
\n
"
,
balance_in
[
1
]);
fprintf
(
_tscreen
,
" Offload Pair Balance %f
\n
"
,
balance_in
[
0
]);
fprintf
(
_tscreen
,
" Offload Ghost Atoms "
);
if
(
_offload_noghost
)
fprintf
(
_tscreen
,
"No
\n
"
);
else
fprintf
(
_tscreen
,
"Yes
\n
"
);
#ifdef TIME_BALANCE
fprintf
(
_tscreen
,
" Offload Imbalance Seconds %f
\n
"
,
timers
[
TIME_IMBALANCE
]);
fprintf
(
_tscreen
,
" Offload Min/Max Seconds "
);
for
(
int
i
=
0
;
i
<
NUM_ITIMERS
;
i
++
)
fprintf
(
_tscreen
,
"[%f, %f] "
,
timers_min
[
i
],
timers_max
[
i
]);
fprintf
(
_tscreen
,
"
\n
"
);
#endif
double
ht
=
timers
[
TIME_HOST_NEIGHBOR
]
+
timers
[
TIME_HOST_PAIR
]
+
timers
[
TIME_OFFLOAD_WAIT
];
double
ct
=
timers
[
TIME_OFFLOAD_NEIGHBOR
]
+
timers
[
TIME_OFFLOAD_PAIR
];
double
tt
=
MAX
(
ht
,
ct
);
if
(
timers
[
TIME_OFFLOAD_LATENCY
]
/
tt
>
0.07
&&
_separate_coi
==
0
)
error
->
warning
(
FLERR
,
"Leaving a core free can improve performance for offload"
);
}
fprintf
(
_tscreen
,
"------------------------------------------------
\n
"
);
}
zero_timers
();
_setup_time_cleared
=
false
;
}
}
/* ---------------------------------------------------------------------- */
// Count the MPI tasks in _real_space_comm that report the same processor
// name as this task.
//   node_rank  (out) number of such tasks with a lower rank than this one
//   returns    number of tasks sharing this processor name, including self
//
// The gathered name table is now freed before returning; it was leaked on
// every call before.
int FixIntel::get_ppn(int &node_rank)
{
  int nprocs;
  int rank;
  MPI_Comm_size(_real_space_comm, &nprocs);
  MPI_Comm_rank(_real_space_comm, &rank);

  int name_length;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  MPI_Get_processor_name(node_name, &name_length);
  node_name[name_length] = '\0';

  // fixed-width table with one processor name per rank
  char *node_names = new char[MPI_MAX_PROCESSOR_NAME * nprocs];
  MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);

  int ppn = 0;
  node_rank = 0;
  for (int i = 0; i < nprocs; i++) {
    if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
      ppn++;
      if (i < rank) node_rank++;
    }
  }

  delete [] node_names;
  return ppn;
}
/* ---------------------------------------------------------------------- */
void
FixIntel
::
set_offload_affinity
()
{
_separate_buffers
=
0
;
if
(
_allow_separate_buffers
)
if
(
_offload_balance
!=
0.0
&&
_offload_balance
<
1.0
)
_separate_buffers
=
1
;
_im_real_space_task
=
1
;
if
(
strncmp
(
update
->
integrate_style
,
"verlet/split"
,
12
)
==
0
)
{
_real_space_comm
=
world
;
if
(
universe
->
iworld
!=
0
)
{
_im_real_space_task
=
0
;
return
;
}
}
else
_real_space_comm
=
universe
->
uworld
;
if
(
_offload_balance
==
0.0
)
_cop
=
-
1
;
if
(
_offload_balance
==
0.0
||
_offload_affinity_set
==
1
)
return
;
_offload_affinity_set
=
1
;
int
node_rank
;
int
ppn
=
get_ppn
(
node_rank
);
if
(
ppn
%
_ncops
!=
0
)
error
->
all
(
FLERR
,
"MPI tasks per node must be multiple of offload_cards"
);
ppn
=
ppn
/
_ncops
;
_cop
=
node_rank
/
ppn
;
node_rank
=
node_rank
%
ppn
;
int
max_threads_per_task
=
_offload_cores
/
4
*
_offload_tpc
/
ppn
;
if
(
_offload_threads
>
max_threads_per_task
)
_offload_threads
=
max_threads_per_task
;
if
(
_offload_threads
>
_max_offload_threads
)
_offload_threads
=
_max_offload_threads
;
int
offload_threads
=
_offload_threads
;
int
offload_tpc
=
_offload_tpc
;
int
offload_affinity_balanced
=
_offload_affinity_balanced
;
int
offload_cores
=
_offload_cores
;
#pragma offload target(mic:_cop) mandatory \
in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced, \
offload_cores)
{
omp_set_num_threads
(
offload_threads
);
#pragma omp parallel
{
int
tnum
=
omp_get_thread_num
();
kmp_affinity_mask_t
mask
;
kmp_create_affinity_mask
(
&
mask
);
int
proc
=
offload_threads
*
node_rank
+
tnum
;
#ifdef __AVX512F__
proc
=
(
proc
/
offload_tpc
)
+
(
proc
%
offload_tpc
)
*
((
offload_cores
)
/
4
);
proc
+=
68
;
#else
if
(
offload_affinity_balanced
)
proc
=
proc
*
4
-
(
proc
/
60
)
*
240
+
proc
/
60
+
1
;
else
proc
+=
(
proc
/
4
)
*
(
4
-
offload_tpc
)
+
1
;
#endif
kmp_set_affinity_mask_proc
(
proc
,
&
mask
);
if
(
kmp_set_affinity
(
&
mask
)
!=
0
)
printf
(
"Could not set affinity on rank %d thread %d to %d
\n
"
,
node_rank
,
tnum
,
proc
);
}
}
if
(
_precision_mode
==
PREC_MODE_SINGLE
)
_single_buffers
->
set_off_params
(
offload_threads
,
_cop
,
_separate_buffers
);
else
if
(
_precision_mode
==
PREC_MODE_MIXED
)
_mixed_buffers
->
set_off_params
(
offload_threads
,
_cop
,
_separate_buffers
);
else
_double_buffers
->
set_off_params
(
offload_threads
,
_cop
,
_separate_buffers
);
}
/* ---------------------------------------------------------------------- */
int
FixIntel
::
set_host_affinity
(
const
int
nomp
)
{
#ifndef INTEL_OFFLOAD_NOAFFINITY
_separate_coi
=
1
;
int
rank
=
comm
->
me
;
int
node_rank
;
int
ppn
=
get_ppn
(
node_rank
);
int
cop
=
node_rank
/
(
ppn
/
_ncops
);
// Get a sorted list of logical cores
int
proc_list
[
INTEL_MAX_HOST_CORE_COUNT
];
int
ncores
;
FILE
*
p
;
char
cmd
[
512
];
char
readbuf
[
INTEL_MAX_HOST_CORE_COUNT
*
5
];
sprintf
(
cmd
,
"lscpu -p | grep -v '#' |"
"sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'"
);
p
=
popen
(
cmd
,
"r"
);
if
(
p
==
NULL
)
return
-
1
;
ncores
=
0
;
while
(
fgets
(
readbuf
,
512
,
p
))
{
proc_list
[
ncores
]
=
atoi
(
readbuf
);
ncores
++
;
}
pclose
(
p
);
// Sanity checks for core list
if
(
ncores
<
2
)
return
-
1
;
int
nzero
=
0
;
for
(
int
i
=
0
;
i
<
ncores
;
i
++
)
{
if
(
proc_list
[
i
]
==
0
)
nzero
++
;
if
(
proc_list
[
i
]
<
0
||
proc_list
[
i
]
>=
ncores
)
return
-
1
;
}
if
(
nzero
>
1
)
return
-
1
;
// Determine the OpenMP/MPI configuration
char
*
estring
;
int
nthreads
=
nomp
;
if
(
nthreads
==
0
)
{
estring
=
getenv
(
"OMP_NUM_THREADS"
);
if
(
estring
!=
NULL
)
{
nthreads
=
atoi
(
estring
);
if
(
nthreads
<
2
)
nthreads
=
1
;
}
else
nthreads
=
1
;
}
// Determine how many logical cores for COI and MPI tasks
int
coi_cores
=
0
,
mpi_cores
;
int
subscription
=
nthreads
*
ppn
;
if
(
subscription
>
ncores
)
{
if
(
rank
==
0
)
error
->
warning
(
FLERR
,
"More MPI tasks/OpenMP threads than available cores"
);
return
0
;
}
if
(
subscription
==
ncores
)
_separate_coi
=
0
;
if
(
subscription
>
ncores
/
2
)
{
coi_cores
=
ncores
-
subscription
;
if
(
coi_cores
>
INTEL_MAX_COI_CORES
)
coi_cores
=
INTEL_MAX_COI_CORES
;
}
mpi_cores
=
(
ncores
-
coi_cores
)
/
ppn
;
// Get ids of all LWPs that COI spawned and affinitize
int
lwp
=
0
,
plwp
=
0
,
nlwp
=
0
,
mlwp
=
0
,
fail
=
0
;
cpu_set_t
cpuset
;
pid_t
pid
=
getpid
();
if
(
coi_cores
)
{
sprintf
(
cmd
,
"ps -Lp %d -o lwp | awk ' (NR > 2) {print}'"
,
pid
);
p
=
popen
(
cmd
,
"r"
);
if
(
p
==
NULL
)
return
-
1
;
while
(
fgets
(
readbuf
,
512
,
p
))
{
lwp
=
atoi
(
readbuf
);
int
first
=
coi_cores
+
node_rank
*
mpi_cores
;
CPU_ZERO
(
&
cpuset
);
for
(
int
i
=
first
;
i
<
first
+
mpi_cores
;
i
++
)
CPU_SET
(
proc_list
[
i
],
&
cpuset
);
if
(
sched_setaffinity
(
lwp
,
sizeof
(
cpu_set_t
),
&
cpuset
))
{
fail
=
1
;
break
;
}
plwp
++
;
}
pclose
(
p
);
// Do async offload to create COI threads
int
sig1
,
sig2
;
float
*
buf1
;
int
pragma_size
=
1024
;
buf1
=
(
float
*
)
malloc
(
sizeof
(
float
)
*
pragma_size
);
#pragma offload target (mic:0) mandatory \
in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
signal(&sig1)
{
buf1
[
0
]
=
0.0
;
}
#pragma offload_wait target(mic:0) wait(&sig1)
#pragma offload target (mic:0) mandatory \
out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
signal(&sig2)
{
buf1
[
0
]
=
1.0
;
}
#pragma offload_wait target(mic:0) wait(&sig2)
free
(
buf1
);
p
=
popen
(
cmd
,
"r"
);
if
(
p
==
NULL
)
return
-
1
;
while
(
fgets
(
readbuf
,
512
,
p
))
{
lwp
=
atoi
(
readbuf
);
nlwp
++
;
if
(
nlwp
<=
plwp
)
continue
;
CPU_ZERO
(
&
cpuset
);
for
(
int
i
=
0
;
i
<
coi_cores
;
i
++
)
CPU_SET
(
proc_list
[
i
],
&
cpuset
);
if
(
sched_setaffinity
(
lwp
,
sizeof
(
cpu_set_t
),
&
cpuset
))
{
fail
=
1
;
break
;
}
}
pclose
(
p
);
nlwp
-=
plwp
;
// Get stats on the number of LWPs per process
MPI_Reduce
(
&
nlwp
,
&
mlwp
,
1
,
MPI_INT
,
MPI_MAX
,
0
,
MPI_COMM_WORLD
);
}
if
(
screen
&&
rank
==
0
)
{
if
(
coi_cores
)
fprintf
(
screen
,
"Intel Package: Affinitizing %d Offload Threads to %d Cores
\n
"
,
mlwp
,
coi_cores
);
fprintf
(
screen
,
"Intel Package: Affinitizing MPI Tasks to %d Cores Each
\n
"
,
mpi_cores
);
}
if
(
fail
)
return
-
1
;
// Affinitize MPI Ranks
CPU_ZERO
(
&
cpuset
);
int
first
=
coi_cores
+
node_rank
*
mpi_cores
;
for
(
int
i
=
first
;
i
<
first
+
mpi_cores
;
i
++
)
CPU_SET
(
proc_list
[
i
],
&
cpuset
);
if
(
sched_setaffinity
(
pid
,
sizeof
(
cpu_set_t
),
&
cpuset
))
return
-
1
;
#endif
return
0
;
}
#endif
Event Timeline
Log In to Comment