Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F87425892
intel_intrinsics_airebo.h
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Oct 12, 15:12
Size
75 KB
Mime Type
text/x-c++
Expires
Mon, Oct 14, 15:12 (2 d)
Engine
blob
Format
Raw Data
Handle
21486212
Attached To
rLAMMPS lammps
intel_intrinsics_airebo.h
View Options
#ifndef LMP_INTEL_AIREBO_SCALAR
# ifdef __INTEL_COMPILER
# if defined(__MIC__) || defined(__AVX512F__)
# define LMP_INTEL_AIREBO_512
# elif defined(__AVX__)
# define LMP_INTEL_AIREBO_256
# else
# define LMP_INTEL_AIREBO_SCALAR
# endif
# else
# define LMP_INTEL_AIREBO_SCALAR
# endif
#endif
#ifdef LMP_INTEL_AIREBO_512
#include <cassert>
#include <immintrin.h>
#define VEC_INLINE __attribute__((always_inline))
#ifndef FVEC_FIRST_PASS
# define FVEC_LEN 8
# define FVEC_SUFFIX(a) a##pd
# define FVEC_SUFFIX_MASK(a) a##pd_mask
# define FVEC_MASK_T __mmask8
# define FVEC_VEC_T __m512d
# define FVEC_SCAL_T double
# define IVEC_NAME ivec8
# define FVEC_NAME fvec8pd
# define BVEC_NAME bvec8
# define AVEC_NAME avec8pd
#else
# undef FVEC_LEN
# undef FVEC_SUFFIX
# undef FVEC_SUFFIX_MASK
# undef FVEC_MASK_T
# undef FVEC_VEC_T
# undef FVEC_SCAL_T
# undef IVEC_NAME
# undef FVEC_NAME
# undef BVEC_NAME
# undef AVEC_NAME
# define FVEC_LEN 16
# define FVEC_SUFFIX(a) a##ps
# define FVEC_SUFFIX_MASK(a) a##ps_mask
# define FVEC_MASK_T __mmask16
# define FVEC_VEC_T __m512
# define FVEC_SCAL_T float
# define IVEC_NAME ivec16
# define FVEC_NAME fvec16ps
# define BVEC_NAME bvec16
# define AVEC_NAME avec16ps
#endif
namespace
mm512
{
#ifndef __AVX512F__
#ifndef FVEC_FIRST_PASS
// Stand-in for _mm512_mask_expand_epi32, compiled only when __AVX512F__ is
// not defined. Spills `a` to an aligned scratch array and reloads it through
// _mm512_mask_loadunpacklo_epi32 with `src` as the pass-through operand.
VEC_INLINE static inline __m512i _mm512_mask_expand_epi32(
    __m512i src, __mmask16 k, __m512i a)
{
  int scratch[16] __attribute__((aligned(64)));
  _mm512_store_epi32(scratch, a);
  const __m512i expanded = _mm512_mask_loadunpacklo_epi32(src, k, scratch);
  return expanded;
}
// Zeroing variant of the expand emulation: same as the merge form but the
// pass-through operand is an all-zero vector.
VEC_INLINE static inline __m512i _mm512_maskz_expand_epi32(
    __mmask16 k, __m512i a)
{
  int scratch[16] __attribute__((aligned(64)));
  _mm512_store_epi32(scratch, a);
  const __m512i zeros = _mm512_setzero_epi32();
  return _mm512_mask_loadunpacklo_epi32(zeros, k, scratch);
}
// Stand-in for _mm512_mask_compress_epi32 (no __AVX512F__). The scratch array
// is first filled with `src`, then the lanes of `a` selected by `k` are
// pack-stored over it, and the result is reloaded.
VEC_INLINE static inline __m512i _mm512_mask_compress_epi32(
    __m512i src, __mmask16 k, __m512i a)
{
  int scratch[16] __attribute__((aligned(64)));
  _mm512_store_epi32(scratch, src);
  _mm512_mask_packstorelo_epi32(scratch, k, a);
  const __m512i packed = _mm512_load_epi32(scratch);
  return packed;
}
// Zeroing variant of the compress emulation: the scratch array starts out
// zero-initialised before the selected lanes of `a` are pack-stored into it.
VEC_INLINE static inline __m512i _mm512_maskz_compress_epi32(
    __mmask16 k, __m512i a)
{
  int scratch[16] __attribute__((aligned(64))) = {0};
  _mm512_mask_packstorelo_epi32(scratch, k, a);
  const __m512i packed = _mm512_load_epi32(scratch);
  return packed;
}
// Stand-in for _mm512_mask_compressstoreu_epi32 (no __AVX512F__): issues the
// packstorelo / packstorehi pair, the latter at an offset of 16 ints.
VEC_INLINE static inline void _mm512_mask_compressstoreu_epi32(
    int *dest, __mmask16 mask, __m512i src)
{
  int *const upper = dest + 16;
  _mm512_mask_packstorelo_epi32(dest, mask, src);
  _mm512_mask_packstorehi_epi32(upper, mask, src);
}
// Stand-in for _mm512_mask_loadu_epi32 (no __AVX512F__), built from the
// loadunpacklo / loadunpackhi pair. The assert restricts `k` to masks of the
// form 2^n - 1 (a contiguous run of set bits starting at bit 0).
VEC_INLINE static inline __m512i _mm512_mask_loadu_epi32(
    __m512i src, __mmask16 k, const int *mem_addr)
{
  assert((k & (k + 1)) == 0);
  const __m512i low_part = _mm512_mask_loadunpacklo_epi32(src, k, mem_addr);
  return _mm512_mask_loadunpackhi_epi32(low_part, k, mem_addr + 16);
}
// Zeroing variant of the unaligned masked load emulation; the pass-through
// operand is an all-zero vector. `k` must be of the form 2^n - 1.
VEC_INLINE static inline __m512i _mm512_maskz_loadu_epi32(
    __mmask16 k, const int *mem_addr)
{
  assert((k & (k + 1)) == 0);
  const __m512i zeros = _mm512_setzero_epi32();
  const __m512i low_part = _mm512_mask_loadunpacklo_epi32(zeros, k, mem_addr);
  return _mm512_mask_loadunpackhi_epi32(low_part, k, mem_addr + 16);
}
// Stand-in for _mm512_mask_storeu_epi32 (no __AVX512F__), built from the
// packstorelo / packstorehi pair. `mask` must be of the form 2^n - 1.
VEC_INLINE static inline void _mm512_mask_storeu_epi32(
    int *dest, __mmask16 mask, __m512i src)
{
  assert((mask & (mask + 1)) == 0);
  int *const upper = dest + 16;
  _mm512_mask_packstorelo_epi32(dest, mask, src);
  _mm512_mask_packstorehi_epi32(upper, mask, src);
}
#endif
// Floating-point expand emulation (no __AVX512F__); the pd/ps flavour is
// picked by FVEC_SUFFIX. `a` is spilled to an aligned scratch array and
// reloaded with the masked loadunpacklo intrinsic, merging into `src`.
VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_expand_)(
    FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a)
{
  FVEC_SCAL_T scratch[FVEC_LEN] __attribute__((aligned(64)));
  FVEC_SUFFIX(_mm512_store_)(scratch, a);
  const FVEC_VEC_T expanded =
      FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(src, k, scratch);
  return expanded;
}
// Zeroing floating-point expand emulation (no __AVX512F__): identical to the
// merge form except that the pass-through operand is an all-zero vector.
VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_expand_)(
    __mmask16 k, FVEC_VEC_T a)
{
  FVEC_SCAL_T scratch[FVEC_LEN] __attribute__((aligned(64)));
  FVEC_SUFFIX(_mm512_store_)(scratch, a);
  const FVEC_VEC_T zeros = FVEC_SUFFIX(_mm512_setzero_)();
  return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(zeros, k, scratch);
}
// Floating-point compress emulation (no __AVX512F__): fills the scratch
// array with `src`, pack-stores the lanes of `a` selected by `k` over it and
// reloads the array as a vector.
VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_compress_)(
    FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a)
{
  FVEC_SCAL_T scratch[FVEC_LEN] __attribute__((aligned(64)));
  FVEC_SUFFIX(_mm512_store_)(scratch, src);
  FVEC_SUFFIX(_mm512_mask_packstorelo_)(scratch, k, a);
  const FVEC_VEC_T packed = FVEC_SUFFIX(_mm512_load_)(scratch);
  return packed;
}
// Zeroing floating-point compress emulation (no __AVX512F__): the scratch
// array is zero-initialised before the selected lanes are pack-stored.
VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_compress_)(
    __mmask16 k, FVEC_VEC_T a)
{
  FVEC_SCAL_T scratch[FVEC_LEN] __attribute__((aligned(64))) = {0};
  FVEC_SUFFIX(_mm512_mask_packstorelo_)(scratch, k, a);
  const FVEC_VEC_T packed = FVEC_SUFFIX(_mm512_load_)(scratch);
  return packed;
}
// Floating-point unaligned masked store emulation (no __AVX512F__) using the
// packstorelo / packstorehi pair, the latter offset by FVEC_LEN elements.
// `mask` must be of the form 2^n - 1.
VEC_INLINE static inline void FVEC_SUFFIX(_mm512_mask_storeu_)(
    FVEC_SCAL_T *dest, FVEC_MASK_T mask, FVEC_VEC_T src)
{
  assert((mask & (mask + 1)) == 0);
  FVEC_SCAL_T *const upper = dest + FVEC_LEN;
  FVEC_SUFFIX(_mm512_mask_packstorelo_)(dest, mask, src);
  FVEC_SUFFIX(_mm512_mask_packstorehi_)(upper, mask, src);
}
#endif
class
FVEC_NAME
;
class
IVEC_NAME
;
class
AVEC_NAME
;
class
BVEC_NAME
{
friend
class
FVEC_NAME
;
friend
class
IVEC_NAME
;
friend
class
AVEC_NAME
;
# if FVEC_LEN==16
friend
class
avec16pd
;
# endif
FVEC_MASK_T
val_
;
VEC_INLINE
BVEC_NAME
(
const
FVEC_MASK_T
&
v
)
:
val_
(
v
)
{}
public:
VEC_INLINE
BVEC_NAME
()
{}
// Combines two masks with _mm512_kand.
VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b)
{
  return BVEC_NAME(_mm512_kand(a.val_, b.val_));
}
// Combines two masks with _mm512_kandn (operands passed in the order a, b).
VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b)
{
  return BVEC_NAME(_mm512_kandn(a.val_, b.val_));
}
// Inverts a mask with _mm512_knot.
VEC_INLINE static BVEC_NAME knot(const BVEC_NAME &a)
{
  return BVEC_NAME(_mm512_knot(a.val_));
}
// Forwards both masks to _mm512_kortestz and returns its integer result.
VEC_INLINE static int kortestz(const BVEC_NAME &a, const BVEC_NAME &b)
{
  const int or_is_zero = _mm512_kortestz(a.val_, b.val_);
  return or_is_zero;
}
// Compresses the bits of `a` selected by `mask`. The bits of `a` are turned
// into a 0/1 integer vector, compressed with _mm512_mask_compress_epi32
// (pass-through operand undefined), and converted back to a mask by comparing
// against 1.
VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask,
                                           const BVEC_NAME &a)
{
  const __m512i ones = _mm512_set1_epi32(1);
  const __m512i zeros = _mm512_setzero_epi32();
  const __m512i a_as_ints = _mm512_mask_blend_epi32(a.val_, zeros, ones);
  const __m512i packed = _mm512_mask_compress_epi32(
      _mm512_undefined_epi32(), mask.val_, a_as_ints);
  return BVEC_NAME(_mm512_cmpeq_epi32_mask(packed, ones));
}
// Expands the bits of `a` into the positions selected by `mask`, merging with
// `src`. Both `a` and `src` are turned into 0/1 integer vectors, run through
// _mm512_mask_expand_epi32, and the result is compared against 1 to get a
// mask back.
VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src,
                                        const BVEC_NAME &mask,
                                        const BVEC_NAME &a)
{
  const __m512i ones = _mm512_set1_epi32(1);
  const __m512i zeros = _mm512_setzero_epi32();
  const __m512i a_as_ints = _mm512_mask_blend_epi32(a.val_, zeros, ones);
  const __m512i src_as_ints = _mm512_mask_blend_epi32(src.val_, zeros, ones);
  const __m512i spread =
      _mm512_mask_expand_epi32(src_as_ints, mask.val_, a_as_ints);
  return BVEC_NAME(_mm512_cmpeq_epi32_mask(spread, ones));
}
// Mask made from 0xFFFF converted to FVEC_MASK_T.
VEC_INLINE static BVEC_NAME full()
{
  const FVEC_MASK_T all_bits = static_cast<FVEC_MASK_T>(0xFFFF);
  return BVEC_NAME(all_bits);
}
// Mask with no bits set.
VEC_INLINE static BVEC_NAME empty()
{
  return BVEC_NAME(0);
}
// Full mask shifted right by (FVEC_LEN - n).
VEC_INLINE static BVEC_NAME only(int n)
{
  const int dropped = FVEC_LEN - n;
  return BVEC_NAME(full().val_ >> dropped);
}
// Full mask shifted left by n (then narrowed back to FVEC_MASK_T).
VEC_INLINE static BVEC_NAME after(int n)
{
  return BVEC_NAME(full().val_ << n);
}
// Full mask shifted right by (FVEC_LEN - only) and then left by `after`.
VEC_INLINE static BVEC_NAME onlyafter(int only, int after)
{
  const int dropped = FVEC_LEN - only;
  return BVEC_NAME((full().val_ >> dropped) << after);
}
// Number of set bits in the mask, via _popcnt32.
VEC_INLINE static int popcnt(const BVEC_NAME &a)
{
  const int set_bits = _popcnt32(a.val_);
  return set_bits;
}
// True when _mm512_kortestz of the mask with itself reports non-zero.
VEC_INLINE static bool test_all_unset(const BVEC_NAME &a)
{
  return _mm512_kortestz(a.val_, a.val_) != 0;
}
// Logical negation of test_all_unset().
VEC_INLINE static bool test_any_set(const BVEC_NAME &a)
{
  if (test_all_unset(a)) return false;
  return true;
}
// Tests bit `i` of the mask; asserts that i < FVEC_LEN.
VEC_INLINE static bool test_at(const BVEC_NAME &a, int i)
{
  assert(i < FVEC_LEN);
  const int probe = 1 << i;
  return (a.val_ & probe) != 0;
}
// Operator form of the mask AND (_mm512_kand).
VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const
{
  return BVEC_NAME(_mm512_kand(this->val_, b.val_));
}
// Operator form of the mask OR (_mm512_kor).
VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const
{
  return BVEC_NAME(_mm512_kor(this->val_, b.val_));
}
// Operator form of the mask NOT (_mm512_knot).
VEC_INLINE BVEC_NAME operator ~() const
{
  return BVEC_NAME(_mm512_knot(this->val_));
}
};
class
IVEC_NAME
{
friend
class
FVEC_NAME
;
friend
class
AVEC_NAME
;
# if FVEC_LEN==16
friend
class
avec16pd
;
# endif
__m512i
val_
;
VEC_INLINE
IVEC_NAME
(
const
__m512i
&
v
)
:
val_
(
v
)
{}
public:
static
const
int
VL
=
16
;
VEC_INLINE
IVEC_NAME
()
{}
// Generates two static comparison methods for integer vectors:
//   the_name(a, b)            -> _mm512_<the_name>_epi32_mask(a, b)
//   mask_<the_name>(mask,a,b) -> _mm512_mask_<the_name>_epi32_mask(mask, a, b)
// Both return a BVEC_NAME. Instantiated below for cmpeq, cmplt, cmpneq, cmpgt.
#define IVEC_MASK_BINFN_B(the_name) \
VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a, \
const IVEC_NAME &b) { \
return _mm512_##the_name##_epi32_mask(a.val_, b.val_); \
} \
VEC_INLINE static BVEC_NAME mask_##the_name( \
const BVEC_NAME &mask, \
const IVEC_NAME &a, \
const IVEC_NAME &b \
) { \
return _mm512_mask_##the_name##_epi32_mask( \
mask.val_, a.val_, b.val_); \
}
IVEC_MASK_BINFN_B
(
cmpeq
)
IVEC_MASK_BINFN_B
(
cmplt
)
IVEC_MASK_BINFN_B
(
cmpneq
)
IVEC_MASK_BINFN_B
(
cmpgt
)
// Generates a static method mask_<the_name>(src, mask, a, b) that forwards to
// _mm512_mask_<the_name>_epi32(src, mask, a, b) and returns an IVEC_NAME.
// Instantiated below for add.
#define IVEC_MASK_BINFN_I(the_name) \
VEC_INLINE static IVEC_NAME mask_##the_name( \
const IVEC_NAME &src, const BVEC_NAME &mask, \
const IVEC_NAME &a, const IVEC_NAME &b \
) { \
return _mm512_mask_##the_name##_epi32( \
src.val_, mask.val_, a.val_, b.val_); \
}
IVEC_MASK_BINFN_I
(
add
)
// Per-lane selection between `a` and `b` under `mask`, via
// _mm512_mask_blend_epi32(mask, a, b).
VEC_INLINE static IVEC_NAME mask_blend(const BVEC_NAME &mask,
                                       const IVEC_NAME &a,
                                       const IVEC_NAME &b)
{
  const __m512i blended = _mm512_mask_blend_epi32(mask.val_, a.val_, b.val_);
  return IVEC_NAME(blended);
}
// Generates a static method the_name(a, b) that forwards to
// _mm512_<the_name>_epi32(a, b) and returns an IVEC_NAME.
// Instantiated below for mullo and srlv.
#define IVEC_BINFN_I(the_name) \
VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a, \
const IVEC_NAME &b) { \
return _mm512_##the_name##_epi32(a.val_, b.val_); \
}
IVEC_BINFN_I
(
mullo
)
IVEC_BINFN_I
(
srlv
)
// Bitwise AND of two integer vectors (_mm512_and_epi32).
VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b)
{
  return IVEC_NAME(_mm512_and_epi32(a.val_, b.val_));
}
// Forwards to _mm512_mask_expand_epi32(src, a, b); `a` is the mask and `b`
// the vector being expanded.
VEC_INLINE static IVEC_NAME mask_expand(const IVEC_NAME &src,
                                        const BVEC_NAME &a,
                                        const IVEC_NAME &b)
{
  const __m512i spread = _mm512_mask_expand_epi32(src.val_, a.val_, b.val_);
  return IVEC_NAME(spread);
}
// Compresses the lanes of `b` selected by mask `a`; the pass-through operand
// handed to _mm512_mask_compress_epi32 is _mm512_undefined_epi32().
VEC_INLINE static IVEC_NAME masku_compress(const BVEC_NAME &a,
                                           const IVEC_NAME &b)
{
  const __m512i packed =
      _mm512_mask_compress_epi32(_mm512_undefined_epi32(), a.val_, b.val_);
  return IVEC_NAME(packed);
}
// Extracts lane `b` of `a` by spilling the vector to an aligned 16-int array.
// No range check is performed on `b`.
VEC_INLINE static int at(const IVEC_NAME &a, int b)
{
  int lanes[16] __attribute__((aligned(64)));
  _mm512_store_epi32(lanes, a.val_);
  const int picked = lanes[b];
  return picked;
}
// Loads a vector from `src` with _mm512_load_epi32.
VEC_INLINE static IVEC_NAME load(const int *src)
{
  return IVEC_NAME(_mm512_load_epi32(src));
}
// Unaligned masked load with an undefined pass-through operand. Asserts that
// the mask is of the form 2^n - 1 and does not exceed BVEC_NAME::full().
VEC_INLINE static IVEC_NAME mask_loadu(const BVEC_NAME &mask, const int *src)
{
  assert((mask.val_ & (mask.val_ + 1)) == 0);
  assert(mask.val_ <= BVEC_NAME::full().val_);
  const __m512i loaded =
      _mm512_mask_loadu_epi32(_mm512_undefined_epi32(), mask.val_, src);
  return IVEC_NAME(loaded);
}
// Unaligned zeroing masked load (_mm512_maskz_loadu_epi32). Asserts that the
// mask is of the form 2^n - 1 and does not exceed BVEC_NAME::full().
VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, const int *src)
{
  assert((mask.val_ & (mask.val_ + 1)) == 0);
  assert(mask.val_ <= BVEC_NAME::full().val_);
  const __m512i loaded = _mm512_maskz_loadu_epi32(mask.val_, src);
  return IVEC_NAME(loaded);
}
// Unaligned masked store (_mm512_mask_storeu_epi32). Asserts that the mask is
// of the form 2^n - 1 and does not exceed BVEC_NAME::full().
VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, int *dest,
                                   const IVEC_NAME &src)
{
  const FVEC_MASK_T &m = mask.val_;
  assert((m & (m + 1)) == 0);
  assert(m <= BVEC_NAME::full().val_);
  _mm512_mask_storeu_epi32(dest, m, src.val_);
}
// Writes the whole vector to `dest` with _mm512_store_epi32.
VEC_INLINE static void store(int *dest, const IVEC_NAME &src)
{
  const __m512i &raw = src.val_;
  _mm512_store_epi32(dest, raw);
}
// Masked gather of ints from `mem` at the lane indices in `idx`, merging with
// `src`. `scale` must equal sizeof(int); the intrinsic itself is always
// called with sizeof(int) as its scale argument.
VEC_INLINE static IVEC_NAME mask_gather(const IVEC_NAME &src,
                                        const BVEC_NAME &mask,
                                        const IVEC_NAME &idx,
                                        const int *mem, const int scale)
{
  assert(mask.val_ <= BVEC_NAME::full().val_);
  assert(scale == sizeof(int));
  const __m512i gathered = _mm512_mask_i32gather_epi32(
      src.val_, mask.val_, idx.val_, mem, sizeof(int));
  return IVEC_NAME(gathered);
}
// Masked scatter of the lanes of `a` to `mem` at the indices in `idx`.
// `scale` must equal sizeof(int); the intrinsic is always called with
// sizeof(int) as its scale argument.
VEC_INLINE static void mask_i32scatter(int *mem, const BVEC_NAME &mask,
                                       const IVEC_NAME &idx,
                                       const IVEC_NAME &a, const int scale)
{
  assert(mask.val_ <= BVEC_NAME::full().val_);
  assert(scale == sizeof(int));
  const __m512i &indices = idx.val_;
  const __m512i &values = a.val_;
  _mm512_mask_i32scatter_epi32(mem, mask.val_, indices, values, sizeof(int));
}
// Compress-stores the lanes of `src` selected by `mask` to `dest`
// (_mm512_mask_compressstoreu_epi32).
VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int *dest,
                                          const IVEC_NAME &src)
{
  const __m512i &raw = src.val_;
  _mm512_mask_compressstoreu_epi32(dest, mask.val_, raw);
}
// Broadcasts `i` to every lane (_mm512_set1_epi32).
VEC_INLINE static IVEC_NAME set1(int i)
{
  return IVEC_NAME(_mm512_set1_epi32(i));
}
// All-zero vector (_mm512_setzero_epi32).
VEC_INLINE static IVEC_NAME setzero()
{
  return IVEC_NAME(_mm512_setzero_epi32());
}
VEC_INLINE
static
IVEC_NAME
undefined
()
{
return
_mm512_undefined_epi32
();
}
// Lane-wise 32-bit integer addition (_mm512_add_epi32).
VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const
{
  const __m512i sum = _mm512_add_epi32(val_, b.val_);
  return IVEC_NAME(sum);
}
// Debug helper: prints `str` followed by the first FVEC_LEN lanes of `a`.
//
// store() performs an aligned 512-bit store (_mm512_store_epi32), i.e. it
// always writes 16 ints and requires a 64-byte aligned destination. The
// buffer therefore has to be int[16] aligned to 64 bytes; the previous
// int[8] / 32-byte buffer was overrun by the store and could be misaligned.
VEC_INLINE static void print(const char *str, const IVEC_NAME &a)
{
  int data[16] __attribute__((aligned(64)));
  store(data, a);
  printf("%s:", str);
  for (int i = 0; i < FVEC_LEN; i++) {
    printf(" %d", data[i]);
  }
  printf("\n");
}
};
class
FVEC_NAME
{
friend
class
AVEC_NAME
;
#if FVEC_LEN==16
friend
class
avec16pd
;
#endif
FVEC_VEC_T
val_
;
VEC_INLINE
FVEC_NAME
(
const
FVEC_VEC_T
&
v
)
:
val_
(
v
)
{}
public:
static
const
int
VL
=
FVEC_LEN
;
VEC_INLINE
FVEC_NAME
()
{}
// Extracts lane `i` of `a` by spilling the vector to an aligned array.
// Asserts that i < FVEC_LEN.
VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i)
{
  assert(i < FVEC_LEN);
  FVEC_SCAL_T lanes[FVEC_LEN] __attribute__((aligned(64)));
  FVEC_SUFFIX(_mm512_store_)(lanes, a.val_);
  const FVEC_SCAL_T picked = lanes[i];
  return picked;
}
// Always reports true for this backend.
VEC_INLINE static bool fast_compress()
{
  const bool answer = true;
  return answer;
}
// Generates two static floating-point comparison methods returning BVEC_NAME:
//   the_name(a, b)              -> _mm512_<the_name>_{pd,ps}_mask(a, b)
//   mask_<the_name>(mask, a, b) -> _mm512_mask_<the_name>_{pd,ps}_mask(mask, a, b)
// The pd/ps flavour comes from FVEC_SUFFIX_MASK.
// Instantiated below for cmple, cmplt, cmpneq, cmpnle, cmpnlt.
#define FVEC_MASK_BINFN_B(the_name) \
VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a, \
const FVEC_NAME &b) { \
return FVEC_SUFFIX_MASK(_mm512_##the_name##_)(a.val_, b.val_); \
} \
VEC_INLINE static BVEC_NAME mask_##the_name( \
const BVEC_NAME &mask, \
const FVEC_NAME &a, const FVEC_NAME &b \
) { \
return FVEC_SUFFIX_MASK(_mm512_mask_##the_name##_)( \
mask.val_, a.val_, b.val_); \
}
FVEC_MASK_BINFN_B
(
cmple
)
FVEC_MASK_BINFN_B
(
cmplt
)
FVEC_MASK_BINFN_B
(
cmpneq
)
FVEC_MASK_BINFN_B
(
cmpnle
)
FVEC_MASK_BINFN_B
(
cmpnlt
)
// Generates a static unary method the_name(a) that forwards to
// _mm512_<the_name>_{pd,ps}(a) and returns an FVEC_NAME.
// Instantiated below for abs, exp, invsqrt, recip, sqrt.
#define FVEC_UNFN_F(the_name) \
VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) { \
return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_); \
}
FVEC_UNFN_F
(
abs
)
FVEC_UNFN_F
(
exp
)
FVEC_UNFN_F
(
invsqrt
)
FVEC_UNFN_F
(
recip
)
FVEC_UNFN_F
(
sqrt
)
// Generates a static masked unary method mask_<the_name>(src, mask, a) that
// forwards to _mm512_mask_<the_name>_{pd,ps}(src, mask, a).
// Instantiated below for cos, recip, sqrt.
#define FVEC_MASK_UNFN_F(the_name) \
VEC_INLINE static FVEC_NAME mask_##the_name( \
const FVEC_NAME &src, const BVEC_NAME &mask, \
const FVEC_NAME &a \
) { \
return FVEC_SUFFIX(_mm512_mask_##the_name##_)( \
src.val_, mask.val_, a.val_); \
}
FVEC_MASK_UNFN_F
(
cos
)
FVEC_MASK_UNFN_F
(
recip
)
FVEC_MASK_UNFN_F
(
sqrt
)
// Generates a static binary method the_name(a, b) that forwards to
// _mm512_<the_name>_{pd,ps}(a, b). Instantiated below for max and min.
#define FVEC_BINFN_F(the_name) \
VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a, \
const FVEC_NAME &b) { \
return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_, b.val_); \
}
FVEC_BINFN_F
(
max
)
FVEC_BINFN_F
(
min
)
// Generates a static masked binary method mask_<the_name>(src, mask, a, b)
// that forwards to _mm512_mask_<the_name>_{pd,ps}(src, mask, a, b).
// Instantiated below for add, div, mul, sub.
#define FVEC_MASK_BINFN_F(the_name) \
VEC_INLINE static FVEC_NAME mask_##the_name( \
const FVEC_NAME &src, const BVEC_NAME &mask, \
const FVEC_NAME &a, const FVEC_NAME &b \
) { \
return FVEC_SUFFIX(_mm512_mask_##the_name##_)( \
src.val_, mask.val_, a.val_, b.val_); \
}
FVEC_MASK_BINFN_F
(
add
)
FVEC_MASK_BINFN_F
(
div
)
FVEC_MASK_BINFN_F
(
mul
)
FVEC_MASK_BINFN_F
(
sub
)
// Per-lane selection between `a` and `b` under `mask`, via the
// _mm512_mask_blend_{pd,ps} intrinsic.
VEC_INLINE static FVEC_NAME mask_blend(const BVEC_NAME &mask,
                                       const FVEC_NAME &a,
                                       const FVEC_NAME &b)
{
  const FVEC_VEC_T blended =
      FVEC_SUFFIX(_mm512_mask_blend_)(mask.val_, a.val_, b.val_);
  return FVEC_NAME(blended);
}
// Forwards to _mm512_mask_expand_{pd,ps}(src, a, b); `a` is the mask and `b`
// the vector being expanded.
VEC_INLINE static FVEC_NAME mask_expand(const FVEC_NAME &src,
                                        const BVEC_NAME &a,
                                        const FVEC_NAME &b)
{
  const FVEC_VEC_T spread =
      FVEC_SUFFIX(_mm512_mask_expand_)(src.val_, a.val_, b.val_);
  return FVEC_NAME(spread);
}
// Compresses the lanes of `b` selected by mask `a`; the pass-through operand
// is an undefined vector.
VEC_INLINE static FVEC_NAME masku_compress(const BVEC_NAME &a,
                                           const FVEC_NAME &b)
{
  const FVEC_VEC_T packed = FVEC_SUFFIX(_mm512_mask_compress_)(
      FVEC_SUFFIX(_mm512_undefined_)(), a.val_, b.val_);
  return FVEC_NAME(packed);
}
// Broadcasts the scalar `a` to every lane.
VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a)
{
  return FVEC_NAME(FVEC_SUFFIX(_mm512_set1_)(a));
}
// All-zero vector.
VEC_INLINE static FVEC_NAME setzero()
{
  return FVEC_NAME(FVEC_SUFFIX(_mm512_setzero_)());
}
VEC_INLINE
static
FVEC_NAME
undefined
()
{
return
FVEC_SUFFIX
(
_mm512_undefined_
)();
}
// Loads a vector from `mem` with the _mm512_load_{pd,ps} intrinsic.
VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem)
{
  return FVEC_NAME(FVEC_SUFFIX(_mm512_load_)(mem));
}
// Unaligned masked store of `a` to `dest` (_mm512_mask_storeu_{pd,ps}).
VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, FVEC_SCAL_T *dest,
                                   const FVEC_NAME &a)
{
  const FVEC_VEC_T &raw = a.val_;
  FVEC_SUFFIX(_mm512_mask_storeu_)(dest, mask.val_, raw);
}
// Writes the whole vector to `dest` with the _mm512_store_{pd,ps} intrinsic.
VEC_INLINE static void store(FVEC_SCAL_T *dest, const FVEC_NAME &a)
{
  const FVEC_VEC_T &raw = a.val_;
  FVEC_SUFFIX(_mm512_store_)(dest, raw);
}
// Gathers scalars from `mem` at the lane indices in `idx`. `scale` must equal
// sizeof(FVEC_SCAL_T); the intrinsic is always called with that constant.
// With FVEC_LEN == 8 the i32logather form is used, otherwise i32gather.
VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx,
                                   const FVEC_SCAL_T *mem, const int scale)
{
  assert(scale == sizeof(FVEC_SCAL_T));
  FVEC_VEC_T gathered;
# if FVEC_LEN==8
  gathered = FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem,
                                              sizeof(FVEC_SCAL_T));
# else
  gathered = FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem,
                                            sizeof(FVEC_SCAL_T));
# endif
  return FVEC_NAME(gathered);
}
// Masked gather from `mem` at the indices in `idx`, merging with `src`.
// `scale` must equal sizeof(FVEC_SCAL_T). With FVEC_LEN == 8 the
// mask_i32logather form is used, otherwise mask_i32gather.
VEC_INLINE static FVEC_NAME mask_gather(const FVEC_NAME &src,
                                        const BVEC_NAME &mask,
                                        const IVEC_NAME &idx,
                                        const FVEC_SCAL_T *mem,
                                        const int scale)
{
  assert(scale == sizeof(FVEC_SCAL_T));
  FVEC_VEC_T gathered;
# if FVEC_LEN==8
  gathered = FVEC_SUFFIX(_mm512_mask_i32logather_)(
      src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T));
# else
  gathered = FVEC_SUFFIX(_mm512_mask_i32gather_)(
      src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T));
# endif
  return FVEC_NAME(gathered);
}
// Performs three gathers with the same indices from mem, mem + 1 and mem + 2
// and writes the results to out_0, out_1 and out_2 (in that order).
// `scale` must equal sizeof(FVEC_SCAL_T).
VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx,
                                         const FVEC_SCAL_T *mem,
                                         const int scale,
                                         FVEC_NAME *out_0, FVEC_NAME *out_1,
                                         FVEC_NAME *out_2)
{
  assert(scale == sizeof(FVEC_SCAL_T));
  *out_0 = FVEC_NAME::gather(idx, &mem[0], scale);
  *out_1 = FVEC_NAME::gather(idx, &mem[1], scale);
  *out_2 = FVEC_NAME::gather(idx, &mem[2], scale);
}
// Performs four gathers with the same indices from mem .. mem + 3 and writes
// the results to out_0 .. out_3 (in that order).
// `scale` must equal sizeof(FVEC_SCAL_T).
VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx,
                                         const FVEC_SCAL_T *mem,
                                         const int scale,
                                         FVEC_NAME *out_0, FVEC_NAME *out_1,
                                         FVEC_NAME *out_2, FVEC_NAME *out_3)
{
  assert(scale == sizeof(FVEC_SCAL_T));
  *out_0 = FVEC_NAME::gather(idx, &mem[0], scale);
  *out_1 = FVEC_NAME::gather(idx, &mem[1], scale);
  *out_2 = FVEC_NAME::gather(idx, &mem[2], scale);
  *out_3 = FVEC_NAME::gather(idx, &mem[3], scale);
}
// Sum of the lanes of `a` selected by `mask`
// (_mm512_mask_reduce_add_{pd,ps}).
VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask,
                                              const FVEC_NAME &a)
{
  const FVEC_SCAL_T total =
      FVEC_SUFFIX(_mm512_mask_reduce_add_)(mask.val_, a.val_);
  return total;
}
// Sum of all lanes of `a` (_mm512_reduce_add_{pd,ps}).
VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a)
{
  const FVEC_SCAL_T total = FVEC_SUFFIX(_mm512_reduce_add_)(a.val_);
  return total;
}
// Reinterprets the floating-point vector as 32-bit integers. With
// FVEC_LEN == 8 (double) the even-numbered 32-bit lanes (mask 0x5555) are
// additionally compressed to the front with zeroing; otherwise the bits are
// returned unchanged.
VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a)
{
# if FVEC_LEN==8
  const __m512i as_ints = _mm512_castpd_si512(a.val_);
  return IVEC_NAME(_mm512_maskz_compress_epi32(0x5555, as_ints));
# else
  const __m512i as_ints = _mm512_castps_si512(a.val_);
  return IVEC_NAME(as_ints);
# endif
}
// Forwards to _mm512_mask_sincos_{pd,ps}: the intrinsic's return value is
// returned and its pointer output is written into *cos. Arguments are passed
// through in the order (&cos, src_a, src_b, mask, arg).
VEC_INLINE static FVEC_NAME mask_sincos(FVEC_NAME *cos,
                                        const FVEC_NAME &src_a,
                                        const FVEC_NAME &src_b,
                                        const BVEC_NAME &mask,
                                        const FVEC_NAME &arg)
{
  FVEC_VEC_T *const cos_out = &cos->val_;
  const FVEC_VEC_T result = FVEC_SUFFIX(_mm512_mask_sincos_)(
      cos_out, src_a.val_, src_b.val_, mask.val_, arg.val_);
  return FVEC_NAME(result);
}
// Generates a member operator `the_sym` that forwards to
// _mm512_<the_name>_{pd,ps}(this->val_, b.val_).
// Instantiated below for +, -, * and /.
#define FVEC_BINOP(the_sym, the_name) \
VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \
return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_); \
}
FVEC_BINOP
(
+
,
add
)
FVEC_BINOP
(
-
,
sub
)
FVEC_BINOP
(
*
,
mul
)
FVEC_BINOP
(
/
,
div
)
// Issues a T0-hint gather prefetch for the addresses mem + idx * sizeof(FVEC_SCAL_T)
// using the full mask. Compiles to nothing unless __AVX512PF__ is defined.
VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void *mem)
{
#ifdef __AVX512PF__
  const FVEC_MASK_T every_lane = BVEC_NAME::full().val_;
  _mm512_mask_prefetch_i32gather_ps(a.val_, every_lane, mem,
                                    sizeof(FVEC_SCAL_T), _MM_HINT_T0);
#endif
}
};
class
AVEC_NAME
{
FVEC_VEC_T
val_
;
VEC_INLINE
AVEC_NAME
(
const
FVEC_VEC_T
&
a
)
:
val_
(
a
)
{}
public:
VEC_INLINE
AVEC_NAME
(
const
FVEC_NAME
&
a
)
:
val_
(
a
.
val_
)
{}
VEC_INLINE
static
AVEC_NAME
undefined
()
{
return
FVEC_SUFFIX
(
_mm512_undefined_
)();
}
// Masked gather into an accumulator vector, merging with `src`. `scale` must
// equal sizeof(FVEC_SCAL_T). With FVEC_LEN == 8 the mask_i32logather form is
// used, otherwise mask_i32gather.
VEC_INLINE static AVEC_NAME mask_gather(const AVEC_NAME &src,
                                        const BVEC_NAME &mask,
                                        const IVEC_NAME &idx,
                                        const FVEC_SCAL_T *mem,
                                        const int scale)
{
  assert(scale == sizeof(FVEC_SCAL_T));
  FVEC_VEC_T gathered;
# if FVEC_LEN==8
  gathered = FVEC_SUFFIX(_mm512_mask_i32logather_)(
      src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T));
# else
  gathered = FVEC_SUFFIX(_mm512_mask_i32gather_)(
      src.val_, mask.val_, idx.val_, mem, sizeof(FVEC_SCAL_T));
# endif
  return AVEC_NAME(gathered);
}
// Masked scatter of the accumulator `a` to `mem` at the indices in `idx`.
// `scale` must equal sizeof(FVEC_SCAL_T). With FVEC_LEN == 8 the
// mask_i32loscatter form is used, otherwise mask_i32scatter.
VEC_INLINE static void mask_i32loscatter(FVEC_SCAL_T *mem,
                                         const BVEC_NAME &mask,
                                         const IVEC_NAME &idx,
                                         const AVEC_NAME &a,
                                         const int scale)
{
  assert(scale == sizeof(FVEC_SCAL_T));
  const FVEC_VEC_T &values = a.val_;
# if FVEC_LEN==8
  FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, values,
                                         sizeof(FVEC_SCAL_T));
# else
  FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, values,
                                       sizeof(FVEC_SCAL_T));
# endif
}
// Generates a member operator `the_sym` for the accumulator type that
// forwards to _mm512_<the_name>_{pd,ps}(this->val_, b.val_).
// Instantiated below for subtraction only.
#define AVEC_BINOP(the_sym, the_name) \
VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \
return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_); \
}
AVEC_BINOP
(
-
,
sub
)
// Issues a T0-hint gather prefetch for the addresses selected by `a`, using
// the full mask and sizeof(FVEC_SCAL_T) as scale.
//
// The prefetch intrinsic is only emitted when __AVX512PF__ is defined, the
// same guard FVEC_NAME::gather_prefetch0 uses; without it the function is a
// no-op. Previously this variant called the intrinsic unconditionally, which
// is a problem on AVX-512 targets that lack the prefetch extension.
VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void *mem)
{
#ifdef __AVX512PF__
  _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem,
                                    sizeof(FVEC_SCAL_T), _MM_HINT_T0);
#endif
}
};
#if FVEC_LEN==16
class
avec16pd
{
__m512d
lo_
,
hi_
;
VEC_INLINE
avec16pd
(
const
__m512d
&
lo
,
const
__m512d
&
hi
)
:
lo_
(
lo
),
hi_
(
hi
)
{}
// Upper 8 bits of a 16-bit mask, narrowed to __mmask8.
VEC_INLINE static __mmask8 get_bvec_hi(__mmask16 a)
{
  const __mmask8 upper = a >> 8;
  return upper;
}
// Applies the 128-bit block permutation _MM_PERM_BADC to `a`
// (_mm512_permute4f128_epi32).
VEC_INLINE static __m512i get_ivec_hi(__m512i a)
{
  const __m512i permuted = _mm512_permute4f128_epi32(a, _MM_PERM_BADC);
  return permuted;
}
public:
// Widens a 16-lane float vector into two double vectors: lo_ is converted
// from `a` directly with _mm512_cvtpslo_pd, hi_ from `a` after the
// _MM_PERM_BADC 128-bit block permutation.
VEC_INLINE avec16pd(const FVEC_NAME &a)
{
  const __m512 permuted = _mm512_permute4f128_ps(a.val_, _MM_PERM_BADC);
  lo_ = _mm512_cvtpslo_pd(a.val_);
  hi_ = _mm512_cvtpslo_pd(permuted);
}
VEC_INLINE
static
avec16pd
undefined
()
{
return
avec16pd
(
_mm512_undefined_pd
(),
_mm512_undefined_pd
());
}
// Masked gather of 16 doubles as two 8-lane gathers. The low half uses the
// mask and indices as given; the high half uses get_bvec_hi(mask) and
// get_ivec_hi(idx). `scale` must equal sizeof(double).
VEC_INLINE static avec16pd mask_gather(const avec16pd &src,
                                       const BVEC_NAME &mask,
                                       const IVEC_NAME &idx,
                                       const double *mem, const int scale)
{
  assert(scale == sizeof(double));
  const __m512d low_half = _mm512_mask_i32logather_pd(
      src.lo_, mask.val_, idx.val_, mem, sizeof(double));
  const __mmask8 upper_mask = get_bvec_hi(mask.val_);
  const __m512i upper_idx = get_ivec_hi(idx.val_);
  const __m512d high_half = _mm512_mask_i32logather_pd(
      src.hi_, upper_mask, upper_idx, mem, sizeof(double));
  return avec16pd(low_half, high_half);
}
// Masked scatter of 16 doubles as two 8-lane scatters: first a.lo_ with the
// mask and indices as given, then a.hi_ with get_bvec_hi(mask) and
// get_ivec_hi(idx). `scale` must equal sizeof(double).
VEC_INLINE static void mask_i32loscatter(double *mem, const BVEC_NAME &mask,
                                         const IVEC_NAME &idx,
                                         const avec16pd &a, const int scale)
{
  assert(scale == sizeof(double));
  _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
                              sizeof(double));
  const __mmask8 upper_mask = get_bvec_hi(mask.val_);
  const __m512i upper_idx = get_ivec_hi(idx.val_);
  _mm512_mask_i32loscatter_pd(mem, upper_mask, upper_idx, a.hi_,
                              sizeof(double));
}
// Generates a member operator `the_sym` for avec16pd that applies
// _mm512_<the_name>_pd separately to the lo_ and hi_ halves.
// Instantiated below for subtraction only.
#define AVEC2_BINOP(the_sym, the_name) \
VEC_INLINE inline avec16pd operator the_sym(const avec16pd &b) const { \
__m512d lo = _mm512_##the_name##_pd(this->lo_, b.lo_); \
__m512d hi = _mm512_##the_name##_pd(this->hi_, b.hi_); \
return avec16pd(lo, hi); \
}
AVEC2_BINOP
(
-
,
sub
)
// Issues a T0-hint gather prefetch for the addresses selected by `a`, using
// the full mask and sizeof(double) as scale.
//
// The prefetch intrinsic is only emitted when __AVX512PF__ is defined, the
// same guard FVEC_NAME::gather_prefetch0 uses; without it the function is a
// no-op. Previously this variant called the intrinsic unconditionally, which
// is a problem on AVX-512 targets that lack the prefetch extension.
VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void *mem)
{
#ifdef __AVX512PF__
  _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem,
                                    sizeof(double), _MM_HINT_T0);
#endif
}
};
#endif
}
#ifdef FVEC_FIRST_PASS
// Maps a (compute type, accumulator type) pair to the matching mm512 vector
// wrapper classes. Only the specialisations below are defined.
template <class flt_t, class acc_t>
struct intr_types;
// double compute / double accumulate: 8-lane wrappers.
template <>
struct intr_types<double, double> {
  typedef mm512::bvec8 bvec;
  typedef mm512::ivec8 ivec;
  typedef mm512::fvec8pd fvec;
  typedef mm512::avec8pd avec;
};
// float compute / float accumulate: 16-lane wrappers.
template <>
struct intr_types<float, float> {
  typedef mm512::bvec16 bvec;
  typedef mm512::ivec16 ivec;
  typedef mm512::fvec16ps fvec;
  typedef mm512::avec16ps avec;
};
// float compute / double accumulate: 16-lane wrappers with the split
// double-precision accumulator avec16pd.
template <>
struct intr_types<float, double> {
  typedef mm512::bvec16 bvec;
  typedef mm512::ivec16 ivec;
  typedef mm512::fvec16ps fvec;
  typedef mm512::avec16pd avec;
};
#endif
#ifndef FVEC_FIRST_PASS
# define FVEC_FIRST_PASS
# include "intel_intrinsics_airebo.h"
#endif
#endif
#ifdef LMP_INTEL_AIREBO_256
#include <cassert>
#include <immintrin.h>
#include <stdint.h>
#define VEC_INLINE __attribute__((always_inline))
#ifndef FVEC_FIRST_PASS
# define FVEC_LEN 4
# define FVEC_SUFFIX(a) a##pd
# define FVEC_MASK_T __m256d
# define FVEC_VEC_T __m256d
# define FVEC_SCAL_T double
# define IVEC_NAME ivec4
# define FVEC_NAME fvec4pd
# define BVEC_NAME bvec4
# define AVEC_NAME avec4pd
#else
# undef FVEC_LEN
# undef FVEC_SUFFIX
# undef FVEC_SUFFIX_MASK
# undef FVEC_MASK_T
# undef FVEC_VEC_T
# undef FVEC_SCAL_T
# undef IVEC_NAME
# undef FVEC_NAME
# undef BVEC_NAME
# undef AVEC_NAME
# define FVEC_LEN 8
# define FVEC_SUFFIX(a) a##ps
# define FVEC_MASK_T __m256
# define FVEC_VEC_T __m256
# define FVEC_SCAL_T float
# define IVEC_NAME ivec8
# define FVEC_NAME fvec8ps
# define BVEC_NAME bvec8
# define AVEC_NAME avec8ps
#endif
namespace
mm256
{
//#define __AVX2__ __AVX2__
#if !defined(__AVX2__) && !defined(FVEC_FIRST_PASS)
// Function-body template for emulating a 256-bit integer binary operation
// with a 128-bit one: splits the enclosing function's `a` and `b` into low
// and high 128-bit halves, applies `op` to each pair and returns the
// recombined 256-bit vector. Expects parameters named `a` and `b` in scope.
#define IVEC_EM_BIN(op) \
__m128i a_lo = _mm256_castsi256_si128(a); \
__m128i b_lo = _mm256_castsi256_si128(b); \
__m128i a_hi = _mm256_extractf128_si256(a, 1); \
__m128i b_hi = _mm256_extractf128_si256(b, 1); \
__m128i c_lo = op(a_lo, b_lo); \
__m128i c_hi = op(a_hi, b_hi); \
__m256i ret = _mm256_setr_m128i(c_lo, c_hi); \
return ret;
// 256-bit packed 32-bit add emulated with two 128-bit _mm_add_epi32 calls on
// the low and high halves.
VEC_INLINE inline __m256i _cm256_add_epi32(const __m256i &a, const __m256i &b)
{
  const __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(a),
                                   _mm256_castsi256_si128(b));
  const __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1),
                                   _mm256_extractf128_si256(b, 1));
  return _mm256_setr_m128i(lo, hi);
}
// 256-bit bitwise AND emulated with two 128-bit _mm_and_si128 calls on the
// low and high halves.
VEC_INLINE inline __m256i _cm256_and_si256(const __m256i &a, const __m256i &b)
{
  const __m128i lo = _mm_and_si128(_mm256_castsi256_si128(a),
                                   _mm256_castsi256_si128(b));
  const __m128i hi = _mm_and_si128(_mm256_extractf128_si256(a, 1),
                                   _mm256_extractf128_si256(b, 1));
  return _mm256_setr_m128i(lo, hi);
}
// 256-bit AND-NOT emulated with two 128-bit _mm_andnot_si128 calls on the low
// and high halves (operand order a, b is preserved).
VEC_INLINE inline __m256i _cm256_andnot_si256(const __m256i &a,
                                              const __m256i &b)
{
  const __m128i lo = _mm_andnot_si128(_mm256_castsi256_si128(a),
                                      _mm256_castsi256_si128(b));
  const __m128i hi = _mm_andnot_si128(_mm256_extractf128_si256(a, 1),
                                      _mm256_extractf128_si256(b, 1));
  return _mm256_setr_m128i(lo, hi);
}
// 256-bit packed 32-bit equality compare emulated with two 128-bit
// _mm_cmpeq_epi32 calls on the low and high halves.
VEC_INLINE inline __m256i _cm256_cmpeq_epi32(const __m256i &a,
                                             const __m256i &b)
{
  const __m128i lo = _mm_cmpeq_epi32(_mm256_castsi256_si128(a),
                                     _mm256_castsi256_si128(b));
  const __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1),
                                     _mm256_extractf128_si256(b, 1));
  return _mm256_setr_m128i(lo, hi);
}
// 256-bit packed 32-bit greater-than compare emulated with two 128-bit
// _mm_cmpgt_epi32 calls on the low and high halves.
VEC_INLINE inline __m256i _cm256_cmpgt_epi32(const __m256i &a,
                                             const __m256i &b)
{
  const __m128i lo = _mm_cmpgt_epi32(_mm256_castsi256_si128(a),
                                     _mm256_castsi256_si128(b));
  const __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1),
                                     _mm256_extractf128_si256(b, 1));
  return _mm256_setr_m128i(lo, hi);
}
// Emulates the 8 x u8 -> 8 x i32 zero-extension with two 128-bit conversions.
// _mm_permute_ps with control 1 moves the second 32-bit element of `a` into
// element 0, so the second _mm_cvtepu8_epi32 converts bytes 4..7.
VEC_INLINE inline __m256i _cm256_cvtepu8_epi32(const __m128i &a)
{
  const __m128 shuffled = _mm_permute_ps(_mm_castsi128_ps(a), 1);
  const __m128i second_quad = _mm_castps_si128(shuffled);
  const __m128i widened_lo = _mm_cvtepu8_epi32(a);
  const __m128i widened_hi = _mm_cvtepu8_epi32(second_quad);
  return _mm256_setr_m128i(widened_lo, widened_hi);
}
// Function-body template for emulating a 256-bit integer operation lane by
// lane: spills the enclosing function's `a` and `b` to the aligned arrays
// buf_a / buf_b, evaluates the expression `op` for each i in 0..7 into
// dest[i], and returns dest reloaded as a vector. `op` may refer to buf_a,
// buf_b and i.
#define IVEC_EM_SCAL(op) \
int buf_a[8] __attribute__((aligned(32))); \
int buf_b[8] __attribute__((aligned(32))); \
int dest[8] __attribute__((aligned(32))); \
_mm256_store_si256((__m256i*)buf_a, a); \
_mm256_store_si256((__m256i*)buf_b, b); \
for (int i = 0; i < 8; i++) { \
dest[i] = op; \
} \
return _mm256_load_si256((__m256i*) dest);
// Scalar emulation of _mm256_permutevar8x32_epi32: result lane i is the lane
// of `a` selected by the index in lane i of `b`.
//
// Only the low three bits of each index are used, matching the hardware
// instruction. The previous version indexed the 8-element buffer with the
// raw index and so read out of bounds for any index outside 0..7.
VEC_INLINE inline __m256i _cm256_permutevar8x32_epi32(const __m256i &a,
                                                      const __m256i &b)
{
  int buf_a[8] __attribute__((aligned(32)));
  int buf_b[8] __attribute__((aligned(32)));
  int dest[8] __attribute__((aligned(32)));
  _mm256_store_si256((__m256i*)buf_a, a);
  _mm256_store_si256((__m256i*)buf_b, b);
  for (int i = 0; i < 8; i++) {
    dest[i] = buf_a[buf_b[i] & 7];
  }
  return _mm256_load_si256((__m256i*) dest);
}
// 256-bit packed 32-bit low multiply emulated with two 128-bit
// _mm_mullo_epi32 calls on the low and high halves.
VEC_INLINE inline __m256i _cm256_mullo_epi32(__m256i a, __m256i b)
{
  const __m128i lo = _mm_mullo_epi32(_mm256_castsi256_si128(a),
                                     _mm256_castsi256_si128(b));
  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1),
                                     _mm256_extractf128_si256(b, 1));
  return _mm256_setr_m128i(lo, hi);
}
// Scalar emulation of _mm256_srlv_epi32: each 32-bit lane of `a` is shifted
// right logically by the count in the matching lane of `b`.
//
// The shift is done on unsigned values so zeros are shifted in, and a count
// of 32 or more yields 0, as the hardware instruction does. The previous
// version shifted a signed int (sign-extending for negative inputs) and had
// undefined behaviour for counts outside 0..31.
VEC_INLINE inline __m256i _cm256_srlv_epi32(__m256i a, __m256i b)
{
  int buf_a[8] __attribute__((aligned(32)));
  int buf_b[8] __attribute__((aligned(32)));
  int dest[8] __attribute__((aligned(32)));
  _mm256_store_si256((__m256i*)buf_a, a);
  _mm256_store_si256((__m256i*)buf_b, b);
  for (int i = 0; i < 8; i++) {
    const unsigned int value = (unsigned int) buf_a[i];
    const unsigned int count = (unsigned int) buf_b[i];
    dest[i] = (count < 32u) ? (int) (value >> count) : 0;
  }
  return _mm256_load_si256((__m256i*) dest);
}
// Float flavour of the lane permutation: reinterprets `a` as integers, runs
// _cm256_permutevar8x32_epi32 and reinterprets the result back.
VEC_INLINE inline __m256 _cm256_permutevar8x32_ps(const __m256 &a,
                                                  const __m256i &b)
{
  const __m256i a_bits = _mm256_castps_si256(a);
  const __m256i permuted = _cm256_permutevar8x32_epi32(a_bits, b);
  return _mm256_castsi256_ps(permuted);
}
// 128-bit integer masked load implemented through _mm_maskload_ps by
// reinterpreting the pointer and the result.
VEC_INLINE inline __m128i _cm_maskload_epi32(int const *mem, __m128i mask)
{
  float const *as_float = (float const *) mem;
  const __m128 loaded = _mm_maskload_ps(as_float, mask);
  return _mm_castps_si128(loaded);
}
// 256-bit integer masked load built from two 128-bit masked loads: the low
// half of the mask guards mem[0..3], the high half guards mem[4..7].
VEC_INLINE inline __m256i _cm256_maskload_epi32(int const *mem, __m256i mask)
{
  const __m128i mask_lo = _mm256_castsi256_si128(mask);
  const __m128i mask_hi = _mm256_extractf128_si256(mask, 1);
  const __m128i data_lo = _cm_maskload_epi32(mem, mask_lo);
  const __m128i data_hi = _cm_maskload_epi32(mem + 4, mask_hi);
  return _mm256_setr_m128i(data_lo, data_hi);
}
// Scalar emulation of the masked 32-bit integer gather. The output starts as
// a copy of `src`; every lane whose mask element is non-zero is replaced by
// base_addr[index lane]. `scale` must equal sizeof(int).
VEC_INLINE inline __m256i _cm256_mask_i32gather_epi32(
    __m256i src, int const *base_addr, __m256i index, __m256i mask,
    const int scale)
{
  assert(scale == sizeof(int));
  int lane_index[8] __attribute__((aligned(32)));
  int lane_mask[8] __attribute__((aligned(32)));
  int out[8] __attribute__((aligned(32)));
  _mm256_store_si256((__m256i*) out, src);
  _mm256_store_si256((__m256i*) lane_index, index);
  _mm256_store_si256((__m256i*) lane_mask, mask);
  for (int lane = 0; lane < 8; ++lane) {
    if (!lane_mask[lane]) continue;
    out[lane] = base_addr[lane_index[lane]];
  }
  return _mm256_load_si256((__m256i*) out);
}
// Float flavour of the masked gather: reinterprets src, base pointer and mask
// as integers, delegates to _cm256_mask_i32gather_epi32 and reinterprets the
// result back to floats.
VEC_INLINE inline __m256 _cm256_mask_i32gather_ps(
    __m256 src, float const *base_addr, __m256i index, __m256 mask,
    const int scale)
{
  const __m256i src_bits = _mm256_castps_si256(src);
  const __m256i mask_bits = _mm256_castps_si256(mask);
  const __m256i gathered = _cm256_mask_i32gather_epi32(
      src_bits, (const int *) base_addr, index, mask_bits, scale);
  return _mm256_castsi256_ps(gathered);
}
// Scalar emulation of the masked double gather with 32-bit indices. The
// output starts as a copy of `src`; lane i is replaced by
// base_addr[index lane i] when the low 32 bits of mask lane i (element 2*i of
// the mask viewed as ints) are non-zero. `scale` must equal sizeof(double).
VEC_INLINE inline __m256d _cm256_mask_i32gather_pd(
    __m256d src, double const *base_addr, __m128i index, __m256d mask,
    const int scale)
{
  assert(scale == sizeof(double));
  int lane_index[4] __attribute__((aligned(32)));
  int mask_words[8] __attribute__((aligned(32)));
  double out[4] __attribute__((aligned(32)));
  _mm256_store_pd(out, src);
  _mm_store_si128((__m128i*) lane_index, index);
  _mm256_store_si256((__m256i*) mask_words, _mm256_castpd_si256(mask));
  for (int lane = 0; lane < 4; ++lane) {
    if (!mask_words[2 * lane]) continue;
    out[lane] = base_addr[lane_index[lane]];
  }
  return _mm256_load_pd(out);
}
// Scalar emulation of the unmasked 32-bit integer gather: lane i of the
// result is base_addr[index lane i]. `scale` must equal sizeof(int).
VEC_INLINE inline __m256i _cm256_i32gather_epi32(int const *base_addr,
                                                 __m256i index,
                                                 const int scale)
{
  assert(scale == sizeof(int));
  int lane_index[8] __attribute__((aligned(32)));
  int out[8] __attribute__((aligned(32)));
  _mm256_store_si256((__m256i*) lane_index, index);
  for (int lane = 0; lane < 8; ++lane) {
    out[lane] = base_addr[lane_index[lane]];
  }
  return _mm256_load_si256((__m256i*) out);
}
// Float flavour of the unmasked gather: delegates to _cm256_i32gather_epi32
// on the reinterpreted pointer and casts the result back to floats.
VEC_INLINE inline __m256 _cm256_i32gather_ps(float const *base_addr,
                                             __m256i index, const int scale)
{
  const __m256i gathered =
      _cm256_i32gather_epi32((const int *) base_addr, index, scale);
  return _mm256_castsi256_ps(gathered);
}
// Scalar emulation of the unmasked double gather with 32-bit indices: lane i
// of the result is base_addr[index lane i]. `scale` must equal
// sizeof(double).
VEC_INLINE inline __m256d _cm256_i32gather_pd(double const *base_addr,
                                              __m128i index, const int scale)
{
  assert(scale == sizeof(double));
  int lane_index[4] __attribute__((aligned(32)));
  double out[4] __attribute__((aligned(32)));
  _mm_store_si128((__m128i*) lane_index, index);
  for (int lane = 0; lane < 4; ++lane) {
    out[lane] = base_addr[lane_index[lane]];
  }
  return _mm256_load_pd(out);
}
// Software bit deposit: walks the 64 bit positions of `mask` from low to
// high; for every set position the next unread low-order bit of `tmp` is
// copied there. Positions where `mask` is clear stay 0.
VEC_INLINE inline uint64_t _cdep_u64(uint64_t tmp, uint64_t mask)
{
  uint64_t result = 0;
  uint64_t next_src = 0;
  for (uint64_t pos = 0; pos < 64; ++pos) {
    const uint64_t pos_bit = ((uint64_t) 1) << pos;
    if ((mask & pos_bit) == 0) continue;
    if ((tmp & (((uint64_t) 1) << next_src)) != 0) {
      result |= pos_bit;
    }
    ++next_src;
  }
  return result;
}
// Software bit extract: walks the 64 bit positions of `mask` from low to
// high; for every set position the bit of `tmp` at that position is appended
// to the result at the next free low-order bit.
VEC_INLINE inline uint64_t _cext_u64(uint64_t tmp, uint64_t mask)
{
  uint64_t result = 0;
  uint64_t next_dst = 0;
  for (uint64_t pos = 0; pos < 64; ++pos) {
    const uint64_t pos_bit = ((uint64_t) 1) << pos;
    if ((mask & pos_bit) == 0) continue;
    if ((tmp & pos_bit) != 0) {
      result |= ((uint64_t) 1) << next_dst;
    }
    ++next_dst;
  }
  return result;
}
#define _mm256_add_epi32 _cm256_add_epi32
#define _mm256_and_si256 _cm256_and_si256
#define _mm256_andnot_si256 _cm256_andnot_si256
#define _mm256_cmpeq_epi32 _cm256_cmpeq_epi32
#define _mm256_cmpgt_epi32 _cm256_cmpgt_epi32
#define _mm256_permutevar8x32_epi32 _cm256_permutevar8x32_epi32
#define _mm256_permutevar8x32_ps _cm256_permutevar8x32_ps
#define _mm_maskload_epi32 _cm_maskload_epi32
#define _mm256_maskload_epi32 _cm256_maskload_epi32
#define _mm256_mullo_epi32 _cm256_mullo_epi32
#define _mm256_srlv_epi32 _cm256_srlv_epi32
#define _mm256_mask_i32gather_epi32 _cm256_mask_i32gather_epi32
#define _mm256_mask_i32gather_pd _cm256_mask_i32gather_pd
#define _mm256_mask_i32gather_ps _cm256_mask_i32gather_ps
#define _mm256_i32gather_epi32 _cm256_i32gather_epi32
#define _mm256_i32gather_pd _cm256_i32gather_pd
#define _mm256_i32gather_ps _cm256_i32gather_ps
#define _pdep_u64 _cdep_u64
#define _pext_u64 _cext_u64
#define _mm256_cvtepu8_epi32 _cm256_cvtepu8_epi32
#endif
#ifndef FVEC_FIRST_PASS
// Packs the lanes of `a` whose mask lane is set to the front of the result.
//
// With __AVX2__ the permutation indices are built with pdep/pext and applied
// with a lane permutation. Otherwise a scalar fallback spills mask and data
// to memory and copies the selected lanes one by one.
//
// Fallback fixes: the loop now tests the spilled integer mask (mask_buf[i]);
// it previously subscripted the vector argument `mask` directly and never
// read mask_buf. The output buffer is zero-initialised so lanes past the
// packed elements are no longer read uninitialised.
VEC_INLINE inline __m256 _mm256_compress_ps(__m256 mask, __m256 a)
{
# ifdef __AVX2__
  // unpack each mask bit to a byte
  uint64_t expanded_mask =
      _pdep_u64(_mm256_movemask_ps(mask), 0x0101010101010101);
  // mask |= mask<<1 | mask<<2 | ... | mask<<7;
  expanded_mask *= 0xFF;
  // the identity shuffle for vpermps, packed to one index per byte
  const uint64_t identity_indices = 0x0706050403020100;
  uint64_t wanted_indices = _pext_u64(identity_indices, expanded_mask);
  __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);
  __m256i shufmask = _mm256_cvtepu8_epi32(bytevec);
  return _mm256_permutevar8x32_ps(a, shufmask);
# else
  int mask_buf[8] __attribute__((aligned(32)));
  float a_buf[8] __attribute__((aligned(32)));
  float dst_buf[8] __attribute__((aligned(32))) = {0};
  _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask));
  _mm256_store_ps(a_buf, a);
  int k = 0;
  for (int i = 0; i < 8; i++) {
    if (mask_buf[i]) {
      dst_buf[k++] = a_buf[i];
    }
  }
  return _mm256_load_ps(dst_buf);
# endif
}
// Inverse of _mm256_compress_ps: consecutive low lanes of `a` are distributed
// to the lanes selected by `mask`, in increasing lane order.
//   mask: per-lane selector; the AVX2 path looks at the sign bit of each lane
//         (movemask), the fallback at whether the lane is non-zero.
//   a:    densely packed source lanes.
// Unselected lanes carry no meaningful data (a[0] on the AVX2 path, zero on
// the fallback path); the callers in this file mask them off afterwards.
VEC_INLINE inline __m256 _mm256_expand_ps(__m256 mask, __m256 a) {
# ifdef __AVX2__
  uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask),
                                     0x0101010101010101);
  // widen each mask bit to a full byte
  expanded_mask *= 0xFF;
  // one identity index per byte; pdep drops index k into the k-th selected lane
  const uint64_t identity_indices = 0x0706050403020100;
  uint64_t wanted_indices = _pdep_u64(identity_indices, expanded_mask);
  __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);
  __m256i shufmask = _mm256_cvtepu8_epi32(bytevec);
  return _mm256_permutevar8x32_ps(a, shufmask);
# else
  int mask_buf[8] __attribute__((aligned(32)));
  float a_buf[8] __attribute__((aligned(32)));
  float dst_buf[8] __attribute__((aligned(32))) = {0};
  _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask));
  _mm256_store_ps(a_buf, a);
  int k = 0;
  for (int i = 0; i < 8; i++) {
    // Test the spilled integer copy of the mask; subscripting the __m256
    // itself is a compiler extension and interprets the lane as a float.
    if (mask_buf[i]) {
      dst_buf[i] = a_buf[k++];
    }
  }
  return _mm256_load_ps(dst_buf);
# endif
}
// Double-precision compress, implemented by reinterpreting both operands as
// eight 32-bit lanes and delegating to _mm256_compress_ps.
// NOTE(review): this presumably relies on both 32-bit halves of every 64-bit
// mask lane being set alike -- confirm against how the masks are produced.
VEC_INLINE inline __m256d _mm256_compress_pd(__m256d mask, __m256d a) {
  const __m256 mask_as_ps = _mm256_castpd_ps(mask);
  const __m256 a_as_ps = _mm256_castpd_ps(a);
  const __m256 packed = _mm256_compress_ps(mask_as_ps, a_as_ps);
  return _mm256_castps_pd(packed);
}
// Double-precision expand, implemented by reinterpreting both operands as
// eight 32-bit lanes and delegating to _mm256_expand_ps.
// NOTE(review): this presumably relies on both 32-bit halves of every 64-bit
// mask lane being set alike -- confirm against how the masks are produced.
VEC_INLINE inline __m256d _mm256_expand_pd(__m256d mask, __m256d a) {
  const __m256 mask_as_ps = _mm256_castpd_ps(mask);
  const __m256 a_as_ps = _mm256_castpd_ps(a);
  const __m256 spread = _mm256_expand_ps(mask_as_ps, a_as_ps);
  return _mm256_castps_pd(spread);
}
#endif
class
FVEC_NAME
;
class
IVEC_NAME
;
class
AVEC_NAME
;
// Lane mask for the 256-bit backend. The mask is kept in a floating-point
// register (FVEC_MASK_T); the factory functions below produce lanes that are
// either all-ones or all-zero, and the query functions inspect the sign bit
// of each lane (movemask / testz).
class BVEC_NAME {
  friend class FVEC_NAME;
  friend class IVEC_NAME;
  friend class AVEC_NAME;
# if FVEC_LEN==8
  friend class avec8pd;
# endif
  FVEC_MASK_T val_;
  // Wrap a raw mask register.
  VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {}
  // Wrap the result of an integer comparison (bit pattern is reinterpreted).
  VEC_INLINE BVEC_NAME(const __m256i &v) :
      val_(FVEC_SUFFIX(_mm256_castsi256_)(v)) {}
public:
  // Leaves the mask uninitialised.
  VEC_INLINE BVEC_NAME() {}
  // Lane-wise a AND b.
  VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) {
    return FVEC_SUFFIX(_mm256_and_)(a.val_, b.val_);
  }
  // Lane-wise (NOT a) AND b.
  VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) {
    return FVEC_SUFFIX(_mm256_andnot_)(a.val_, b.val_);
  }
  // Packs the lanes of `a` selected by `mask` into the lowest lanes; the
  // remaining lanes are unspecified.
  VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask,
                                             const BVEC_NAME &a) {
    return FVEC_SUFFIX(_mm256_compress_)(mask.val_, a.val_);
  }
  // Distributes the low lanes of `a` to the lanes selected by `mask`;
  // unselected lanes are taken from `src`.
  VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src,
                                          const BVEC_NAME &mask,
                                          const BVEC_NAME &a) {
    FVEC_MASK_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, a.val_);
    // Drop whatever the expand left in unselected lanes ...
    ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret);
    // ... and fill those lanes from src instead.
    ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_)
        (mask.val_, src.val_));
    return ret;
  }
  // All lanes set: comparing any register with itself yields all-ones.
  VEC_INLINE static BVEC_NAME full() {
    __m256i a = _mm256_undefined_si256();
    return FVEC_SUFFIX(_mm256_castsi256_)(_mm256_cmpeq_epi32(a, a));
  }
  // No lane set.
  VEC_INLINE static BVEC_NAME empty() {
    return FVEC_SUFFIX(_mm256_setzero_)();
  }
  // Mask with the first n lanes set. Requires 0 <= n <= FVEC_LEN (n indexes
  // the lookup table directly).
  VEC_INLINE static BVEC_NAME only(int n) {
    // The tables are read with the aligned load, so they must be 32-byte
    // aligned; each row is exactly 32 bytes, hence every row is aligned too.
    static const unsigned int FULL_ps = (unsigned int) -1;
    static const unsigned int LUT_ps[9][8] __attribute__((aligned(32))) = {
      {0,0,0,0,0,0,0,0},
      {FULL_ps,0,0,0,0,0,0,0},
      {FULL_ps,FULL_ps,0,0,0,0,0,0},
      {FULL_ps,FULL_ps,FULL_ps,0,0,0,0,0},
      {FULL_ps,FULL_ps,FULL_ps,FULL_ps,0,0,0,0},
      {FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,0,0,0},
      {FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,0,0},
      {FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,0},
      {FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps},
    };
    static const unsigned long long FULL_pd = (unsigned long long) -1;
    static const unsigned long long LUT_pd[5][4]
        __attribute__((aligned(32))) = {
      {0,0,0,0},
      {FULL_pd,0,0,0},
      {FULL_pd,FULL_pd,0,0},
      {FULL_pd,FULL_pd,FULL_pd,0},
      {FULL_pd,FULL_pd,FULL_pd,FULL_pd},
    };
    // FVEC_SUFFIX(LUT_) selects LUT_pd or LUT_ps for the current pass.
    return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*)
        FVEC_SUFFIX(LUT_)[n]);
  }
  // Mask with every lane except the first n set. Requires
  // 0 <= n <= FVEC_LEN (n indexes the lookup table directly).
  VEC_INLINE static BVEC_NAME after(int n) {
    static const unsigned int FULL_ps = (unsigned int) -1;
    static const unsigned int LUT_ps[9][8] __attribute__((aligned(32))) = {
      {FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps},
      {0,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps},
      {0,0,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps},
      {0,0,0,FULL_ps,FULL_ps,FULL_ps,FULL_ps,FULL_ps},
      {0,0,0,0,FULL_ps,FULL_ps,FULL_ps,FULL_ps},
      {0,0,0,0,0,FULL_ps,FULL_ps,FULL_ps},
      {0,0,0,0,0,0,FULL_ps,FULL_ps},
      {0,0,0,0,0,0,0,FULL_ps},
      {0,0,0,0,0,0,0,0},
    };
    static const unsigned long long FULL_pd = (unsigned long long) -1;
    static const unsigned long long LUT_pd[5][4]
        __attribute__((aligned(32))) = {
      {FULL_pd,FULL_pd,FULL_pd,FULL_pd},
      {0,FULL_pd,FULL_pd,FULL_pd},
      {0,0,FULL_pd,FULL_pd},
      {0,0,0,FULL_pd},
      {0,0,0,0},
    };
    return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*)
        FVEC_SUFFIX(LUT_)[n]);
  }
  // Mask with `only_` lanes set, starting at lane `after_`. A range that
  // reaches past the last lane is truncated at FVEC_LEN instead of indexing
  // beyond the lookup table. Requires 0 <= after_ <= FVEC_LEN.
  VEC_INLINE static BVEC_NAME onlyafter(int only_, int after_) {
    const int last = after_ + only_;
    return kand(after(after_), only(last < FVEC_LEN ? last : FVEC_LEN));
  }
  // Number of set lanes (counts lane sign bits).
  VEC_INLINE static int popcnt(const BVEC_NAME &a) {
    return _popcnt32(FVEC_SUFFIX(_mm256_movemask_)(a.val_));
  }
  // True when no lane has its sign bit set.
  VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) {
    return FVEC_SUFFIX(_mm256_testz_)(a.val_, a.val_);
  }
  // True when at least one lane has its sign bit set.
  VEC_INLINE static bool test_any_set(const BVEC_NAME &a) {
    return ! test_all_unset(a);
  }
  // True when lane i is set; i must be below FVEC_LEN.
  VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) {
    assert(i < FVEC_LEN);
    return FVEC_SUFFIX(_mm256_movemask_)(a.val_) & (1 << i);
  }
  // Lane-wise AND.
  VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const {
    return FVEC_SUFFIX(_mm256_and_)(val_, b.val_);
  }
  // Lane-wise OR.
  VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const {
    return FVEC_SUFFIX(_mm256_or_)(val_, b.val_);
  }
  // Lane-wise complement, computed as (NOT val_) AND all-ones.
  VEC_INLINE BVEC_NAME operator ~() const {
    return FVEC_SUFFIX(_mm256_andnot_)(val_, full().val_);
  }
};
// Integer companion of FVEC_NAME, stored as eight 32-bit lanes in a __m256i.
// In the FVEC_LEN==4 pass only every second 32-bit lane (0, 2, 4, 6) is read
// back by store() and mask_compressstore(), i.e. one int per 64-bit lane;
// maskz_loadu() writes each loaded int into both halves of its 64-bit lane.
class IVEC_NAME {
  friend class FVEC_NAME;
  friend class AVEC_NAME;
# if FVEC_LEN==8
  friend class avec8pd;
# endif
  __m256i val_;
  // Wrap a raw integer register.
  VEC_INLINE IVEC_NAME(const __m256i &v) : val_(v) {}
  // Reinterpret a float/double register as integer lanes (no conversion).
  VEC_INLINE static __m256i to(const FVEC_VEC_T &a) {
# if FVEC_LEN==4
    return _mm256_castpd_si256(a);
# else
    return _mm256_castps_si256(a);
# endif
  }
  // Reinterpret integer lanes as a float/double register (no conversion).
  VEC_INLINE static FVEC_VEC_T from(const __m256i &a) {
    return FVEC_SUFFIX(_mm256_castsi256_)(a);
  }
public:
  static const int VL = 8;
  // Leaves the vector uninitialised.
  VEC_INLINE IVEC_NAME() {}
// Generates the comparison `the_name(a, b)` returning a lane mask, plus
// `mask_the_name(mask, a, b)` which additionally ANDs the result with `mask`.
#define IVEC_MASK_BINFN_B(the_name) \
VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a, \
const IVEC_NAME &b) { \
return _mm256_##the_name##_epi32(a.val_, b.val_); \
} \
VEC_INLINE static BVEC_NAME mask_##the_name( \
const BVEC_NAME &mask, \
const IVEC_NAME &a, const IVEC_NAME &b \
) { \
BVEC_NAME ret = _mm256_##the_name##_epi32( \
a.val_, b.val_); \
return mask & ret; \
}
  IVEC_MASK_BINFN_B(cmpeq)
  IVEC_MASK_BINFN_B(cmpgt)
  // Helper so the macro above can also generate cmplt: a < b is evaluated as
  // b > a, with lanes where a == b cleared.
  VEC_INLINE static __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) {
    __m256i le = _mm256_cmpgt_epi32(b, a);
    __m256i eq = _mm256_cmpeq_epi32(a, b);
    return _mm256_andnot_si256(eq, le);
  }
  // Helper so the macro above can also generate cmpneq: NOT (a == b), using
  // an all-ones register obtained by comparing a register with itself.
  VEC_INLINE static __m256i _mm256_cmpneq_epi32(__m256i a, __m256i b) {
    __m256i eq = _mm256_cmpeq_epi32(a, b);
    __m256i t = _mm256_undefined_si256();
    __m256i f = _mm256_cmpeq_epi32(t, t);
    return _mm256_andnot_si256(eq, f);
  }
  IVEC_MASK_BINFN_B(cmplt)
  IVEC_MASK_BINFN_B(cmpneq)
#undef IVEC_MASK_BINFN_B
  // Per lane: b where the mask is set, a otherwise.
  VEC_INLINE static IVEC_NAME mask_blend(const BVEC_NAME &mask,
      const IVEC_NAME &a, const IVEC_NAME &b) {
    return to(FVEC_SUFFIX(_mm256_blendv_)(from(a.val_), from(b.val_),
        mask.val_));
  }
// Generates `mask_the_name(src, mask, a, b)`: the operation result where the
// mask is set, `src` elsewhere.
#define IVEC_MASK_BINFN_I(the_name) \
VEC_INLINE static IVEC_NAME mask_##the_name( \
const IVEC_NAME &src, const BVEC_NAME &mask, \
const IVEC_NAME &a, const IVEC_NAME &b \
) { \
IVEC_NAME ret = _mm256_##the_name##_epi32( \
a.val_, b.val_); \
return mask_blend(mask, src, ret); \
}
  IVEC_MASK_BINFN_I(add)
#undef IVEC_MASK_BINFN_I
// Generates the unmasked binary operation `the_name(a, b)`.
#define IVEC_BINFN_I(the_name) \
VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a, \
const IVEC_NAME &b) { \
return _mm256_##the_name##_epi32(a.val_, b.val_); \
}
  IVEC_BINFN_I(mullo)
  IVEC_BINFN_I(srlv)
#undef IVEC_BINFN_I
  // Bitwise AND of the two registers.
  VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) {
    return _mm256_and_si256(a.val_, b.val_);
  }
  // Packs the lanes of `b` selected by `mask` into the lowest lanes; the
  // remaining lanes are unspecified.
  VEC_INLINE static IVEC_NAME masku_compress(const BVEC_NAME &mask,
      const IVEC_NAME &b) {
    return to(FVEC_SUFFIX(_mm256_compress_)(mask.val_, from(b.val_)));
  }
  // Distributes the low lanes of `b` to the lanes selected by `mask`;
  // unselected lanes are taken from `src`.
  VEC_INLINE static IVEC_NAME mask_expand(const IVEC_NAME &src,
      const BVEC_NAME &mask, const IVEC_NAME &b) {
    FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, from(b.val_));
    ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret);
    ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_)
        (mask.val_, from(src.val_)));
    return to(ret);
  }
  // Aligned store of all eight 32-bit lanes to `dest`, so `dest` must have
  // room for eight ints even when FVEC_LEN==4. In that case the even lanes
  // are afterwards compacted into dest[0..3].
  VEC_INLINE static void store(int * dest, const IVEC_NAME &src) {
    _mm256_store_si256((__m256i*)dest, src.val_);
# if FVEC_LEN==4
    dest[1] = dest[2];
    dest[2] = dest[4];
    dest[3] = dest[6];
# endif
  }
  // Returns logical element `b` by spilling the vector through store().
  VEC_INLINE static int at(const IVEC_NAME &a, int b) {
    int data[8] __attribute__((aligned(32)));
    store(data, a);
    return data[b];
  }
  // Debug helper: prints `str` followed by the FVEC_LEN logical elements.
  VEC_INLINE static void print(const char * str, const IVEC_NAME &a) {
    int data[8] __attribute__((aligned(32)));
    store(data, a);
    printf("%s:", str);
    for (int i = 0; i < FVEC_LEN; i++) {
      printf(" %d", data[i]);
    }
    printf("\n");
  }
  // Loads consecutive ints from `src` for the lanes selected by `mask`;
  // unselected lanes become zero.
  VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask,
      const int * src) {
    FVEC_VEC_T mask_val = mask.val_;
# if FVEC_LEN==4
# ifdef __AVX2__
    // Gather the low half of each 64-bit mask lane into the low 128 bits.
    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
        {0, 2, 4, 6, 0, 0, 0, 0};
    __m256 m = _mm256_castpd_ps(mask_val);
    m = _mm256_permutevar8x32_ps(m, _mm256_load_si256((__m256i*)mask_shuffle));
    __m128i ret = _mm_maskload_epi32(src,
        _mm256_castsi256_si128(_mm256_castps_si256(m)));
    // Duplicate every loaded int into both halves of its 64-bit lane.
    static const unsigned int load_shuffle[8] __attribute__((aligned(32))) =
        {0, 0, 1, 1, 2, 2, 3, 3};
    return _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ret),
        _mm256_load_si256((__m256i*)load_shuffle));
# else
    int dest[8] __attribute__((aligned(32))) = {0};
    int mask_buf[8] __attribute__((aligned(32)));
    _mm256_store_pd((double*) mask_buf, mask.val_);
    for (int i = 0; i < 4; i++) {
      // Only the low half of each 64-bit mask lane is inspected.
      if (mask_buf[2*i]) {
        int val = src[i];
        dest[2*i+0] = val;
        dest[2*i+1] = val;
      }
    }
    return _mm256_load_si256((__m256i*) dest);
# endif
# else
    return _mm256_maskload_epi32(src, to(mask_val));
# endif
  }
  // Gathers mem[idx] for the lanes selected by `mask`; other lanes keep
  // `src`. `scale` must equal sizeof(int).
  VEC_INLINE static IVEC_NAME mask_gather(
      const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
      const int * mem, const int scale
  ) {
    assert(scale == sizeof(int));
    return _mm256_mask_i32gather_epi32(src.val_, mem, idx.val_, to(mask.val_),
        sizeof(int));
  }
  // Writes the selected logical elements of `src` contiguously to `dest`.
  VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest,
      const IVEC_NAME &src) {
    int buf[8] __attribute__((aligned(64)));
    // With FVEC_LEN==4 the logical elements sit in every second 32-bit lane.
    const int stride = FVEC_LEN==4 ? 2 : 1;
    _mm256_store_si256((__m256i*)buf, src.val_);
    int mask_val = FVEC_SUFFIX(_mm256_movemask_)(mask.val_);
    int k = 0;
    #pragma unroll
    for (int i = 0; i < FVEC_LEN; i++) {
      if (mask_val & (1 << i))
        dest[k++] = buf[stride * i];
    }
  }
  // Broadcasts `i` to all eight 32-bit lanes.
  VEC_INLINE static IVEC_NAME set1(int i) {
    return _mm256_set1_epi32(i);
  }
  // All lanes zero.
  VEC_INLINE static IVEC_NAME setzero() {
    return _mm256_setzero_si256();
  }
  // Vector with unspecified contents.
  VEC_INLINE static IVEC_NAME undefined() {
    return _mm256_undefined_si256();
  }
  // Lane-wise 32-bit addition.
  VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const {
    return _mm256_add_epi32(this->val_, b.val_);
  }
};
// Floating-point vector of the 256-bit backend: FVEC_LEN lanes of
// FVEC_SCAL_T held in an FVEC_VEC_T register. Functions taking a `scale`
// argument assert that it equals sizeof(FVEC_SCAL_T).
class FVEC_NAME {
  friend class AVEC_NAME;
#if FVEC_LEN==8
  friend class avec8pd;
#endif
  FVEC_VEC_T val_;
  // Wrap a raw register.
  VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {}
public:
  static const int VL = FVEC_LEN;
  // Reports whether compress/expand take the shuffle-based path (true) or
  // the scalar fallback loop (false).
# if defined(__AVX2__) || defined(__MIC__) || defined(__AVX512F__)
  VEC_INLINE static bool fast_compress() { return true; }
# else
  VEC_INLINE static bool fast_compress() { return false; }
# endif

  // Leaves the vector uninitialised.
  VEC_INLINE FVEC_NAME() {}
  // Returns lane i (i must be below FVEC_LEN) by spilling to the stack.
  VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) {
    assert(i < FVEC_LEN);
    FVEC_SCAL_T data[FVEC_LEN] __attribute__((aligned(64)));
    FVEC_SUFFIX(_mm256_store_)(data, a.val_);
    return data[i];
  }

// Generates the comparison `the_name(a, b)` with predicate `the_imm`, plus
// `mask_the_name(mask, a, b)` which ANDs the result with `mask`.
#define FVEC_MASK_BINFN_B(the_name, the_imm) \
  VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a, \
                                       const FVEC_NAME &b) { \
    return FVEC_SUFFIX(_mm256_cmp_)(a.val_, b.val_, the_imm); \
  } \
  VEC_INLINE static BVEC_NAME mask_##the_name( \
      const BVEC_NAME &mask, \
      const FVEC_NAME &a, const FVEC_NAME &b \
  ) { \
    BVEC_NAME ret = FVEC_SUFFIX(_mm256_cmp_)( \
        a.val_, b.val_, the_imm); \
    return mask & ret; \
  }
  FVEC_MASK_BINFN_B(cmple, _CMP_LE_OS)
  FVEC_MASK_BINFN_B(cmplt, _CMP_LT_OS)
  FVEC_MASK_BINFN_B(cmpneq, _CMP_NEQ_UQ)
  FVEC_MASK_BINFN_B(cmpnle, _CMP_NLE_US)
  FVEC_MASK_BINFN_B(cmpnlt, _CMP_NLT_US)
#undef FVEC_MASK_BINFN_B

  // Helpers named like intrinsics so that FVEC_SUFFIX(_mm256_<name>_) in the
  // generator macros below resolves for both precisions.
  // Reciprocal by full-precision division.
  VEC_INLINE static __m256d _mm256_recip_pd(__m256d a) {
    __m256d c_1 = _mm256_set1_pd(1);
    return _mm256_div_pd(c_1, a);
  }
  // Reciprocal via the rcp instruction.
  VEC_INLINE static __m256 _mm256_recip_ps(__m256 a) {
    return _mm256_rcp_ps(a);
  }
  // Absolute value: clear the sign bit of every 64-bit lane. The mask is
  // built in a register, so no aligned load from a stack array is needed.
  VEC_INLINE static __m256d _mm256_abs_pd(__m256d a) {
    const __m256d abs_mask =
        _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL));
    return _mm256_and_pd(abs_mask, a);
  }
  // Absolute value: clear the sign bit of every 32-bit lane. The mask must
  // be replicated per 32-bit lane; a table of 64-bit elements holding
  // 0x7FFFFFFF would zero every odd lane.
  VEC_INLINE static __m256 _mm256_abs_ps(__m256 a) {
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    return _mm256_and_ps(abs_mask, a);
  }

// Generates the unary function `the_name(a)`.
#define FVEC_UNFN_F(the_name) \
  VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) { \
    return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_); \
  }
  FVEC_UNFN_F(abs)
  FVEC_UNFN_F(exp)
  FVEC_UNFN_F(invsqrt)
  FVEC_UNFN_F(recip)
  FVEC_UNFN_F(sqrt)
#undef FVEC_UNFN_F

  // Per lane: b where the mask is set, a otherwise.
  VEC_INLINE static FVEC_NAME mask_blend(const BVEC_NAME &mask,
      const FVEC_NAME &a, const FVEC_NAME &b) {
    return FVEC_SUFFIX(_mm256_blendv_)(a.val_, b.val_, mask.val_);
  }

// Generates `mask_the_name(src, mask, a)`: the function result where the
// mask is set, `src` elsewhere. The function is evaluated on all lanes.
#define FVEC_MASK_UNFN_F(the_name) \
  VEC_INLINE static FVEC_NAME mask_##the_name( \
      const FVEC_NAME &src, const BVEC_NAME &mask, \
      const FVEC_NAME &a \
  ) { \
    FVEC_NAME ret = FVEC_SUFFIX(_mm256_##the_name##_)( \
        a.val_); \
    return mask_blend(mask, src, ret); \
  }
  FVEC_MASK_UNFN_F(cos)
  FVEC_MASK_UNFN_F(recip)
  FVEC_MASK_UNFN_F(sqrt)
#undef FVEC_MASK_UNFN_F

// Generates the binary function `the_name(a, b)`.
#define FVEC_BINFN_F(the_name) \
  VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a, \
                                       const FVEC_NAME &b) { \
    return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_, b.val_); \
  }
  FVEC_BINFN_F(max)
  FVEC_BINFN_F(min)
#undef FVEC_BINFN_F

// Generates `mask_the_name(src, mask, a, b)`: the operation result where the
// mask is set, `src` elsewhere.
#define FVEC_MASK_BINFN_F(the_name) \
  VEC_INLINE static FVEC_NAME mask_##the_name( \
      const FVEC_NAME &src, const BVEC_NAME &mask, \
      const FVEC_NAME &a, const FVEC_NAME &b \
  ) { \
    FVEC_NAME ret = FVEC_SUFFIX(_mm256_##the_name##_)( \
        a.val_, b.val_); \
    return mask_blend(mask, src, ret); \
  }
  FVEC_MASK_BINFN_F(add)
  FVEC_MASK_BINFN_F(div)
  FVEC_MASK_BINFN_F(mul)
  FVEC_MASK_BINFN_F(sub)
#undef FVEC_MASK_BINFN_F

  // Distributes the low lanes of `b` to the lanes selected by `mask`;
  // unselected lanes are taken from `src`.
  VEC_INLINE static FVEC_NAME mask_expand(const FVEC_NAME &src,
      const BVEC_NAME &mask, const FVEC_NAME &b) {
    FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, b.val_);
    ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret);
    ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_)
        (mask.val_, src.val_));
    return ret;
  }
  // Packs the lanes of `b` selected by `mask` into the lowest lanes; the
  // remaining lanes are unspecified.
  VEC_INLINE static FVEC_NAME masku_compress(const BVEC_NAME &mask,
      const FVEC_NAME &b) {
    return FVEC_SUFFIX(_mm256_compress_)(mask.val_, b.val_);
  }

  // Broadcasts `a` to all lanes.
  VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) {
    return FVEC_SUFFIX(_mm256_set1_)(a);
  }
  // All lanes zero.
  VEC_INLINE static FVEC_NAME setzero() {
    return FVEC_SUFFIX(_mm256_setzero_)();
  }
  // Vector with unspecified contents.
  VEC_INLINE static FVEC_NAME undefined() {
    return FVEC_SUFFIX(_mm256_undefined_)();
  }

  // Aligned load of FVEC_LEN elements from `mem`.
  VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) {
    return FVEC_SUFFIX(_mm256_load_)(mem);
  }
  // Aligned store of FVEC_LEN elements to `dest`.
  VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) {
    FVEC_SUFFIX(_mm256_store_)(dest, a.val_);
  }

  // Returns mem[idx] for every lane. In the double pass the indices are read
  // from the even 32-bit lanes of `idx`.
  VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx,
      const FVEC_SCAL_T * mem, const int scale) {
    assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==4
#  ifdef __AVX2__
    // Move the even 32-bit lanes into the low 128 bits for the gather.
    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
        {0, 2, 4, 6, 0, 0, 0, 0};
    __m256i m = _mm256_permutevar8x32_epi32(idx.val_,
        _mm256_load_si256((const __m256i*)mask_shuffle));
    __m128i idx_short = _mm256_castsi256_si128(m);
    return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx_short,
        sizeof(FVEC_SCAL_T));
#  else
    int idx_buf[8] __attribute__((aligned(32)));
    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
    double dest[4] __attribute__((aligned(32)));
    for (int i = 0; i < 4; i++) {
      dest[i] = mem[idx_buf[2*i]];
    }
    return _mm256_load_pd(dest);
#  endif
# else
    return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx.val_,
        sizeof(FVEC_SCAL_T));
# endif
  }
  // Like gather(), but only for the lanes selected by `mask`; other lanes
  // keep `src`.
  VEC_INLINE static FVEC_NAME mask_gather(
      const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
      const FVEC_SCAL_T * mem, const int scale
  ) {
    assert(scale == sizeof(FVEC_SCAL_T));
# if FVEC_LEN==4
#  ifdef __AVX2__
    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
        {0, 2, 4, 6, 0, 0, 0, 0};
    __m256i m = _mm256_permutevar8x32_epi32(idx.val_,
        _mm256_load_si256((const __m256i*)mask_shuffle));
    __m128i idx_short = _mm256_castsi256_si128(m);
    return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx_short,
        mask.val_, sizeof(FVEC_SCAL_T));
#  else
    int idx_buf[8] __attribute__((aligned(32)));
    int mask_buf[8] __attribute__((aligned(32)));
    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
    _mm256_store_pd((double*) mask_buf, mask.val_);
    double dest[4] __attribute__((aligned(32)));
    // Start from src so that unselected lanes pass through unchanged.
    _mm256_store_pd((double*) dest, src.val_);
    for (int i = 0; i < 4; i++) {
      if (mask_buf[2*i])
        dest[i] = mem[idx_buf[2*i]];
    }
    return _mm256_load_pd(dest);
#  endif
# else
    return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx.val_,
        mask.val_, sizeof(FVEC_SCAL_T));
# endif
  }

  // For every lane l, loads the four consecutive elements starting at
  // mem[idx_l] and transposes them: *out_k receives element k of each group.
  // The double pass uses aligned 256-bit loads, so each &mem[idx_l] must be
  // 32-byte aligned there.
  VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx,
      const FVEC_SCAL_T * mem, const int scale, FVEC_NAME * out_0,
      FVEC_NAME * out_1, FVEC_NAME * out_2, FVEC_NAME * out_3) {
    assert(scale == sizeof(FVEC_SCAL_T));
    int idx_buf[8] __attribute__((aligned(32)));
    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
# if FVEC_LEN==4
    __m256d a0 = _mm256_load_pd(&mem[idx_buf[0]]);
    __m256d a1 = _mm256_load_pd(&mem[idx_buf[2]]);
    __m256d a2 = _mm256_load_pd(&mem[idx_buf[4]]);
    __m256d a3 = _mm256_load_pd(&mem[idx_buf[6]]);
    // 4x4 transpose: interleave pairs, then recombine 128-bit halves.
    __m256d b0 = _mm256_unpacklo_pd(a0, a1);
    __m256d b1 = _mm256_unpackhi_pd(a0, a1);
    __m256d b2 = _mm256_unpacklo_pd(a2, a3);
    __m256d b3 = _mm256_unpackhi_pd(a2, a3);
    *out_0 = _mm256_permute2f128_pd(b0, b2, 0x20);
    *out_1 = _mm256_permute2f128_pd(b1, b3, 0x20);
    *out_2 = _mm256_permute2f128_pd(b0, b2, 0x31);
    *out_3 = _mm256_permute2f128_pd(b1, b3, 0x31);
# else
    const float *e0 = &mem[idx_buf[0]];
    const float *e1 = &mem[idx_buf[1]];
    const float *e2 = &mem[idx_buf[2]];
    const float *e3 = &mem[idx_buf[3]];
    const float *e4 = &mem[idx_buf[4]];
    const float *e5 = &mem[idx_buf[5]];
    const float *e6 = &mem[idx_buf[6]];
    const float *e7 = &mem[idx_buf[7]];
    // Each register pairs group l (low half) with group l+4 (high half),
    // so the in-lane unpack/shuffle below transposes both halves at once.
    __m256 a0 = _mm256_loadu2_m128(e4, e0);
    __m256 a1 = _mm256_loadu2_m128(e5, e1);
    __m256 b0 = _mm256_unpacklo_ps(a0, a1);
    __m256 b1 = _mm256_unpackhi_ps(a0, a1);
    __m256 a2 = _mm256_loadu2_m128(e6, e2);
    __m256 a3 = _mm256_loadu2_m128(e7, e3);
    __m256 b2 = _mm256_unpacklo_ps(a2, a3);
    __m256 b3 = _mm256_unpackhi_ps(a2, a3);
    *out_0 = _mm256_shuffle_ps(b0, b2, 0x44);
    *out_1 = _mm256_shuffle_ps(b0, b2, 0xEE);
    *out_2 = _mm256_shuffle_ps(b1, b3, 0x44);
    *out_3 = _mm256_shuffle_ps(b1, b3, 0xEE);
# endif
  }
  // Same as gather_4_adjacent, discarding the fourth output. Four elements
  // per group are still read from memory.
  VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx,
      const FVEC_SCAL_T * mem, const int scale, FVEC_NAME * out_0,
      FVEC_NAME * out_1, FVEC_NAME * out_2) {
    assert(scale == sizeof(FVEC_SCAL_T));
    FVEC_NAME tmp_3;
    gather_4_adjacent(idx, mem, scale, out_0, out_1, out_2, &tmp_3);
  }

  // Horizontal sum of the four double lanes.
  VEC_INLINE static double _mm256_reduce_add_pd(__m256d a) {
    __m256d t1 = _mm256_hadd_pd(a, a);
    __m128d t2 = _mm256_extractf128_pd(t1, 1);
    __m128d t3 = _mm256_castpd256_pd128(t1);
    return _mm_cvtsd_f64(_mm_add_pd(t2, t3));
  }
  // Horizontal sum of the eight float lanes.
  VEC_INLINE static float _mm256_reduce_add_ps(__m256 a) {
    __m256 t1 = _mm256_hadd_ps(a, a);
    __m128 t2 = _mm256_extractf128_ps(t1, 1);
    __m128 t3 = _mm256_castps256_ps128(t1);
    __m128 t4 = _mm_add_ps(t2, t3);
    __m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse
    return _mm_cvtss_f32(_mm_add_ps(t4, t5));
  }
  // Sum of all lanes.
  VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) {
    return FVEC_SUFFIX(_mm256_reduce_add_)(a.val_);
  }
  // Sum of the lanes selected by `mask` (unselected lanes are zeroed first).
  VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask,
      const FVEC_NAME &a) {
    return reduce_add(FVEC_SUFFIX(_mm256_and_)(mask.val_, a.val_));
  }

  // Reinterprets the register as integers. In the double pass the low 32
  // bits of every 64-bit lane are duplicated into both halves of that lane.
  VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) {
# if FVEC_LEN==4
#  ifdef __AVX2__
    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
        {0, 0, 2, 2, 4, 4, 6, 6};
    __m256 m = _mm256_permutevar8x32_ps(_mm256_castpd_ps(a.val_),
        _mm256_load_si256((const __m256i*)mask_shuffle));
    return _mm256_castps_si256(m);
#  else
    __m128i a_lo = _mm256_castsi256_si128(_mm256_castpd_si256(a.val_));
    __m128i a_hi = _mm256_extractf128_si256(_mm256_castpd_si256(a.val_), 1);
    __m128i c_lo = _mm_shuffle_epi32(a_lo, 0xA0); /*1010 0000*/
    __m128i c_hi = _mm_shuffle_epi32(a_hi, 0xA0);
    __m256i ret = _mm256_setr_m128i(c_lo, c_hi);
    return ret;
#  endif
# else
    return _mm256_castps_si256(a.val_);
# endif
  }

  // Computes sin(arg) and cos(arg) on all lanes. Returns the sine where the
  // mask is set and src_a elsewhere; *cos receives the cosine where the mask
  // is set and src_b elsewhere.
  VEC_INLINE static FVEC_NAME mask_sincos(
      FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b,
      const BVEC_NAME &mask, const FVEC_NAME &arg
  ) {
    FVEC_VEC_T c, s = FVEC_SUFFIX(_mm256_sincos_)(&c, arg.val_);
    *cos = mask_blend(mask, src_b, c);
    return mask_blend(mask, src_a, s);
  }

// Generates the lane-wise arithmetic operator `the_sym`.
#define FVEC_BINOP(the_sym, the_name) \
  VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \
    return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_); \
  }
  FVEC_BINOP(+, add)
  FVEC_BINOP(-, sub)
  FVEC_BINOP(*, mul)
  FVEC_BINOP(/, div)
#undef FVEC_BINOP

  // Gather prefetch is a no-op in this backend.
  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
    /* NOP */
  }
};
class
AVEC_NAME
{
friend
class
avec8pd
;
FVEC_VEC_T
val_
;
VEC_INLINE
AVEC_NAME
(
const
FVEC_VEC_T
&
a
)
:
val_
(
a
)
{}
public:
VEC_INLINE
AVEC_NAME
(
const
FVEC_NAME
&
a
)
:
val_
(
a
.
val_
)
{}
VEC_INLINE
static
AVEC_NAME
undefined
()
{
return
FVEC_SUFFIX
(
_mm256_undefined_
)();
}
VEC_INLINE
static
AVEC_NAME
mask_gather
(
const
AVEC_NAME
&
src
,
const
BVEC_NAME
&
mask
,
const
IVEC_NAME
&
idx
,
const
FVEC_SCAL_T
*
mem
,
const
int
scale
)
{
assert
(
scale
==
sizeof
(
FVEC_SCAL_T
));
return
FVEC_NAME
::
mask_gather
(
src
.
val_
,
mask
,
idx
,
mem
,
scale
);
}
VEC_INLINE
static
void
mask_i32loscatter
(
FVEC_SCAL_T
*
mem
,
const
BVEC_NAME
&
mask
,
const
IVEC_NAME
&
idx
,
const
AVEC_NAME
&
a
,
const
int
scale
)
{
assert
(
scale
==
sizeof
(
FVEC_SCAL_T
));
for
(
int
l
=
0
;
l
<
FVEC_NAME
::
VL
;
l
++
)
{
if
(
BVEC_NAME
::
test_at
(
mask
,
l
))
mem
[
IVEC_NAME
::
at
(
idx
,
l
)]
=
FVEC_NAME
::
at
(
a
.
val_
,
l
);
}
}
#define AVEC_BINOP(the_sym, the_name) \
VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \
return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_); \
}
AVEC_BINOP
(
-
,
sub
)
#undef AVEC_BINOP
};
#if FVEC_LEN==8
class
avec8pd
{
__m256d
lo_
,
hi_
;
VEC_INLINE
avec8pd
(
const
__m256d
&
lo
,
const
__m256d
&
hi
)
:
lo_
(
lo
),
hi_
(
hi
)
{}
VEC_INLINE
static
__m128
get_ps_hi
(
__m256
a
)
{
return
_mm256_extractf128_ps
(
a
,
1
);
}
VEC_INLINE
static
__m128
get_ps_lo
(
__m256
a
)
{
return
_mm256_castps256_ps128
(
a
);
}
VEC_INLINE
static
__m128i
get_si_hi
(
__m256i
a
)
{
return
_mm_castps_si128
(
get_ps_hi
(
_mm256_castsi256_ps
(
a
)));
}
VEC_INLINE
static
__m128i
get_si_lo
(
__m256i
a
)
{
return
_mm_castps_si128
(
get_ps_lo
(
_mm256_castsi256_ps
(
a
)));
}
public:
VEC_INLINE
avec8pd
(
const
FVEC_NAME
&
a
)
{
lo_
=
_mm256_cvtps_pd
(
get_ps_lo
(
a
.
val_
));
hi_
=
_mm256_cvtps_pd
(
get_ps_hi
(
a
.
val_
));
}
VEC_INLINE
static
avec8pd
undefined
()
{
return
avec8pd
(
_mm256_undefined_pd
(),
_mm256_undefined_pd
());
}
VEC_INLINE
static
avec8pd
mask_gather
(
const
avec8pd
&
src
,
const
BVEC_NAME
&
mask
,
const
IVEC_NAME
&
idx
,
const
double
*
mem
,
const
int
scale
)
{
# ifndef __AVX2__
assert
(
scale
==
sizeof
(
double
));
int
idx_buf
[
8
]
__attribute__
((
aligned
(
32
)));
_mm256_store_si256
((
__m256i
*
)
idx_buf
,
idx
.
val_
);
int
mask_val
=
_mm256_movemask_ps
(
mask
.
val_
);
double
ret_buf
[
8
]
__attribute__
((
aligned
(
32
)));
_mm256_store_pd
(
&
ret_buf
[
0
],
src
.
lo_
);
_mm256_store_pd
(
&
ret_buf
[
4
],
src
.
hi_
);
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
if
(
mask_val
&
(
1
<<
i
))
{
ret_buf
[
i
]
=
mem
[
idx_buf
[
i
]];
}
}
__m256d
lo
=
_mm256_load_pd
(
&
ret_buf
[
0
]);
__m256d
hi
=
_mm256_load_pd
(
&
ret_buf
[
4
]);
# else
static
const
unsigned
int
lo_shuffle
[
8
]
__attribute__
((
aligned
(
32
)))
=
{
0
,
0
,
1
,
1
,
2
,
2
,
3
,
3
};
static
const
unsigned
int
hi_shuffle
[
8
]
__attribute__
((
aligned
(
32
)))
=
{
4
,
4
,
5
,
5
,
6
,
6
,
7
,
7
};
__m256d
lo_mask
=
_mm256_castps_pd
(
_mm256_permutevar8x32_ps
(
mask
.
val_
,
_mm256_load_si256
((
__m256i
*
)
lo_shuffle
)));
__m256d
hi_mask
=
_mm256_castps_pd
(
_mm256_permutevar8x32_ps
(
mask
.
val_
,
_mm256_load_si256
((
__m256i
*
)
hi_shuffle
)));
__m256d
lo
=
_mm256_mask_i32gather_pd
(
src
.
lo_
,
mem
,
get_si_lo
(
idx
.
val_
),
lo_mask
,
sizeof
(
double
));
__m256d
hi
=
_mm256_mask_i32gather_pd
(
src
.
hi_
,
mem
,
get_si_hi
(
idx
.
val_
),
hi_mask
,
sizeof
(
double
));
# endif
return
avec8pd
(
lo
,
hi
);
}
VEC_INLINE
static
void
mask_i32loscatter
(
double
*
mem
,
const
BVEC_NAME
&
mask
,
const
IVEC_NAME
&
idx
,
const
avec8pd
&
a
,
const
int
scale
)
{
assert
(
scale
==
sizeof
(
double
));
double
a_buf
[
8
]
__attribute__
((
aligned
(
32
)));
_mm256_store_pd
(
a_buf
,
a
.
lo_
);
_mm256_store_pd
(
&
a_buf
[
4
],
a
.
hi_
);
int
idx_buf
[
8
]
__attribute__
((
aligned
(
32
)));
_mm256_store_si256
((
__m256i
*
)
idx_buf
,
idx
.
val_
);
int
mask_val
=
_mm256_movemask_ps
(
mask
.
val_
);
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
if
(
mask_val
&
(
1
<<
i
))
mem
[
idx_buf
[
i
]]
=
a_buf
[
i
];
}
}
#define AVEC2_BINOP(the_sym, the_name) \
VEC_INLINE inline avec8pd operator the_sym(const avec8pd &b) const { \
__m256d lo = _mm256_##the_name##_pd(this->lo_, b.lo_); \
__m256d hi = _mm256_##the_name##_pd(this->hi_, b.hi_); \
return avec8pd(lo, hi); \
}
AVEC2_BINOP
(
-
,
sub
)
};
#endif
}
#ifdef FVEC_FIRST_PASS
// Trait that maps a (compute type, accumulator type) pair to the mm256
// vector classes. Only the three specialisations below are provided.
template<typename flt_t, typename acc_t>
struct intr_types;

// double compute, double accumulate: 4-lane classes.
template<>
struct intr_types<double, double> {
  typedef mm256::fvec4pd fvec;
  typedef mm256::ivec4 ivec;
  typedef mm256::bvec4 bvec;
  typedef mm256::avec4pd avec;
};

// float compute, float accumulate: 8-lane classes.
template<>
struct intr_types<float, float> {
  typedef mm256::fvec8ps fvec;
  typedef mm256::ivec8 ivec;
  typedef mm256::bvec8 bvec;
  typedef mm256::avec8ps avec;
};

// float compute, double accumulate: 8-lane classes with the two-register
// double accumulator.
template<>
struct intr_types<float, double> {
  typedef mm256::fvec8ps fvec;
  typedef mm256::ivec8 ivec;
  typedef mm256::bvec8 bvec;
  typedef mm256::avec8pd avec;
};
#endif
#ifndef FVEC_FIRST_PASS
# define FVEC_FIRST_PASS
# include "intel_intrinsics_airebo.h"
#endif
#endif
#ifdef LMP_INTEL_AIREBO_SCALAR
#include <cassert>
#include <cmath>
#include <immintrin.h>
#define VEC_INLINE __attribute__((always_inline))
template
<
typename
flt_t
,
typename
acc_t
>
struct
intr_types
{
class
fvec
;
class
ivec
;
class
avec
;
// Single-lane mask of the scalar backend: a plain bool behind the interface
// of the vector mask classes.
class bvec {
  friend class fvec;
  friend class ivec;
  friend class avec;
  bool val_;
  // Wrap a raw flag.
  VEC_INLINE bvec(const bool &v) : val_(v) {}
public:
  // Leaves the flag uninitialised.
  VEC_INLINE bvec() {}
  // a AND b.
  VEC_INLINE static bvec kand(const bvec &a, const bvec &b) {
    return bvec(a.val_ && b.val_);
  }
  // (NOT a) AND b.
  VEC_INLINE static bvec kandn(const bvec &a, const bvec &b) {
    return bvec(!a.val_ && b.val_);
  }
  // NOT a.
  VEC_INLINE static bvec knot(const bvec &a) {
    return bvec(!a.val_);
  }
  // 1 when neither a nor b is set, 0 otherwise.
  VEC_INLINE static int kortestz(const bvec &a, const bvec &b) {
    return !(a.val_ || b.val_);
  }
  // a when the mask is set, false otherwise.
  VEC_INLINE static bvec masku_compress(const bvec &mask, const bvec &a) {
    return bvec(mask.val_ && a.val_);
  }
  // a when the mask is set, src otherwise.
  VEC_INLINE static bvec mask_expand(const bvec &src, const bvec &mask,
                                     const bvec &a) {
    if (mask.val_) return bvec(a.val_);
    return bvec(src.val_);
  }
  // Set mask.
  VEC_INLINE static bvec full() {
    return bvec(true);
  }
  // Cleared mask.
  VEC_INLINE static bvec empty() {
    return bvec(false);
  }
  // Set exactly when n == 1.
  VEC_INLINE static bvec only(int n) {
    return bvec(n == 1);
  }
  // Set exactly when n == 0.
  VEC_INLINE static bvec after(int n) {
    return bvec(n == 0);
  }
  // Set exactly when after == 0 and only == 1.
  VEC_INLINE static bvec onlyafter(int only, int after) {
    return bvec(after == 0 && only == 1);
  }
  // 1 when set, 0 otherwise.
  VEC_INLINE static int popcnt(const bvec &a) {
    return a.val_ ? 1 : 0;
  }
  // True when the mask is not set.
  VEC_INLINE static bool test_all_unset(const bvec &a) {
    return !a.val_;
  }
  // True when the mask is set.
  VEC_INLINE static bool test_any_set(const bvec &a) {
    return a.val_;
  }
  // Value of the single lane; i must be below 1.
  VEC_INLINE static bool test_at(const bvec &a, int i) {
    assert(i < 1);
    return a.val_;
  }
  // Logical AND.
  VEC_INLINE bvec operator &(const bvec &b) const {
    return bvec(val_ && b.val_);
  }
  // Logical OR.
  VEC_INLINE bvec operator |(const bvec &b) const {
    return bvec(val_ || b.val_);
  }
  // Logical NOT.
  VEC_INLINE bvec operator ~() const {
    return bvec(!val_);
  }
};
class
ivec
{
friend
class
fvec
;
friend
class
avec
;
int
val_
;
// Wrap a raw int; this is what lets the members below return plain ints.
VEC_INLINE ivec(const int &v) : val_(v) {}
public:
static
const
int
VL
=
1
;
// Leaves the value uninitialised.
VEC_INLINE ivec() {}
// Generates the comparison `the_name(a, b)` using operator `the_op`, plus
// `mask_the_name(mask, a, b)` which is additionally false when `mask` is not
// set.
#define IVEC_MASK_BINFN_B(the_name, the_op) \
VEC_INLINE static bvec the_name(const ivec &a, const ivec &b) { \
return a.val_ the_op b.val_; \
} \
VEC_INLINE static bvec mask_##the_name( \
const bvec &mask, \
const ivec &a, const ivec &b \
) { \
return mask.val_ && (a.val_ the_op b.val_); \
\
}
IVEC_MASK_BINFN_B(cmpeq, ==)
IVEC_MASK_BINFN_B(cmplt, <)
IVEC_MASK_BINFN_B(cmpneq, !=)
IVEC_MASK_BINFN_B(cmpgt, >)
// Generates `mask_the_name(src, mask, a, b)`: `a the_op b` when the mask is
// set, `src` otherwise.
#define IVEC_MASK_BINFN_I(the_name, the_op) \
VEC_INLINE static ivec mask_##the_name( \
const ivec &src, const bvec &mask, \
const ivec &a, const ivec &b \
) { \
return mask.val_ ? a.val_ the_op b.val_ : src.val_; \
}
IVEC_MASK_BINFN_I(add, +)
// Returns b when the mask is set, a otherwise.
VEC_INLINE static ivec mask_blend(const bvec &mask, const ivec &a,
                                  const ivec &b) {
  if (mask.val_) return ivec(b.val_);
  return ivec(a.val_);
}
#define IVEC_BINFN_I(the_name, the_op) \
VEC_INLINE static ivec the_name(const ivec &a, const ivec &b) { \
return a.val_ the_op b.val_; \
}
IVEC_BINFN_I
(
mullo
,
*
)
IVEC_BINFN_I
(
srlv
,
>>
)
VEC_INLINE
static
ivec
the_and
(
const
ivec
&
a
,
const
ivec
&
b
)
{
return
a
.
val_
&
b
.
val_
;
}
VEC_INLINE
static
ivec
mask_expand
(
const
ivec
&
src
,
const
bvec
&
a
,
const
ivec
&
b
)
{
return
a
.
val_
?
b
.
val_
:
src
.
val_
;
}
VEC_INLINE
static
ivec
masku_compress
(
const
bvec
&
a
,
const
ivec
&
b
)
{
return
a
.
val_
?
b
.
val_
:
0
;
}
VEC_INLINE
static
int
at
(
const
ivec
&
a
,
int
b
)
{
assert
(
b
==
0
);
return
a
.
val_
;
}
VEC_INLINE
static
ivec
load
(
const
int
*
src
)
{
return
*
src
;
}
VEC_INLINE
static
ivec
mask_loadu
(
const
bvec
&
mask
,
const
int
*
src
)
{
return
mask
.
val_
?
*
src
:
0xDEAD
;
}
VEC_INLINE
static
ivec
maskz_loadu
(
const
bvec
&
mask
,
const
int
*
src
)
{
return
mask
.
val_
?
*
src
:
0
;
}
VEC_INLINE
static
void
mask_storeu
(
const
bvec
&
mask
,
int
*
dest
,
const
ivec
&
src
)
{
if
(
mask
.
val_
)
*
dest
=
src
.
val_
;
}
VEC_INLINE
static
void
store
(
int
*
dest
,
const
ivec
&
src
)
{
*
dest
=
src
.
val_
;
}
VEC_INLINE
static
ivec
mask_gather
(
const
ivec
&
src
,
const
bvec
&
mask
,
const
ivec
&
idx
,
const
int
*
mem
,
const
int
scale
)
{
return
mask
.
val_
?
*
reinterpret_cast
<
const
int
*>
(
reinterpret_cast
<
const
char
*>
(
mem
)
+
scale
*
idx
.
val_
)
:
src
.
val_
;
}
VEC_INLINE
static
void
mask_i32scatter
(
int
*
mem
,
const
bvec
&
mask
,
const
ivec
&
idx
,
const
ivec
&
a
,
const
int
scale
)
{
if
(
mask
.
val_
)
*
reinterpret_cast
<
int
*>
(
reinterpret_cast
<
char
*>
(
mem
)
+
scale
*
idx
.
val_
)
=
a
.
val_
;
}
VEC_INLINE
static
void
mask_compressstore
(
const
bvec
&
mask
,
int
*
dest
,
const
ivec
&
src
)
{
if
(
mask
.
val_
)
*
dest
=
src
.
val_
;
}
VEC_INLINE
static
ivec
set
(
int
i15
,
int
i14
,
int
i13
,
int
i12
,
int
i11
,
int
i10
,
int
i9
,
int
i8
,
int
i7
,
int
i6
,
int
i5
,
int
i4
,
int
i3
,
int
i2
,
int
i1
,
int
i0
)
{
return
i0
;
}
VEC_INLINE
static
ivec
set1
(
int
i
)
{
return
i
;
}
VEC_INLINE
static
ivec
setzero
()
{
return
0
;
}
VEC_INLINE
static
ivec
undefined
()
{
return
0xDEAD
;
}
VEC_INLINE
ivec
operator
+
(
const
ivec
&
b
)
const
{
return
val_
+
b
.
val_
;
}
};
// Single-lane floating-point "vector" (VL == 1) holding one flt_t.
//
// flt_t is declared outside this class. Masked variants take a bvec and
// either perform the operation (mask set) or fall back to a `src` value /
// zero / leave memory untouched (mask cleared).
// NOTE(review): the member names follow the AVX-512 style wrappers defined
// earlier in this file; presumably this class is their scalar fallback.
class fvec {
  friend class avec;

  flt_t val_;  // the only lane

  // Private converting constructor; lets the members below return a plain
  // flt_t (or anything implicitly convertible to it).
  VEC_INLINE fvec(const flt_t &v) : val_(v) {}

public:
  static const int VL = 1;  // number of lanes

  // Default construction leaves the lane uninitialized.
  VEC_INLINE fvec() {}

  // Extracts lane `i` of `a`. Only lane 0 exists; the previous `i < 1` check
  // also accepted negative indices, so it is tightened to match ivec::at.
  VEC_INLINE static flt_t at(const fvec &a, int i) {
    assert(i == 0);
    return a.val_;
  }

  // Always false in this implementation.
  VEC_INLINE static bool fast_compress() {
    return false;
  }

  // Generates the_name(a, b), which compares the lanes with the_op, and
  // mask_the_name(mask, a, b), which additionally requires `mask` to be set.
  #define FVEC_MASK_BINFN_B(the_name, the_op)                          \
    VEC_INLINE static bvec the_name(const fvec &a, const fvec &b) {    \
      return a.val_ the_op b.val_;                                     \
    }                                                                  \
    VEC_INLINE static bvec mask_##the_name(                            \
        const bvec &mask,                                              \
        const fvec &a, const fvec &b                                   \
    ) {                                                                \
      return mask.val_ && (a.val_ the_op b.val_);                      \
    }
  FVEC_MASK_BINFN_B(cmple, <=)
  FVEC_MASK_BINFN_B(cmplt, <)
  FVEC_MASK_BINFN_B(cmpneq, !=)
  FVEC_MASK_BINFN_B(cmpnle, >)
  FVEC_MASK_BINFN_B(cmpnlt, >=)

  // Generates the_name(a) returning the_fn(a). the_fn is pasted textually in
  // front of the parenthesised argument, which is why `1/` and `1/std::sqrt`
  // work as "functions".
  #define FVEC_UNFN_F(the_name, the_fn)                                \
    VEC_INLINE static fvec the_name(const fvec &a) {                   \
      return the_fn(a.val_);                                           \
    }
  FVEC_UNFN_F(abs, fabs)
  FVEC_UNFN_F(exp, ::exp)
  FVEC_UNFN_F(invsqrt, 1/std::sqrt)
  FVEC_UNFN_F(recip, 1/)
  FVEC_UNFN_F(sqrt, std::sqrt)

  // Generates mask_the_name(src, mask, a): the_fn(a) when the mask is set,
  // otherwise `src`.
  #define FVEC_MASK_UNFN_F(the_name, the_fn)                           \
    VEC_INLINE static fvec mask_##the_name(                            \
        const fvec &src, const bvec &mask,                             \
        const fvec &a                                                  \
    ) {                                                                \
      return mask.val_ ? the_fn(a.val_) : src.val_;                    \
    }
  FVEC_MASK_UNFN_F(cos, std::cos)
  FVEC_MASK_UNFN_F(recip, 1/)
  FVEC_MASK_UNFN_F(sqrt, std::sqrt)

  // Generates the_name(a, b) returning the_fn(a, b).
  #define FVEC_BINFN_F(the_name, the_fn)                               \
    VEC_INLINE static fvec the_name(const fvec &a, const fvec &b) {    \
      return the_fn(a.val_, b.val_);                                   \
    }
  FVEC_BINFN_F(max, ::fmax)
  FVEC_BINFN_F(min, ::fmin)

  // Generates mask_the_name(src, mask, a, b): `a the_op b` when the mask is
  // set, otherwise `src`.
  #define FVEC_MASK_BINFN_F(the_name, the_op)                          \
    VEC_INLINE static fvec mask_##the_name(                            \
        const fvec &src, const bvec &mask,                             \
        const fvec &a, const fvec &b                                   \
    ) {                                                                \
      return mask.val_ ? a.val_ the_op b.val_ : src.val_;              \
    }
  FVEC_MASK_BINFN_F(add, +)
  FVEC_MASK_BINFN_F(div, /)
  FVEC_MASK_BINFN_F(mul, *)
  FVEC_MASK_BINFN_F(sub, -)

  // Selects `b` when the mask is set and `a` otherwise.
  VEC_INLINE static fvec mask_blend(
      const bvec &mask, const fvec &a, const fvec &b
  ) {
    return mask.val_ ? b.val_ : a.val_;
  }

  // With one lane, expansion degenerates to a select: `b` when mask `a` is
  // set, otherwise `src`.
  VEC_INLINE static fvec mask_expand(
      const fvec &src, const bvec &a, const fvec &b
  ) {
    return a.val_ ? b.val_ : src.val_;
  }

  // With one lane, compression degenerates to: `b` when mask `a` is set,
  // otherwise 0.
  VEC_INLINE static fvec masku_compress(const bvec &a, const fvec &b) {
    return a.val_ ? b.val_ : 0;
  }

  // Lane holding `a`.
  VEC_INLINE static fvec set1(const flt_t &a) {
    return a;
  }

  // Lane holding 0.
  VEC_INLINE static fvec setzero() {
    return 0;
  }

  // Lane holding the placeholder value 1337.1337.
  VEC_INLINE static fvec undefined() {
    return 1337.1337;
  }

  // Loads one flt_t from `mem`.
  VEC_INLINE static fvec load(const flt_t *mem) {
    return *mem;
  }

  // Stores the lane to `dest` only when the mask is set.
  VEC_INLINE static void mask_storeu(
      const bvec &mask, flt_t *dest, const fvec &a
  ) {
    if (mask.val_) {
      *dest = a.val_;
    }
  }

  // Unconditionally stores the lane to `dest`.
  VEC_INLINE static void store(flt_t *dest, const fvec &a) {
    *dest = a.val_;
  }

  // Reads the flt_t located `scale * idx` BYTES past `mem`.
  VEC_INLINE static fvec gather(
      const ivec &idx, const flt_t *mem, const int scale
  ) {
    // Compute the byte offset in 64-bit arithmetic so that scale * index
    // cannot overflow int.
    const long long offset = static_cast<long long>(scale) * idx.val_;
    return *reinterpret_cast<const flt_t *>(
        reinterpret_cast<const char *>(mem) + offset);
  }

  // Like gather(), but only when the mask is set; otherwise returns `src`
  // and does not touch memory (nor `idx`).
  VEC_INLINE static fvec mask_gather(
      const fvec &src, const bvec &mask, const ivec &idx,
      const flt_t *mem, const int scale
  ) {
    if (!mask.val_) {
      return src.val_;
    }
    // 64-bit byte offset; see gather().
    const long long offset = static_cast<long long>(scale) * idx.val_;
    return *reinterpret_cast<const flt_t *>(
        reinterpret_cast<const char *>(mem) + offset);
  }

  // Gathers three consecutive flt_t values starting at element `idx` of
  // `mem` into out_0..out_2. `scale` must equal sizeof(flt_t).
  VEC_INLINE static void gather_3_adjacent(
      const ivec &idx, const flt_t *mem, const int scale,
      fvec *out_0, fvec *out_1, fvec *out_2
  ) {
    assert(scale == sizeof(flt_t));
    *out_0 = gather(idx, mem + 0, scale);
    *out_1 = gather(idx, mem + 1, scale);
    *out_2 = gather(idx, mem + 2, scale);
  }

  // Gathers four consecutive flt_t values starting at element `idx` of
  // `mem` into out_0..out_3. `scale` must equal sizeof(flt_t).
  VEC_INLINE static void gather_4_adjacent(
      const ivec &idx, const flt_t *mem, const int scale,
      fvec *out_0, fvec *out_1, fvec *out_2, fvec *out_3
  ) {
    assert(scale == sizeof(flt_t));
    *out_0 = gather(idx, mem + 0, scale);
    *out_1 = gather(idx, mem + 1, scale);
    *out_2 = gather(idx, mem + 2, scale);
    *out_3 = gather(idx, mem + 3, scale);
  }

  // Sum over the enabled lanes: the lane value when the mask is set, else 0.
  VEC_INLINE static flt_t mask_reduce_add(const bvec &mask, const fvec &a) {
    return mask.val_ ? a.val_ : 0;
  }

  // Sum over all lanes, i.e. the single lane value.
  VEC_INLINE static flt_t reduce_add(const fvec &a) {
    return a.val_;
  }

  // Reinterprets the first sizeof(int) bytes of the lane's storage as an int.
  // NOTE(review): this reads a flt_t object through an int pointer, which
  // violates strict aliasing; a memcpy-based copy would be the safe form but
  // needs <cstring>, which this block cannot add on its own.
  VEC_INLINE static ivec unpackloepi32(const fvec &a) {
    return reinterpret_cast<const int *>(&a.val_)[0];
  }

  // Computes sine and cosine of `arg` when the mask is set. The sine is
  // returned and the cosine is written to *cos_out. When the mask is cleared,
  // `src_a` is returned and `src_b` is written to *cos_out instead.
  VEC_INLINE static fvec mask_sincos(
      fvec *cos_out, const fvec &src_a, const fvec &src_b,
      const bvec &mask, const fvec &arg
  ) {
    cos_out->val_ = mask.val_ ? ::cos(arg.val_) : src_b.val_;
    return mask.val_ ? ::sin(arg.val_) : src_a.val_;
  }

  // Generates the member operator the_sym; the_name is not used.
  #define FVEC_BINOP(the_sym, the_name)                                \
    VEC_INLINE inline fvec operator the_sym(const fvec &b) const {     \
      return this->val_ the_sym b.val_;                                \
    }
  FVEC_BINOP(+, add)
  FVEC_BINOP(-, sub)
  FVEC_BINOP(*, mul)
  FVEC_BINOP(/, div)

  // Intentionally empty: no prefetch is issued in this implementation.
  VEC_INLINE static void gather_prefetch0(const ivec &idx, const void *mem) {}
};
class
avec
{
acc_t
val_
;
VEC_INLINE
avec
(
const
acc_t
&
a
)
:
val_
(
a
)
{}
public:
VEC_INLINE
avec
(
const
fvec
&
a
)
:
val_
(
a
.
val_
)
{}
VEC_INLINE
static
avec
undefined
()
{
return
1337.1337
;
}
VEC_INLINE
static
avec
mask_gather
(
const
avec
&
src
,
const
bvec
&
mask
,
const
ivec
&
idx
,
const
acc_t
*
mem
,
const
int
scale
)
{
return
mask
.
val_
?
*
reinterpret_cast
<
const
acc_t
*>
(
reinterpret_cast
<
const
char
*>
(
mem
)
+
scale
*
idx
.
val_
)
:
src
.
val_
;
}
VEC_INLINE
static
void
mask_i32loscatter
(
acc_t
*
mem
,
const
bvec
&
mask
,
const
ivec
&
idx
,
const
avec
&
a
,
const
int
scale
)
{
if
(
mask
.
val_
)
*
reinterpret_cast
<
acc_t
*>
(
reinterpret_cast
<
char
*>
(
mem
)
+
idx
.
val_
*
scale
)
=
a
.
val_
;
}
#define AVEC_BINOP(the_sym, the_name) \
VEC_INLINE inline avec operator the_sym(const avec &b) const { \
return this->val_ the_sym b.val_; \
}
AVEC_BINOP
(
-
,
sub
)
};
};
#endif
Event Timeline
Log In to Comment