/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */
// This file provides an intrinsics abstraction that allows access to the
// underlying SIMD registers.
// It allows for algorithms that are templated over the vector length used.
// The final interface is provided by vector_routines, which provides the
// support for different precision modes.
// The vector_ops interface provides routines specific to one floating point
// data type, and is specialized for various architectures.
// The routines work best with AVX-512 and AVX2, as both support the gather
// instructions.
// For both AVX and SSE we miss some optimization opportunities in the gather
// implementations.
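//
// Illustrative sketch (hypothetical, not from the original sources): a kernel
// written against this abstraction is parameterized over the floating point
// type and the calculation mode, and touches only the generic vector_ops
// interface; all names in the sketch are made up for illustration.
//
//   template<class flt_t, lmp_intel::CalculationMode mic>
//   typename lmp_intel::vector_ops<flt_t, mic>::fscal example() {
//     typedef lmp_intel::vector_ops<flt_t, mic> v;
//     typename v::fvec a = v::zero(), b = v::zero();
//     typename v::bvec m = v::cmplt(a, b);      // per-lane a < b
//     typename v::fvec r = v::blend(m, a, b);   // take b where m is set
//     return v::reduce_add(r);                  // horizontal sum
//   }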
// Vector classes provided with the intel compiler
#ifdef __MIC__
#include <mic/micvec.h>
#else
#include <dvec.h>
// icc-mmic hates generating movq
#include <fvec.h>
#endif
// Vector classes for Cilk array notation
// This is experimental and doesn't yield good code yet
template<int VL, typename fscal>
struct lmp_intel_an_fvec {
  fscal data[VL];
  lmp_intel_an_fvec() {}
  explicit lmp_intel_an_fvec(const fscal f) { data[:] = f; }
  explicit lmp_intel_an_fvec(fscal f[VL]) { data[:] = f[:]; }
  lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; }
  lmp_intel_an_fvec &operator =(const lmp_intel_an_fvec &a) {
    data[:] = a.data[:];
    return *this;
  }
  const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const {
    lmp_intel_an_fvec ret = *this;
    ret.data[:] += b.data[:];
    return ret;
  }
  const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const {
    lmp_intel_an_fvec ret = *this;
    ret.data[:] -= b.data[:];
    return ret;
  }
  const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const {
    lmp_intel_an_fvec ret = *this;
    ret.data[:] *= b.data[:];
    return ret;
  }
  const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const {
    lmp_intel_an_fvec ret = *this;
    ret.data[:] /= b.data[:];
    return ret;
  }
  lmp_intel_an_fvec &operator +=(const lmp_intel_an_fvec &b) {
    data[:] += b.data[:];
    return *this;
  }
  lmp_intel_an_fvec &operator -=(const lmp_intel_an_fvec &b) {
    data[:] -= b.data[:];
    return *this;
  }
  lmp_intel_an_fvec &operator *=(const lmp_intel_an_fvec &b) {
    data[:] *= b.data[:];
    return *this;
  }
  lmp_intel_an_fvec &operator /=(const lmp_intel_an_fvec &b) {
    data[:] /= b.data[:];
    return *this;
  }
  friend lmp_intel_an_fvec sqrt(const lmp_intel_an_fvec &a)
      __attribute__((always_inline)) {
    lmp_intel_an_fvec ret;
    ret.data[:] = sqrt(a.data[:]);
    return ret;
  }
  friend lmp_intel_an_fvec exp(const lmp_intel_an_fvec &a)
      __attribute__((always_inline)) {
    lmp_intel_an_fvec ret;
    ret.data[:] = exp(a.data[:]);
    return ret;
  }
  friend lmp_intel_an_fvec sin(const lmp_intel_an_fvec &a)
      __attribute__((always_inline)) {
    lmp_intel_an_fvec ret;
    ret.data[:] = sin(a.data[:]);
    return ret;
  }
  friend lmp_intel_an_fvec invsqrt(const lmp_intel_an_fvec &a)
      __attribute__((always_inline)) {
    lmp_intel_an_fvec ret;
    ret.data[:] = ((fscal) 1.) / sqrt(a.data[:]);
    return ret;
  }
  friend lmp_intel_an_fvec pow(const lmp_intel_an_fvec &a,
                               const lmp_intel_an_fvec &b)
      __attribute__((always_inline)) {
    lmp_intel_an_fvec ret;
    ret.data[:] = pow(a.data[:], b.data[:]);
    return ret;
  }
  lmp_intel_an_fvec operator -() const {
    lmp_intel_an_fvec ret;
    ret.data[:] = -data[:];
    return ret;
  }
};
template<int VL>
struct lmp_intel_an_ivec {
  int data[VL];
  lmp_intel_an_ivec() {}
  explicit lmp_intel_an_ivec(int i) { data[:] = i; }
  explicit lmp_intel_an_ivec(const int *a) { data[:] = a[0:VL]; }
  const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) {
    lmp_intel_an_ivec ret = *this;
    ret.data[:] &= b.data[:];
    return ret;
  }
  const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) {
    lmp_intel_an_ivec ret = *this;
    ret.data[:] |= b.data[:];
    return ret;
  }
  const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) {
    lmp_intel_an_ivec ret = *this;
    ret.data[:] += b.data[:];
    return ret;
  }
};
template<int VL>
struct lmp_intel_an_bvec {
  bool data[VL];
  lmp_intel_an_bvec() {}
  lmp_intel_an_bvec(const lmp_intel_an_bvec &a) { data[:] = a.data[:]; }
  lmp_intel_an_bvec &operator =(const lmp_intel_an_bvec &a) {
    data[:] = a.data[:];
    return *this;
  }
  explicit lmp_intel_an_bvec(int i) { data[:] = i; }
  friend lmp_intel_an_bvec operator &(const lmp_intel_an_bvec &a,
                                      const lmp_intel_an_bvec &b)
      __attribute__((always_inline)) {
    lmp_intel_an_bvec ret;
    ret.data[:] = a.data[:] & b.data[:];
    return ret;
  }
  friend lmp_intel_an_bvec operator |(const lmp_intel_an_bvec &a,
                                      const lmp_intel_an_bvec &b)
      __attribute__((always_inline)) {
    lmp_intel_an_bvec ret;
    ret.data[:] = a.data[:] | b.data[:];
    return ret;
  }
  friend lmp_intel_an_bvec operator ~(const lmp_intel_an_bvec &a)
      __attribute__((always_inline)) {
    lmp_intel_an_bvec ret;
    ret.data[:] = ! a.data[:];
    return ret;
  }
  lmp_intel_an_bvec &operator &=(const lmp_intel_an_bvec &a)
      __attribute__((always_inline)) {
    data[:] &= a.data[:];
    return *this;
  }
};
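
// Illustrative sketch (hypothetical, not from the original sources): the
// array-notation classes above behave like scalar values, with every
// operation applied elementwise through the data[:] section syntax, e.g.:
//
//   lmp_intel_an_fvec<8, double> a(1.0), b(2.0);
//   lmp_intel_an_fvec<8, double> c = a * b + sqrt(b);  // elementwise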
namespace lmp_intel {

// Self explanatory mostly, KNC=IMCI and AVX-512, NONE=Scalar, AN=Array Not.
enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN };
#ifdef __MIC__
#ifdef LMP_INTEL_VECTOR_MIC
static const CalculationMode mode = LMP_INTEL_VECTOR_MIC;
#else
static const CalculationMode mode = KNC;
#endif
#else
#ifdef LMP_INTEL_VECTOR_HOST
static const CalculationMode mode = LMP_INTEL_VECTOR_HOST;
#else
#ifdef __AVX512F__
static const CalculationMode mode = KNC;
#else
#ifdef __AVX2__
static const CalculationMode mode = AVX2;
#else
#ifdef __AVX__
static const CalculationMode mode = AVX;
#else
static const CalculationMode mode = SSE;
#endif
#endif
#endif
#endif
#endif
// This is used in the selection logic
template<CalculationMode mode>
struct vector_traits {
  static const bool support_integer_and_gather_ops = true;
};

template<>
struct vector_traits<AVX> {
  static const bool support_integer_and_gather_ops = false;
};

// This is the base template for all the different architectures
// It will get specialized
template<class flt_t, CalculationMode mode>
struct vector_ops {};
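
// Illustrative sketch (hypothetical, not from the original sources): the
// traits class above lets templated kernels choose a gather-based path at
// compile time and fall back to scalar loads on plain AVX:
//
//   template<lmp_intel::CalculationMode mode>
//   void kernel() {
//     if (lmp_intel::vector_traits<mode>::support_integer_and_gather_ops) {
//       // use vector_ops<...>::gather / int_gather
//     } else {
//       // assemble vectors from scalar loads instead
//     }
//   }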
// Intrinsic routines for IMCI and AVX-512
#if defined(__MIC__) || defined(__AVX512F__)

// Integer vector class
#pragma pack(push,64)
struct ivec32x16 {
  __m512i vec;
  ivec32x16() {}
  ivec32x16(__m512i m) { vec = m; }
  ivec32x16(const int *a) {
    vec = _mm512_load_epi32(reinterpret_cast<const int *>(a));
  }
  explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); }
  operator __m512i() const { return vec; }
  friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) {
    return _mm512_and_epi32(a, b);
  }
  friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) {
    return _mm512_or_epi32(a, b);
  }
  friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) {
    return _mm512_add_epi32(a, b);
  }
};
#pragma pack(pop)
// Double precision routines
template<>
struct vector_ops<double, KNC> {
  static const int VL = 8;
  typedef double fscal;
  typedef F64vec8 fvec;
  typedef ivec32x16 ivec;
  typedef __mmask bvec;
  typedef double farr[8] __attribute__((aligned(64)));
  typedef int iarr[16] __attribute__((aligned(64)));
  static fvec recip(const fvec &a) {
    return _mm512_recip_pd(a);
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
    _mm512_mask_prefetch_i32gather_ps(idx, mask, base, scale, _MM_HINT_T0);
  }
  template<int scale>
  static fvec gather(const fvec &from, bvec mask, const ivec &idx,
                     const void *base) {
    return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return _mm512_mask_blend_pd(mask, a, b);
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return _mm512_fmadd_pd(a, b, c);
  }
  static fvec zero() {
    return _mm512_setzero_pd();
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return _mm512_cmp_pd_mask(a, b, _CMP_NLE_US);
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return _mm512_cmp_pd_mask(a, b, _CMP_LE_OS);
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return _mm512_cmp_pd_mask(a, b, _CMP_LT_OS);
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    return _mm512_cmpneq_epi32_mask(a, b);
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    return _mm512_cmplt_epi32_mask(a, b);
  }
  static fvec invsqrt(const fvec &a) {
    return _mm512_invsqrt_pd(a);
  }
  static fvec sincos(fvec *cos, const fvec &a) {
    return _mm512_sincos_pd(reinterpret_cast<__m512d *>(cos), a);
  }
  static fscal reduce_add(const fvec &a) {
    return _mm512_reduce_add_pd(a);
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    return _mm512_mullo_epi32(a, b);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return _mm512_mask_add_epi32(src, mask, a, b);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    return _mm512_mask_i32gather_epi32(from, mask, idx, base, scale);
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return _mm512_mask_add_pd(src, mask, a, b);
  }
  static void store(void *at, const fvec &a) {
    _mm512_store_pd(at, a);
  }
  static void int_store(void *at, const ivec &a) {
    _mm512_store_epi32(at, a);
  }
  static void mask_store(int *at, const bvec &a) {
    for (int i = 0; i < 8; i++) {
      at[i] = (a >> i) & 1;
    }
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm512_min_pd(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return mask & (1 << at);
  }
  static bool mask_testz(const bvec &mask) {
    return mask == 0;
  }
  static bvec mask_enable_lower(int n) {
    return 0xFF >> (VL - n);
  }
  static ivec int_load_vl(const int *a) {
    return _mm512_load_epi32(a);
  }
  static void int_clear_arr(int *a) {
    _mm512_store_epi32(a, ivec(0));
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm512_store_epi32(tmp, a);
    for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
    printf("\n");
  }
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    *x = gather<1>(*x, mask, idxs, &base->x);
    *y = gather<1>(*y, mask, idxs, &base->y);
    *z = gather<1>(*z, mask, idxs, &base->z);
    *w = int_gather<1>(*w, mask, idxs, &base->w);
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
    *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
    *r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 16);
    *r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 24);
    *r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 32);
    *r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 40);
    *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48);
    *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
    *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
    *r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 16);
    *r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 24);
  }
};
template<>
struct vector_ops<float, KNC> {
  static const int VL = 16;
  static const int ALIGN = 64;
  typedef float fscal;
  typedef F32vec16 fvec;
  typedef ivec32x16 ivec;
  typedef __mmask bvec;
  typedef float farr[16] __attribute__((aligned(64)));
  typedef int iarr[16] __attribute__((aligned(64)));
  static const bvec full_mask = 0xFFFF;
  static fvec recip(const fvec &a) {
    return _mm512_recip_ps(a);
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
    _mm512_mask_prefetch_i32gather_ps(idx, mask, base, scale, _MM_HINT_T0);
  }
  template<int scale>
  static fvec gather(const fvec &from, bvec mask, const ivec &idx,
                     const void *base) {
    return _mm512_mask_i32gather_ps(from, mask, idx, base, scale);
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return _mm512_mask_blend_ps(mask, a, b);
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return _mm512_fmadd_ps(a, b, c);
  }
  static fvec zero() {
    return _mm512_setzero_ps();
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return _mm512_cmpeq_ps_mask(a, b);
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return _mm512_cmpnle_ps_mask(a, b);
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return _mm512_cmple_ps_mask(a, b);
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return _mm512_cmplt_ps_mask(a, b);
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    return _mm512_cmpneq_epi32_mask(a, b);
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    return _mm512_cmplt_epi32_mask(a, b);
  }
  static fvec invsqrt(const fvec &a) {
    return _mm512_invsqrt_ps(a);
  }
  static fvec sincos(fvec *cos, const fvec &a) {
    return _mm512_sincos_ps(reinterpret_cast<__m512 *>(cos), a);
  }
  static fscal reduce_add(const fvec &a) {
    return _mm512_reduce_add_ps(a);
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    return _mm512_mullo_epi32(a, b);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return _mm512_mask_add_epi32(src, mask, a, b);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    return _mm512_mask_i32gather_epi32(from, mask, idx, base, scale);
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return _mm512_mask_add_ps(src, mask, a, b);
  }
  static void store(void *at, const fvec &a) {
    _mm512_store_ps(at, a);
  }
  static void int_store(void *at, const ivec &a) {
    _mm512_store_epi32(at, a);
  }
  static void mask_store(int *at, const bvec &a) {
    for (int i = 0; i < 16; i++) {
      at[i] = (a >> i) & 1;
    }
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm512_min_ps(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return mask & (1 << at);
  }
  static bool mask_testz(const bvec &mask) {
    return mask == 0;
  }
  static bvec mask_enable_lower(int n) {
    return 0xFFFF >> (VL - n);
  }
  static ivec int_load_vl(const int *a) {
    return _mm512_load_epi32(a);
  }
  static void int_clear_arr(int *a) {
    _mm512_store_epi32(a, ivec(0));
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm512_store_epi32(tmp, a);
    for (int i = 0; i < 16; i++) printf("%d ", tmp[i]);
    printf("\n");
  }
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    *x = gather<1>(*x, mask, idxs, &base->x);
    *y = gather<1>(*y, mask, idxs, &base->y);
    *z = gather<1>(*z, mask, idxs, &base->z);
    *w = int_gather<1>(*w, mask, idxs, &base->w);
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
    *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
    *r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
    *r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
    *r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 16);
    *r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 20);
    *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
    *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
    *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
    *r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
    *r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
  }
  // Additional routines needed for the implementation of mixed precision
  static fvec cvtdown(const vector_ops<double, KNC>::fvec &lo,
                      const vector_ops<double, KNC>::fvec &hi) {
    __m512 t1 = _mm512_cvtpd_pslo(lo);
    __m512 t2 = _mm512_cvtpd_pslo(hi);
    return _mm512_mask_permute4f128_ps(t1, 0xFF00, t2, _MM_PERM_BADC);
  }
  static vector_ops<double, KNC>::fvec cvtup_lo(const fvec &a) {
    return _mm512_cvtpslo_pd(a);
  }
  static vector_ops<double, KNC>::fvec cvtup_hi(const fvec &a) {
    return _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a, _MM_PERM_BADC));
    // permute DCBA -> BADC
  }
  static void mask_cvtup(const bvec &a, vector_ops<double, KNC>::bvec *blo,
                         vector_ops<double, KNC>::bvec *bhi) {
    *blo = a & 0xFF;
    *bhi = a >> 8;
  }
};
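
// Illustrative sketch (hypothetical, not from the original sources): the
// conversion routines above support a mixed-precision scheme in which data
// is kept in single precision and accumulated in double precision; one
// float vector widens into a low and a high double vector:
//
//   typedef lmp_intel::vector_ops<float, lmp_intel::KNC> vs;
//   typedef lmp_intel::vector_ops<double, lmp_intel::KNC> vd;
//   inline void accumulate(vd::fvec *lo, vd::fvec *hi, const vs::fvec &f) {
//     *lo = *lo + vs::cvtup_lo(f);  // lanes 0..7 widened to double
//     *hi = *hi + vs::cvtup_hi(f);  // lanes 8..15 widened to double
//   }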
#endif
//////////////////////////////////////////////////////////////////////////////
// AVX/SSE
//////////////////////////////////////////////////////////////////////////////
#ifndef __MIC__
// class definitions for integer and masks for AVX
// Note that we have to lower a number of operations to SSE, notably comparison
// and integer operations.
// The gather operations are emulated.
struct ivec32x8 {
  __m256i vec;
  ivec32x8() {}
  ivec32x8(__m256i m) { vec = m; }
  ivec32x8(const int *a) {
    vec = _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
  }
  explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); }
  operator __m256i() const { return vec; }
  friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) {
    return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a),
                                             _mm256_castsi256_pd(b)));
  }
  friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) {
    return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a),
                                            _mm256_castsi256_pd(b)));
  }
  friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i ahi = _mm256_extractf128_si256(a, 1);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i bhi = _mm256_extractf128_si256(b, 1);
    __m128i rlo = _mm_add_epi32(alo, blo);
    __m128i rhi = _mm_add_epi32(ahi, bhi);
    return _mm256_setr_m128i(rlo, rhi);
  }
};
struct avx_bvec {
  __m256i vec;
  avx_bvec() {}
  avx_bvec(__m256i m) { vec = m; }
  explicit avx_bvec(int i) { vec = _mm256_set1_epi32(i); }
  operator __m256i() const { return vec; }
  operator F64vec4() const { return _mm256_castsi256_pd(vec); }
  operator F32vec8() const { return _mm256_castsi256_ps(vec); }
  operator ivec32x8() const { return vec; }
  friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) {
    return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a),
                                             _mm256_castsi256_pd(b)));
  }
  friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) {
    return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a),
                                            _mm256_castsi256_pd(b)));
  }
  friend avx_bvec operator ~(const avx_bvec &a) {
    return _mm256_castpd_si256(_mm256_andnot_pd(
        _mm256_castsi256_pd(a),
        _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF))));
  }
  avx_bvec &operator &=(const avx_bvec &a) {
    return *this = _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(vec),
                                                     _mm256_castsi256_pd(a)));
  }
};
template<>
struct vector_ops<double, AVX> {
  static const int VL = 4;
  typedef double fscal;
  typedef F64vec4 fvec;
  typedef ivec32x8 ivec;
  typedef avx_bvec bvec;
  typedef double farr[4] __attribute__((aligned(32)));
  typedef int iarr[8] __attribute__((aligned(32)));
  static fvec recip(const fvec &a) {
    // newton-raphson
    fvec b = _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a)));
    b = b + b - a * b * b;
    return b + b - a * b * b;
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, const bvec &mask,
                                 const void *base) {
    // nop
  }
  template<int scale>
  static fvec gather(const fvec &from, const bvec &mask, const ivec &idx,
                     const void *base) {
    farr result;
    farr src;
    iarr idxs;
    _mm256_store_si256(reinterpret_cast<__m256i *>(idxs), idx);
    _mm256_store_pd(reinterpret_cast<double *>(src), from);
    for (int i = 0; i < VL; i++) {
      result[i] = mask_test_at(mask, i)
        ? *reinterpret_cast<const double *>(
              reinterpret_cast<const char *>(base) + scale * idxs[2 * i])
        : src[i];
    }
    return _mm256_load_pd(reinterpret_cast<double *>(result));
  }
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    iarr i, m;
    int_store(i, idxs);
    mask_store(m, mask);
    __m256d a0 = m[0] ? _mm256_load_pd(reinterpret_cast<const double *>(
        &base[i[0] / 32])) : _mm256_setzero_pd();
    __m256d a1 = m[2] ? _mm256_load_pd(reinterpret_cast<const double *>(
        &base[i[1] / 32])) : _mm256_setzero_pd();
    __m256d b0 = _mm256_unpacklo_pd(a0, a1);
    __m256d b1 = _mm256_unpackhi_pd(a0, a1);
    __m256d a2 = m[4] ? _mm256_load_pd(reinterpret_cast<const double *>(
        &base[i[2] / 32])) : _mm256_setzero_pd();
    __m256d a3 = m[6] ? _mm256_load_pd(reinterpret_cast<const double *>(
        &base[i[3] / 32])) : _mm256_setzero_pd();
    __m256d b2 = _mm256_unpacklo_pd(a2, a3);
    __m256d b3 = _mm256_unpackhi_pd(a2, a3);
    __m256d c0 = _mm256_permute2f128_pd(b0, b2, 0x20);
    __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
    __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
    __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
    *x = blend(mask, *x, c0);
    *y = blend(mask, *y, c1);
    *z = blend(mask, *z, c2);
    *w = int_blend(mask, *w, _mm256_castps_si256(
        _mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0)));
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    fvec a = zero(), b = zero(), c = zero(), d = zero();
    gather_4(idxs, mask, base, r0, r1, r2, r3);
    gather_4(idxs, mask, reinterpret_cast<const char *>(base) + 32,
             r4, r5, r6, r7);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    iarr i, m;
    _mm256_store_si256(reinterpret_cast<__m256i *>(i), idxs);
    mask_store(m, mask);
    __m256d z = _mm256_setzero_pd();
    const char *m0 = reinterpret_cast<const char *>(base) + 4 * i[0];
    const char *m1 = reinterpret_cast<const char *>(base) + 4 * i[2];
    const char *m2 = reinterpret_cast<const char *>(base) + 4 * i[4];
    const char *m3 = reinterpret_cast<const char *>(base) + 4 * i[6];
    const double *e0 = reinterpret_cast<const double *>(
        m[0] == 0 ? reinterpret_cast<const char *>(&z) : m0);
    const double *e1 = reinterpret_cast<const double *>(
        m[2] == 0 ? reinterpret_cast<const char *>(&z) : m1);
    const double *e2 = reinterpret_cast<const double *>(
        m[4] == 0 ? reinterpret_cast<const char *>(&z) : m2);
    const double *e3 = reinterpret_cast<const double *>(
        m[6] == 0 ? reinterpret_cast<const char *>(&z) : m3);
    __m256d a0 = _mm256_load_pd(e0);
    __m256d a1 = _mm256_load_pd(e1);
    __m256d b0 = _mm256_unpacklo_pd(a0, a1);
    __m256d b1 = _mm256_unpackhi_pd(a0, a1);
    __m256d a2 = _mm256_load_pd(e2);
    __m256d a3 = _mm256_load_pd(e3);
    __m256d b2 = _mm256_unpacklo_pd(a2, a3);
    __m256d b3 = _mm256_unpackhi_pd(a2, a3);
    __m256d c0 = _mm256_permute2f128_pd(b0, b2, 0x20);
    __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
    __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
    __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
    *r0 = blend(mask, *r0, c0);
    *r1 = blend(mask, *r1, c1);
    *r2 = blend(mask, *r2, c2);
    *r3 = blend(mask, *r3, c3);
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return (b & mask) | (a & ~mask);
  }
  static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
    return (b & mask) | (a & ~mask);
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return a * b + c;
  }
  static fvec zero() {
    return _mm256_setzero_pd();
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_NLE_US));
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LE_OS));
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS));
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i ahi = _mm256_extractf128_si256(a, 1);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i bhi = _mm256_extractf128_si256(b, 1);
    __m128i rlo = _mm_andnot_si128(_mm_cmpeq_epi32(alo, blo),
                                   _mm_cmpeq_epi8(alo, alo));
    __m128i rhi = _mm_andnot_si128(_mm_cmpeq_epi32(ahi, bhi),
                                   _mm_cmpeq_epi8(alo, alo));
    return _mm256_setr_m128i(rlo, rhi);
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i ahi = _mm256_extractf128_si256(a, 1);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i bhi = _mm256_extractf128_si256(b, 1);
    __m128i rlo = _mm_cmplt_epi32(alo, blo);
    __m128i rhi = _mm_cmplt_epi32(ahi, bhi);
    return _mm256_setr_m128i(rlo, rhi);
  }
  static fvec invsqrt(const fvec &a) {
    return _mm256_invsqrt_pd(a);
  }
  static fvec sincos(fvec *cos, const fvec &a) {
    return _mm256_sincos_pd(reinterpret_cast<__m256d *>(cos), a);
  }
  static fscal reduce_add(const fvec &a) {
    __m256d t1 = _mm256_hadd_pd(a, a);
    __m128d t2 = _mm256_extractf128_pd(t1, 1);
    __m128d t3 = _mm256_castpd256_pd128(t1);
    return _mm_cvtsd_f64(_mm_add_pd(t2, t3));
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i ahi = _mm256_extractf128_si256(a, 1);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i bhi = _mm256_extractf128_si256(b, 1);
    __m128i rlo = _mm_mullo_epi32(alo, blo);
    __m128i rhi = _mm_mullo_epi32(ahi, bhi);
    return _mm256_setr_m128i(rlo, rhi);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    iarr result;
    iarr src;
    iarr idxs;
    _mm256_store_si256(reinterpret_cast<__m256i *>(idxs), idx);
    _mm256_store_si256(reinterpret_cast<__m256i *>(src), from);
    for (int i = 0; i < VL; i++) {
      int tmp;
      if (mask_test_at(mask, i)) {
        tmp = *reinterpret_cast<const int *>(
            reinterpret_cast<const char *>(base) + scale * idxs[2 * i]);
      } else {
        tmp = src[2 * i];
      }
      result[2 * i] = tmp;
      result[2 * i + 1] = tmp;
    }
    return _mm256_load_si256(reinterpret_cast<__m256i *>(result));
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  static void store(void *at, const fvec &a) {
    _mm256_store_pd(reinterpret_cast<double *>(at), a);
  }
  static void int_store(int *at, const ivec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
    at[1] = at[2];
    at[2] = at[4];
    at[3] = at[6];
  }
  static void mask_store(int *at, const bvec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm256_min_pd(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return reinterpret_cast<const int *>(&mask)[2 * at];
  }
  static bool mask_testz(const bvec &mask) {
    return _mm256_testz_si256(mask, mask);
  }
  static bvec mask_enable_lower(int n) {
    static const int base[8] __attribute__((aligned(64))) =
        {0, 0, 1, 1, 2, 2, 3, 3};
    return int_cmplt(ivec(base), ivec(n));
  }
  static ivec int_load_vl(const int *a) {
    __m128i b = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
    __m128i c = _mm_unpacklo_epi32(b, b);
    __m128i d = _mm_unpackhi_epi32(b, b);
    return _mm256_setr_m128i(c, d);
    // stolen from http://stackoverflow.com/questions/6687535/unexpected-result-from-avx-m256-unpack-ps-unpack-intrinsic
  }
  static void int_clear_arr(int *a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
  }
  static bvec full_mask() {
    return bvec(0xFFFFFFFF);
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
    for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
    printf("\n");
  }
};
template<>
struct vector_ops<float, AVX> {
  static const int VL = 8;
  static const int ALIGN = 32;
  typedef float fscal;
  typedef F32vec8 fvec;
  typedef ivec32x8 ivec;
  typedef avx_bvec bvec;
  typedef float farr[8] __attribute__((aligned(32)));
  typedef int iarr[8] __attribute__((aligned(32)));
  static fvec recip(const fvec &a) {
    fvec b = _mm256_rcp_ps(a);
    b = b + b - a * b * b;
    return b + b - a * b * b;
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
    // nop
  }
  template<int scale>
  static fvec gather(const fvec &from, bvec mask, const ivec &idx,
                     const void *base) {
    farr result;
    farr src;
    iarr idxs;
    _mm256_store_si256(reinterpret_cast<__m256i *>(idxs), idx);
    _mm256_store_ps(reinterpret_cast<float *>(src), from);
    for (int i = 0; i < VL; i++) {
      result[i] = mask_test_at(mask, i)
        ? *reinterpret_cast<const float *>(
              reinterpret_cast<const char *>(base) + scale * idxs[i])
        : src[i];
    }
    return _mm256_load_ps(reinterpret_cast<float *>(result));
  }
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    iarr i, m;
    int_store(i, idxs);
    mask_store(m, mask);
    farr zero_mem;
    store(zero_mem, zero());
    const float *e0 = m[0] ? reinterpret_cast<const float *>(
        &base[i[0] / 16]) : zero_mem;
    const float *e1 = m[1] ? reinterpret_cast<const float *>(
        &base[i[1] / 16]) : zero_mem;
    const float *e2 = m[2] ? reinterpret_cast<const float *>(
        &base[i[2] / 16]) : zero_mem;
    const float *e3 = m[3] ? reinterpret_cast<const float *>(
        &base[i[3] / 16]) : zero_mem;
    const float *e4 = m[4] ? reinterpret_cast<const float *>(
        &base[i[4] / 16]) : zero_mem;
    const float *e5 = m[5] ? reinterpret_cast<const float *>(
        &base[i[5] / 16]) : zero_mem;
    const float *e6 = m[6] ? reinterpret_cast<const float *>(
        &base[i[6] / 16]) : zero_mem;
    const float *e7 = m[7] ? reinterpret_cast<const float *>(
        &base[i[7] / 16]) : zero_mem;
    __m256 a0 = _mm256_loadu2_m128(e4, e0);
    __m256 a1 = _mm256_loadu2_m128(e5, e1);
    __m256 b0 = _mm256_unpacklo_ps(a0, a1);
    __m256 b1 = _mm256_unpackhi_ps(a0, a1);
    __m256 a2 = _mm256_loadu2_m128(e6, e2);
    __m256 a3 = _mm256_loadu2_m128(e7, e3);
    __m256 b2 = _mm256_unpacklo_ps(a2, a3);
    __m256 b3 = _mm256_unpackhi_ps(a2, a3);
    __m256 c0 = _mm256_shuffle_ps(b0, b2, 0x44);
    __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
    __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
    __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
    *x = blend(mask, *x, c0);
    *y = blend(mask, *y, c1);
    *z = blend(mask, *z, c2);
    *w = int_blend(mask, *w, _mm256_castps_si256(c3));
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    fvec a = zero(), b = zero(), c = zero(), d = zero();
    gather_4(idxs, mask, base, r0, r1, r2, r3);
    gather_4(idxs, mask, reinterpret_cast<const char *>(base) + 16,
             r4, r5, r6, r7);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    iarr i, m;
    int_store(i, idxs);
    mask_store(m, mask);
    farr zero_mem;
    store(zero_mem, zero());
    const float *e0 = m[0] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[0]]) : zero_mem;
    const float *e1 = m[1] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[1]]) : zero_mem;
    const float *e2 = m[2] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[2]]) : zero_mem;
    const float *e3 = m[3] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[3]]) : zero_mem;
    const float *e4 = m[4] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[4]]) : zero_mem;
    const float *e5 = m[5] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[5]]) : zero_mem;
    const float *e6 = m[6] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[6]]) : zero_mem;
    const float *e7 = m[7] ? reinterpret_cast<const float *>(
        &reinterpret_cast<const char *>(base)[4 * i[7]]) : zero_mem;
    __m256 a0 = _mm256_loadu2_m128(e4, e0);
    __m256 a1 = _mm256_loadu2_m128(e5, e1);
    __m256 b0 = _mm256_unpacklo_ps(a0, a1);
    __m256 b1 = _mm256_unpackhi_ps(a0, a1);
    __m256 a2 = _mm256_loadu2_m128(e6, e2);
    __m256 a3 = _mm256_loadu2_m128(e7, e3);
    __m256 b2 = _mm256_unpacklo_ps(a2, a3);
    __m256 b3 = _mm256_unpackhi_ps(a2, a3);
    __m256 c0 = _mm256_shuffle_ps(b0, b2, 0x44);
    __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
    __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
    __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
    *r0 = blend(mask, *r0, c0);
    *r1 = blend(mask, *r1, c1);
    *r2 = blend(mask, *r2, c2);
    *r3 = blend(mask, *r3, c3);
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return (b & mask) | (a & ~mask);
  }
  static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
    return (b & mask) | (a & ~mask);
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return a * b + c;
  }
  static fvec zero() {
    return _mm256_setzero_ps();
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_EQ_OQ));
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_NLE_US));
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LE_OS));
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS));
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i ahi = _mm256_extractf128_si256(a, 1);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i bhi = _mm256_extractf128_si256(b, 1);
    __m128i rlo = _mm_andnot_si128(_mm_cmpeq_epi32(alo, blo),
                                   _mm_cmpeq_epi8(alo, alo));
    __m128i rhi = _mm_andnot_si128(_mm_cmpeq_epi32(ahi, bhi),
                                   _mm_cmpeq_epi8(alo, alo));
    return _mm256_setr_m128i(rlo, rhi);
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i ahi = _mm256_extractf128_si256(a, 1);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i bhi = _mm256_extractf128_si256(b, 1);
    __m128i rlo = _mm_cmplt_epi32(alo, blo);
    __m128i rhi = _mm_cmplt_epi32(ahi, bhi);
    return _mm256_setr_m128i(rlo, rhi);
  }
  static fvec invsqrt(const fvec &a) {
    return _mm256_invsqrt_ps(a);
  }
  static fvec sincos(fvec *cos, const fvec &a) {
    return _mm256_sincos_ps(reinterpret_cast<__m256 *>(cos), a);
  }
  static fscal reduce_add(const fvec &a) {
    __m256 t1 = _mm256_hadd_ps(a, a);
    __m128 t2 = _mm256_extractf128_ps(t1, 1);
    __m128 t3 = _mm256_castps256_ps128(t1);
    __m128 t4 = _mm_add_ps(t2, t3);
    __m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse
    return _mm_cvtss_f32(_mm_add_ps(t4, t5));
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i ahi = _mm256_extractf128_si256(a, 1);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i bhi = _mm256_extractf128_si256(b, 1);
    __m128i rlo = _mm_mullo_epi32(alo, blo);
    __m128i rhi = _mm_mullo_epi32(ahi, bhi);
    return _mm256_setr_m128i(rlo, rhi);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    iarr result;
    iarr src;
    iarr idxs;
    _mm256_store_si256(reinterpret_cast<__m256i *>(idxs), idx);
    _mm256_store_si256(reinterpret_cast<__m256i *>(src), from);
    for (int i = 0; i < VL; i++) {
      result[i] = mask_test_at(mask, i)
        ? *reinterpret_cast<const int *>(
              reinterpret_cast<const char *>(base) + scale * idxs[i])
        : src[i];
    }
    return _mm256_load_si256(reinterpret_cast<__m256i *>(result));
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  static void store(void *at, const fvec &a) {
    _mm256_store_ps(reinterpret_cast<float *>(at), a);
  }
  static void int_store(void *at, const ivec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
  }
  static void mask_store(int *at, const bvec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm256_min_ps(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return reinterpret_cast<const int *>(&mask)[at];
  }
  static bool mask_testz(const bvec &mask) {
    return _mm256_testz_si256(mask, mask);
  }
  static bvec mask_enable_lower(int n) {
    static const int base[8] __attribute__((aligned(64))) =
        {0, 1, 2, 3, 4, 5, 6, 7};
    return int_cmplt(ivec(base), ivec(n));
  }
  static ivec int_load_vl(const int *a) {
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
  }
  static void int_clear_arr(int *a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
  }
  static bvec full_mask() {
    return bvec(0xFFFFFFFF);
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
    for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
    printf("\n");
  }
  static fvec cvtdown(const vector_ops<double, AVX>::fvec &lo,
                      const vector_ops<double, AVX>::fvec &hi) {
    __m128 t1 = _mm256_cvtpd_ps(lo);
    __m128 t2 = _mm256_cvtpd_ps(hi);
    return _mm256_setr_m128(t1, t2);
  }
  static vector_ops<double, AVX>::fvec cvtup_lo(const fvec &a) {
    return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
  }
  static vector_ops<double, AVX>::fvec cvtup_hi(const fvec &a) {
    return _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1));
    // permute DCBA -> BADC
  }
  static void mask_cvtup(const bvec &a, vector_ops<double, AVX>::bvec *blo,
                         vector_ops<double, AVX>::bvec *bhi) {
    __m256i t1 = _mm256_castps_si256(_mm256_unpacklo_ps(
        _mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
    __m256i t2 = _mm256_castps_si256(_mm256_unpackhi_ps(
        _mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
    *blo = _mm256_permute2f128_si256(t1, t2, 0x20);
    *bhi = _mm256_permute2f128_si256(t1, t2, 0x31);
  }
};
//////////////////////////////////////////////////////////////////////////////
// AVX2
//////////////////////////////////////////////////////////////////////////////
struct avx2_ivec32 {
  __m256i vec;
  avx2_ivec32() {}
  avx2_ivec32(__m256i m) { vec = m; }
  avx2_ivec32(const int *a) {
    vec = _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
  }
  explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); }
  operator __m256i() const { return vec; }
  friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) {
    return _mm256_and_si256(a, b);
  }
  friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) {
    return _mm256_or_si256(a, b);
  }
  friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) {
    return _mm256_add_epi32(a, b);
  }
};
struct avx2_bvec {
  __m256i vec;
  avx2_bvec() {}
  avx2_bvec(__m256i m) { vec = m; }
  explicit avx2_bvec(int i) { vec = _mm256_set1_epi32(i); }
  operator __m256i() const { return vec; }
  operator __m256d() const { return _mm256_castsi256_pd(vec); }
  operator __m256() const { return _mm256_castsi256_ps(vec); }
  operator F64vec4() const { return _mm256_castsi256_pd(vec); }
  operator F32vec8() const { return _mm256_castsi256_ps(vec); }
  operator avx2_ivec32() const { return vec; }
  friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) {
    return _mm256_and_si256(a, b);
  }
  friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) {
    return _mm256_or_si256(a, b);
  }
  friend avx2_bvec operator ~(const avx2_bvec &a) {
    return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF));
  }
  avx2_bvec &operator &=(const avx2_bvec &a) {
    return *this = _mm256_and_si256(vec, a);
  }
};
template<>
struct vector_ops<double, AVX2> {
  static const int VL = 4;
  typedef double fscal;
  typedef F64vec4 fvec;
  typedef avx2_ivec32 ivec;
  typedef avx2_bvec bvec;
  typedef double farr[4] __attribute__((aligned(32)));
  typedef int iarr[8] __attribute__((aligned(32)));
  static fvec recip(const fvec &a) {
    // newton-raphson
    fvec b = _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a)));
    b = b + b - a * b * b;
    return b + b - a * b * b;
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, const bvec &mask,
                                 const void *base) {
    // nop
  }
  template<int scale>
  static fvec gather(const fvec &from, const bvec &mask, const ivec &idx,
                     const void *base) {
    ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
    ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
    return _mm256_mask_i32gather_pd(from, static_cast<const double *>(base),
                                    _mm256_castsi256_si128(idx1), mask, scale);
  }
  template<class T>
  static void gather_x(const ivec &idx, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
    ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
    *x = _mm256_mask_i32gather_pd(*x, &base->x,
                                  _mm256_castsi256_si128(idx1), mask, 1);
    *y = _mm256_mask_i32gather_pd(*y, &base->y,
                                  _mm256_castsi256_si128(idx1), mask, 1);
    *z = _mm256_mask_i32gather_pd(*z, &base->z,
                                  _mm256_castsi256_si128(idx1), mask, 1);
    *w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1);
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    fvec a = zero(), b = zero(), c = zero(), d = zero();
    gather_4(idxs, mask, base, r0, r1, r2, r3);
    gather_4(idxs, mask, reinterpret_cast<const char *>(base) + 32,
             r4, r5, r6, r7);
  }
  static void gather_4(const ivec &idx, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
    ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
    *r0 = _mm256_mask_i32gather_pd(*r0, static_cast<const double *>(base) + 0,
                                   _mm256_castsi256_si128(idx1), mask, 4);
    *r1 = _mm256_mask_i32gather_pd(*r1, static_cast<const double *>(base) + 1,
                                   _mm256_castsi256_si128(idx1), mask, 4);
    *r2 = _mm256_mask_i32gather_pd(*r2, static_cast<const double *>(base) + 2,
                                   _mm256_castsi256_si128(idx1), mask, 4);
    *r3 = _mm256_mask_i32gather_pd(*r3, static_cast<const double *>(base) + 3,
                                   _mm256_castsi256_si128(idx1), mask, 4);
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return (b & mask) | (a & ~mask);
  }
  static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
    return (b & mask) | (a & ~mask);
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return a * b + c;
  }
  static fvec zero() {
    return _mm256_setzero_pd();
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_NLE_US));
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LE_OS));
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS));
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    return ~bvec(_mm256_cmpeq_epi32(a, b));
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    return _mm256_cmpgt_epi32(b, a);
  }
  static fvec invsqrt(const fvec &a) {
    return _mm256_invsqrt_pd(a);
  }
  static fvec sincos(fvec *cos, const fvec &a) {
    return _mm256_sincos_pd(reinterpret_cast<__m256d *>(cos), a);
  }
  static fscal reduce_add(const fvec &a) {
    __m256d t1 = _mm256_hadd_pd(a, a);
    __m128d t2 = _mm256_extractf128_pd(t1, 1);
    __m128d t3 = _mm256_castpd256_pd128(t1);
    return _mm_cvtsd_f64(_mm_add_pd(t2, t3));
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    return _mm256_mullo_epi32(a, b);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    return _mm256_mask_i32gather_epi32(from, static_cast<const int *>(base),
                                       idx, mask, scale);
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  static void store(void *at, const fvec &a) {
    _mm256_store_pd(reinterpret_cast<double *>(at), a);
  }
  static void int_store(int *at, const ivec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
    at[1] = at[2];
    at[2] = at[4];
    at[3] = at[6];
  }
  static void mask_store(int *at, const bvec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm256_min_pd(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return reinterpret_cast<const int *>(&mask)[2 * at];
  }
  static bool mask_testz(const bvec &mask) {
    return _mm256_testz_si256(mask, mask);
  }
  static bvec mask_enable_lower(int n) {
    static const int base[8] __attribute__((aligned(64))) =
        {0, 0, 1, 1, 2, 2, 3, 3};
    return int_cmplt(ivec(base), ivec(n));
  }
  static ivec int_load_vl(const int *a) {
    __m128i b = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
    __m128i c = _mm_unpacklo_epi32(b, b);
    __m128i d = _mm_unpackhi_epi32(b, b);
    return _mm256_setr_m128i(c, d);
    // stolen from http://stackoverflow.com/questions/6687535/unexpected-result-from-avx-m256-unpack-ps-unpack-intrinsic
  }
  static void int_clear_arr(int *a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
  }
  static bvec full_mask() {
    return bvec(0xFFFFFFFF);
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
    for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
    printf("\n");
  }
};
template
<>
struct
vector_ops
<
float
,
AVX2
>
{
static
const
int
VL
=
8
;
static
const
int
ALIGN
=
32
;
typedef
float
fscal
;
typedef
F32vec8
fvec
;
typedef
avx2_ivec32
ivec
;
typedef
avx2_bvec
bvec
;
typedef
float
farr
[
8
]
__attribute__
((
aligned
(
32
)));
typedef
int
iarr
[
8
]
__attribute__
((
aligned
(
32
)));
static
fvec
recip
(
const
fvec
&
a
)
{
fvec
b
=
_mm256_rcp_ps
(
a
);
b
=
b
+
b
-
a
*
b
*
b
;
return
b
+
b
-
a
*
b
*
b
;
}
template
<
int
scale
>
static
void
gather_prefetch_t0
(
const
ivec
&
idx
,
bvec
mask
,
const
void
*
base
)
{
// nop
}
template
<
int
scale
>
static
fvec
gather
(
const
fvec
&
from
,
bvec
mask
,
const
ivec
&
idx
,
const
void
*
base
)
{
return
_mm256_mask_i32gather_ps
(
from
,
static_cast
<
const
float
*>
(
base
),
idx
,
mask
,
scale
);
}
template
<
class
T
>
static
void
gather_x
(
const
ivec
&
idx
,
const
bvec
&
mask
,
const
T
*
base
,
fvec
*
x
,
fvec
*
y
,
fvec
*
z
,
ivec
*
w
)
{
*
x
=
_mm256_mask_i32gather_ps
(
*
x
,
reinterpret_cast
<
const
float
*>
(
base
)
+
0
,
idx
,
mask
,
1
);
*
y
=
_mm256_mask_i32gather_ps
(
*
y
,
reinterpret_cast
<
const
float
*>
(
base
)
+
1
,
idx
,
mask
,
1
);
*
z
=
_mm256_mask_i32gather_ps
(
*
z
,
reinterpret_cast
<
const
float
*>
(
base
)
+
2
,
idx
,
mask
,
1
);
*
w
=
_mm256_mask_i32gather_epi32
(
*
w
,
reinterpret_cast
<
const
int
*>
(
base
)
+
3
,
idx
,
mask
,
1
);
}
static
void
gather_8
(
const
ivec
&
idxs
,
const
bvec
&
mask
,
const
void
*
base
,
fvec
*
r0
,
fvec
*
r1
,
fvec
*
r2
,
fvec
*
r3
,
fvec
*
r4
,
fvec
*
r5
,
fvec
*
r6
,
fvec
*
r7
)
{
*
r0
=
gather
<
4
>
(
*
r0
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
0
);
*
r1
=
gather
<
4
>
(
*
r1
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
4
);
*
r2
=
gather
<
4
>
(
*
r2
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
8
);
*
r3
=
gather
<
4
>
(
*
r3
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
12
);
*
r4
=
gather
<
4
>
(
*
r4
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
16
);
*
r5
=
gather
<
4
>
(
*
r5
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
20
);
*
r6
=
gather
<
4
>
(
*
r6
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
24
);
*
r7
=
gather
<
4
>
(
*
r7
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
28
);
}
static
void
gather_4
(
const
ivec
&
idxs
,
const
bvec
&
mask
,
const
void
*
base
,
fvec
*
r0
,
fvec
*
r1
,
fvec
*
r2
,
fvec
*
r3
)
{
*
r0
=
gather
<
4
>
(
*
r0
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
0
);
*
r1
=
gather
<
4
>
(
*
r1
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
4
);
*
r2
=
gather
<
4
>
(
*
r2
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
8
);
*
r3
=
gather
<
4
>
(
*
r3
,
mask
,
idxs
,
reinterpret_cast
<
const
char
*>
(
base
)
+
12
);
}
static
fvec
blend
(
const
bvec
&
mask
,
const
fvec
&
a
,
const
fvec
&
b
)
{
return
(
b
&
mask
)
|
(
a
&
~
mask
);
}
static
ivec
int_blend
(
const
bvec
&
mask
,
const
ivec
&
a
,
const
ivec
&
b
)
{
return
(
b
&
mask
)
|
(
a
&
~
mask
);
}
static
fvec
fmadd
(
const
fvec
&
a
,
const
fvec
&
b
,
const
fvec
&
c
)
{
return
a
*
b
+
c
;
}
static
fvec
zero
()
{
return
_mm256_setzero_ps
();
}
static
bvec
cmpeq
(
const
fvec
&
a
,
const
fvec
&
b
)
{
return
_mm256_castps_si256
(
_mm256_cmp_ps
(
a
,
b
,
_CMP_EQ_OQ
));
}
static
bvec
cmpnle
(
const
fvec
&
a
,
const
fvec
&
b
)
{
return
_mm256_castps_si256
(
_mm256_cmp_ps
(
a
,
b
,
_CMP_NLE_US
));
}
static
bvec
cmple
(
const
fvec
&
a
,
const
fvec
&
b
)
{
return
_mm256_castps_si256
(
_mm256_cmp_ps
(
a
,
b
,
_CMP_LE_OS
));
}
static
bvec
cmplt
(
const
fvec
&
a
,
const
fvec
&
b
)
{
return
_mm256_castps_si256
(
_mm256_cmp_ps
(
a
,
b
,
_CMP_LT_OS
));
}
static
bvec
int_cmpneq
(
const
ivec
&
a
,
const
ivec
&
b
)
{
return
~
bvec
(
_mm256_cmpeq_epi32
(
a
,
b
));
}
static
bvec
int_cmplt
(
const
ivec
&
a
,
const
ivec
&
b
)
{
return
_mm256_cmpgt_epi32
(
b
,
a
);
}
static
fvec
invsqrt
(
const
fvec
&
a
)
{
return
_mm256_invsqrt_ps
(
a
);
}
static
fvec
sincos
(
fvec
*
cos
,
const
fvec
&
a
)
{
return
_mm256_sincos_ps
(
reinterpret_cast
<
__m256
*>
(
cos
),
a
);
}
static
fscal
reduce_add
(
const
fvec
&
a
)
{
__m256
t1
=
_mm256_hadd_ps
(
a
,
a
);
__m128
t2
=
_mm256_extractf128_ps
(
t1
,
1
);
__m128
t3
=
_mm256_castps256_ps128
(
t1
);
__m128
t4
=
_mm_add_ps
(
t2
,
t3
);
__m128
t5
=
_mm_permute_ps
(
t4
,
0x1B
);
// 0x1B = reverse
return
_mm_cvtss_f32
(
_mm_add_ps
(
t4
,
t5
));
}
  static ivec int_mullo(const ivec &a, const ivec &b) {
    return _mm256_mullo_epi32(a, b);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    return _mm256_mask_i32gather_epi32(from, static_cast<const int *>(base),
                                       idx, mask, scale);
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return ((a + b) & mask) | (src & ~mask);
  }
  static void store(void *at, const fvec &a) {
    _mm256_store_ps(reinterpret_cast<float *>(at), a);
  }
  static void int_store(void *at, const ivec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
  }
  static void mask_store(int *at, const bvec &a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(at), a);
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm256_min_ps(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return reinterpret_cast<const int *>(&mask)[at];
  }
  static bool mask_testz(const bvec &mask) {
    return _mm256_testz_si256(mask, mask);
  }
  static bvec mask_enable_lower(int n) {
    static const int base[8] __attribute__((aligned(64))) =
      {0, 1, 2, 3, 4, 5, 6, 7};
    return int_cmplt(ivec(base), ivec(n));
  }
  static ivec int_load_vl(const int *a) {
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
  }
  static void int_clear_arr(int *a) {
    _mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
  }
  static bvec full_mask() {
    return bvec(0xFFFFFFFF);
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
    // an AVX2 integer vector holds 8 lanes; iterating to 16 (as the
    // original did) would read past the end of tmp
    for (int i = 0; i < 8; i++)
      printf("%d ", tmp[i]);
    printf("\n");
  }
  static fvec cvtdown(const vector_ops<double, AVX2>::fvec &lo,
                      const vector_ops<double, AVX2>::fvec &hi) {
    __m128 t1 = _mm256_cvtpd_ps(lo);
    __m128 t2 = _mm256_cvtpd_ps(hi);
    return _mm256_setr_m128(t1, t2);
  }
  static vector_ops<double, AVX2>::fvec cvtup_lo(const fvec &a) {
    return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
  }
  static vector_ops<double, AVX2>::fvec cvtup_hi(const fvec &a) {
    return _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1)); // upper 128 bits
  }
  static void mask_cvtup(const bvec &a, vector_ops<double, AVX2>::bvec *blo,
                         vector_ops<double, AVX2>::bvec *bhi) {
    __m256i t1 = _mm256_castps_si256(_mm256_unpacklo_ps(
      _mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
    __m256i t2 = _mm256_castps_si256(_mm256_unpackhi_ps(
      _mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
    *blo = _mm256_permute2f128_si256(t1, t2, 0x20);
    *bhi = _mm256_permute2f128_si256(t1, t2, 0x31);
  }
};
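// A minimal usage sketch for the AVX2 specialization above (illustrative
// only; `rsq` and `cutsq` are hypothetical inputs, not names used by
// this file):
//
//   typedef vector_ops<float, AVX2> v;
//   v::fvec rsq = ..., cutsq = ...;
//   v::bvec inside = v::cmplt(rsq, cutsq);        // lanes within cutoff
//   v::fvec rinv = v::blend(inside, v::zero(), v::invsqrt(rsq));
//   float total = v::reduce_add(rinv);            // horizontal sum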
//////////////////////////////////////////////////////////////////////////////
// SSE
//////////////////////////////////////////////////////////////////////////////
#pragma pack(push,16)
struct ivec32x4 {
  __m128i vec;
  ivec32x4() {}
  ivec32x4(__m128i m) { vec = m; }
  ivec32x4(const int *a) {
    vec = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
  }
  explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); }
  operator __m128i() const { return vec; }
  friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) {
    return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a),
                                       _mm_castsi128_pd(b)));
  }
  friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) {
    return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a),
                                      _mm_castsi128_pd(b)));
  }
  friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) {
    return _mm_add_epi32(a, b);
  }
};
struct sse_bvecx4 {
  __m128i vec;
  sse_bvecx4() {}
  sse_bvecx4(__m128i m) { vec = m; }
  explicit sse_bvecx4(int i) { vec = _mm_set1_epi32(i); }
  operator __m128i() const { return vec; }
  operator F64vec2() const { return _mm_castsi128_pd(vec); }
  operator ivec32x4() const { return vec; }
  friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) {
    return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a),
                                       _mm_castsi128_pd(b)));
  }
  friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) {
    return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a),
                                      _mm_castsi128_pd(b)));
  }
  friend sse_bvecx4 operator ~(const sse_bvecx4 &a) {
    return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a),
      _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF))));
  }
  sse_bvecx4 &operator &=(const sse_bvecx4 &a) {
    return *this = _mm_and_si128(vec, a);
  }
};
#pragma pack(pop)
template<>
struct vector_ops<double, SSE> {
  static const int VL = 2;
  typedef double fscal;
  typedef F64vec2 fvec;
  typedef ivec32x4 ivec;
  typedef sse_bvecx4 bvec;
  typedef double farr[2] __attribute__((aligned(16)));
  typedef int iarr[4] __attribute__((aligned(16)));
  static fvec recip(const fvec &a) {
    fvec b = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(a)));
    b = b + b - a * b * b;
    return b + b - a * b * b;
  }
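  // recip() refines the ~12-bit rcpps estimate with two Newton-Raphson
  // steps, b <- b * (2 - a * b), written above as b + b - a*b*b; each
  // step roughly doubles the number of correct bits.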
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, const bvec &mask,
                                 const void *base) {
    // nop
  }
  template<int scale>
  static fvec gather(const fvec &from, const bvec &mask, const ivec &idx,
                     const void *base) {
    fvec res = from;
    if (_mm_extract_epi32(mask, 0)) {
      res = _mm_loadl_pd(res, reinterpret_cast<const double *>(
        reinterpret_cast<const char *>(base) +
        scale * _mm_extract_epi32(idx, 0)));
    }
    if (_mm_extract_epi32(mask, 2)) {
      res = _mm_loadh_pd(res, reinterpret_cast<const double *>(
        reinterpret_cast<const char *>(base) +
        scale * _mm_extract_epi32(idx, 2)));
    }
    return res;
  }
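  // SSE has no gather instruction, so gather() emulates one: loadl_pd and
  // loadh_pd fill the low and high double independently, each guarded by
  // its lane of the mask (the lane flags live in 32-bit elements 0 and 2,
  // since each double spans two 32-bit slots of the mask).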
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    __m128d a0lo, a0hi, a1lo, a1hi;
    if (_mm_extract_epi32(mask, 0)) {
      a0lo = _mm_load_pd(reinterpret_cast<const double *>(
        &base[_mm_extract_epi32(idxs, 0) / 32]));
      a0hi = _mm_load_pd(reinterpret_cast<const double *>(
        &base[_mm_extract_epi32(idxs, 0) / 32].z));
    }
    if (_mm_extract_epi32(mask, 2)) {
      a1lo = _mm_load_pd(reinterpret_cast<const double *>(
        &base[_mm_extract_epi32(idxs, 2) / 32]));
      a1hi = _mm_load_pd(reinterpret_cast<const double *>(
        &base[_mm_extract_epi32(idxs, 2) / 32].z));
    }
    __m128d c0 = _mm_unpacklo_pd(a0lo, a1lo);
    __m128d c1 = _mm_unpackhi_pd(a0lo, a1lo);
    __m128d c2 = _mm_unpacklo_pd(a0hi, a1hi);
    __m128d c3 = _mm_unpackhi_pd(a0hi, a1hi);
    *x = blend(mask, *x, c0);
    *y = blend(mask, *y, c1);
    *z = blend(mask, *z, c2);
    *w = int_blend(mask, *w,
                   _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0));
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    fvec a = zero(), b = zero(), c = zero(), d = zero();
    gather_4(idxs, mask, base, r0, r1, r2, r3);
    gather_4(idxs, mask, reinterpret_cast<const char *>(base) + 32,
             r4, r5, r6, r7);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    *r0 = gather<4>(*r0, mask, idxs,
                    reinterpret_cast<const char *>(base) + 0);
    *r1 = gather<4>(*r1, mask, idxs,
                    reinterpret_cast<const char *>(base) + 8);
    *r2 = gather<4>(*r2, mask, idxs,
                    reinterpret_cast<const char *>(base) + 16);
    *r3 = gather<4>(*r3, mask, idxs,
                    reinterpret_cast<const char *>(base) + 24);
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return (b & _mm_castsi128_pd(mask)) |
           _mm_andnot_pd(_mm_castsi128_pd(mask), a);
  }
  static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
    return (b & mask) | _mm_andnot_si128(mask, a);
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return a * b + c;
  }
  static fvec zero() {
    return _mm_setzero_pd();
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return _mm_castpd_si128(_mm_cmpeq_pd(a, b));
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return _mm_castpd_si128(_mm_cmpnle_pd(a, b));
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return _mm_castpd_si128(_mm_cmple_pd(a, b));
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return _mm_castpd_si128(_mm_cmplt_pd(a, b));
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    __m128i we = _mm_undefined_si128();
    return _mm_andnot_si128(_mm_cmpeq_epi32(a, b), _mm_cmpeq_epi8(we, we));
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    return _mm_cmplt_epi32(a, b);
  }
  static fvec invsqrt(const fvec &a) {
    return _mm_invsqrt_pd(a);
  }
  static fvec sincos(fvec *cos, const fvec &a) {
    return _mm_sincos_pd(reinterpret_cast<__m128d *>(cos), a);
  }
  static fscal reduce_add(const fvec &a) {
    __m128d t1 = _mm_shuffle_pd(a, a, 1); // reverse vector
    return _mm_cvtsd_f64(_mm_add_pd(t1, a));
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    return _mm_mullo_epi32(a, b);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return ((a + b) & mask) | _mm_andnot_si128(mask, src);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    iarr result;
    iarr src;
    iarr idxs, m;
    _mm_store_si128(reinterpret_cast<__m128i *>(idxs), idx);
    _mm_store_si128(reinterpret_cast<__m128i *>(src), from);
    _mm_store_si128(reinterpret_cast<__m128i *>(m), mask);
    for (int i = 0; i < VL; i++) {
      int tmp;
      if (m[2 * i]) {
        tmp = *reinterpret_cast<const int *>(
          reinterpret_cast<const char *>(base) + scale * idxs[2 * i]);
      } else {
        tmp = src[2 * i];
      }
      result[2 * i] = tmp;
      result[2 * i + 1] = tmp;
    }
    return _mm_load_si128(reinterpret_cast<__m128i *>(result));
  }
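  // int_gather() above spills the vectors to aligned scratch arrays and
  // gathers lane by lane in scalar code; each 32-bit result is written
  // twice so the integer lanes stay aligned with the 64-bit double lanes.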
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return ((a + b) & _mm_castsi128_pd(mask)) |
           _mm_andnot_pd(_mm_castsi128_pd(mask), src);
  }
  static void store(void *at, const fvec &a) {
    _mm_store_pd(reinterpret_cast<double *>(at), a);
  }
  static void int_store(int *at, const ivec &a) {
    _mm_store_si128(reinterpret_cast<__m128i *>(at), a);
    at[1] = at[2];
  }
  static void mask_store(int *at, const bvec &a) {
    _mm_store_si128(reinterpret_cast<__m128i *>(at), a);
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm_min_pd(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return reinterpret_cast<const int *>(&mask)[2 * at];
  }
  static bool mask_testz(const bvec &mask) {
    return _mm_testz_si128(mask, mask);
  }
  static bvec mask_enable_lower(int n) {
    static const int base[4] __attribute__((aligned(32))) = {0, 0, 1, 1};
    return int_cmplt(ivec(base), ivec(n));
  }
  static ivec int_load_vl(const int *a) {
    __m128i b = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
    return _mm_unpacklo_epi32(b, b);
  }
  static void int_clear_arr(int *a) {
    _mm_store_si128(reinterpret_cast<__m128i *>(a), ivec(0));
  }
  static bvec full_mask() {
    return bvec(0xFFFFFFFF);
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm_store_si128(reinterpret_cast<__m128i *>(tmp), a);
    for (int i = 0; i < 4; i++)
      printf("%d ", tmp[i]);
    printf("\n");
  }
};
template<>
struct vector_ops<float, SSE> {
  static const int VL = 4;
  static const int ALIGN = 16;
  typedef float fscal;
  typedef F32vec4 fvec;
  typedef ivec32x4 ivec;
  typedef sse_bvecx4 bvec;
  typedef float farr[4] __attribute__((aligned(16)));
  typedef int iarr[4] __attribute__((aligned(16)));
  static fvec recip(const fvec &a) {
    fvec b = _mm_rcp_ps(a);
    b = b + b - a * b * b;
    return b + b - a * b * b;
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, const bvec &mask,
                                 const void *base) {
    // nop
  }
  template<int scale>
  static fvec gather(const fvec &from, const bvec &mask, const ivec &idx,
                     const void *base) {
    farr result;
    farr src;
    iarr idxs, m;
    mask_store(m, mask);
    _mm_store_si128(reinterpret_cast<__m128i *>(idxs), idx);
    _mm_store_ps(reinterpret_cast<float *>(src), from);
    for (int i = 0; i < VL; i++) {
      result[i] = m[i]
        ? *reinterpret_cast<const float *>(
            reinterpret_cast<const char *>(base) + scale * idxs[i])
        : src[i];
    }
    return _mm_load_ps(reinterpret_cast<float *>(result));
  }
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    *x = gather<1>(*x, mask, idxs, &base->x);
    *y = gather<1>(*y, mask, idxs, &base->y);
    *z = gather<1>(*z, mask, idxs, &base->z);
    *w = int_gather<1>(*w, mask, idxs, &base->w);
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    fvec a = zero(), b = zero(), c = zero(), d = zero();
    gather_4(idxs, mask, base, r0, r1, r2, r3);
    gather_4(idxs, mask, reinterpret_cast<const char *>(base) + 16,
             r4, r5, r6, r7);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    *r0 = gather<4>(*r0, mask, idxs,
                    reinterpret_cast<const char *>(base) + 0);
    *r1 = gather<4>(*r1, mask, idxs,
                    reinterpret_cast<const char *>(base) + 4);
    *r2 = gather<4>(*r2, mask, idxs,
                    reinterpret_cast<const char *>(base) + 8);
    *r3 = gather<4>(*r3, mask, idxs,
                    reinterpret_cast<const char *>(base) + 12);
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return (b & _mm_castsi128_ps(mask)) |
           _mm_andnot_ps(_mm_castsi128_ps(mask), a);
  }
  static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
    return (b & mask) | _mm_andnot_si128(mask, a);
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return a * b + c;
  }
  static fvec zero() {
    return _mm_setzero_ps();
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return _mm_castps_si128(_mm_cmpeq_ps(a, b));
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return _mm_castps_si128(_mm_cmpnle_ps(a, b));
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return _mm_castps_si128(_mm_cmple_ps(a, b));
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return _mm_castps_si128(_mm_cmplt_ps(a, b));
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    __m128i we = _mm_undefined_si128();
    return _mm_andnot_si128(_mm_cmpeq_epi32(a, b), _mm_cmpeq_epi8(we, we));
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    return _mm_cmplt_epi32(a, b);
  }
  static fvec invsqrt(const fvec &a) {
    return _mm_invsqrt_ps(a);
  }
  static fvec sincos(fvec *cos, const fvec &a) {
    return _mm_sincos_ps(reinterpret_cast<__m128 *>(cos), a);
  }
  static fscal reduce_add(const fvec &a) {
    __m128 t1 = _mm_hadd_ps(a, a);
    __m128 t2 = _mm_shuffle_ps(t1, t1, 0x1B); // reverse
    return _mm_cvtss_f32(_mm_add_ps(t1, t2));
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    return _mm_mullo_epi32(a, b);
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return ((a + b) & mask) | _mm_andnot_si128(mask, src);
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    iarr result;
    iarr src;
    iarr idxs, m;
    _mm_store_si128(reinterpret_cast<__m128i *>(idxs), idx);
    _mm_store_si128(reinterpret_cast<__m128i *>(src), from);
    _mm_store_si128(reinterpret_cast<__m128i *>(m), mask);
    for (int i = 0; i < VL; i++) {
      int tmp;
      if (m[i]) {
        tmp = *reinterpret_cast<const int *>(
          reinterpret_cast<const char *>(base) + scale * idxs[i]);
      } else {
        tmp = src[i];
      }
      result[i] = tmp;
    }
    return _mm_load_si128(reinterpret_cast<__m128i *>(result));
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return ((a + b) & _mm_castsi128_ps(mask)) |
           _mm_andnot_ps(_mm_castsi128_ps(mask), src);
  }
  static void store(void *at, const fvec &a) {
    _mm_store_ps(reinterpret_cast<float *>(at), a);
  }
  static void int_store(int *at, const ivec &a) {
    _mm_store_si128(reinterpret_cast<__m128i *>(at), a);
  }
  static void mask_store(int *at, const bvec &a) {
    _mm_store_si128(reinterpret_cast<__m128i *>(at), a);
  }
  static fvec min(const fvec &a, const fvec &b) {
    return _mm_min_ps(a, b);
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return reinterpret_cast<const int *>(&mask)[at];
  }
  static bool mask_testz(const bvec &mask) {
    return _mm_testz_si128(mask, mask);
  }
  static bvec mask_enable_lower(int n) {
    static const int base[4] __attribute__((aligned(32))) = {0, 1, 2, 3};
    return int_cmplt(ivec(base), ivec(n));
  }
  static ivec int_load_vl(const int *a) {
    return _mm_load_si128(reinterpret_cast<const __m128i *>(a));
  }
  static void int_clear_arr(int *a) {
    _mm_store_si128(reinterpret_cast<__m128i *>(a), ivec(0));
  }
  static bvec full_mask() {
    return bvec(0xFFFFFFFF);
  }
  static void int_print(const ivec &a) {
    iarr tmp;
    _mm_store_si128(reinterpret_cast<__m128i *>(tmp), a);
    for (int i = 0; i < 4; i++)
      printf("%d ", tmp[i]);
    printf("\n");
  }
  static fvec cvtdown(const vector_ops<double, SSE>::fvec &lo,
                      const vector_ops<double, SSE>::fvec &hi) {
    __m128 t1 = _mm_cvtpd_ps(lo);
    __m128 t2 = _mm_cvtpd_ps(hi);
    return _mm_shuffle_ps(t1, t2, 0x44); // t2[1]t2[0]t1[1]t1[0]
  }
  static vector_ops<double, SSE>::fvec cvtup_lo(const fvec &a) {
    return _mm_cvtps_pd(a);
  }
  static vector_ops<double, SSE>::fvec cvtup_hi(const fvec &a) {
    return _mm_cvtps_pd(_mm_shuffle_ps(a, a, 0x4E)); // permute DCBA -> BADC
  }
  static void mask_cvtup(const bvec &a, vector_ops<double, SSE>::bvec *blo,
                         vector_ops<double, SSE>::bvec *bhi) {
    *blo = _mm_unpacklo_epi32(a, a);
    *bhi = _mm_unpackhi_epi32(a, a);
  }
};
#endif
// Scalar implementation
template<class flt_t>
struct vector_ops<flt_t, NONE> {
  static const int VL = 1;
  static const int ALIGN = 4;
  typedef flt_t fscal;
  typedef flt_t fvec;
  typedef int ivec;
  typedef bool bvec;
  typedef flt_t farr[1];
  typedef int iarr[1];
  static fvec recip(const fvec &a) {
    return ((flt_t) 1.) / a;
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, const bvec &mask,
                                 const void *base) {
    // nop
  }
  template<int scale>
  static fvec gather(const fvec &from, const bvec &mask, const ivec &idx,
                     const void *base) {
    return mask ? *reinterpret_cast<const flt_t *>(
      reinterpret_cast<const char *>(base) + scale * idx) : from;
  }
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    *x = gather<1>(*x, mask, idxs, &base->x);
    *y = gather<1>(*y, mask, idxs, &base->y);
    *z = gather<1>(*z, mask, idxs, &base->z);
    *w = int_gather<1>(*w, mask, idxs, &base->w);
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    fvec a = zero(), b = zero(), c = zero(), d = zero();
    gather_4(idxs, mask, base, r0, r1, r2, r3);
    gather_4(idxs, mask, reinterpret_cast<const char *>(base) +
             4 * sizeof(fscal), r4, r5, r6, r7);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    *r0 = gather<4>(*r0, mask, idxs,
                    reinterpret_cast<const char *>(base) + 0 * sizeof(fscal));
    *r1 = gather<4>(*r1, mask, idxs,
                    reinterpret_cast<const char *>(base) + 1 * sizeof(fscal));
    *r2 = gather<4>(*r2, mask, idxs,
                    reinterpret_cast<const char *>(base) + 2 * sizeof(fscal));
    *r3 = gather<4>(*r3, mask, idxs,
                    reinterpret_cast<const char *>(base) + 3 * sizeof(fscal));
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    return mask ? b : a;
  }
  static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
    return mask ? b : a;
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    return a * b + c;
  }
  static fvec zero() {
    return 0.;
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    return a == b;
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    return !(a <= b);
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    return a <= b;
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    return a < b;
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    return a != b;
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    return a < b;
  }
  static fvec invsqrt(const fvec &a) {
    return 1. / sqrt(a);
  }
  static fvec sincos(fvec *c, const fvec &a) {
    *c = cos(a);
    return sin(a);
  }
  static fscal reduce_add(const fvec &a) {
    return a;
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    return a * b;
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    return mask ? a + b : src;
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    return mask ? *reinterpret_cast<const int *>(
      reinterpret_cast<const char *>(base) + scale * idx) : from;
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    return mask ? a + b : src;
  }
  static void store(void *at, const fvec &a) {
    *reinterpret_cast<flt_t *>(at) = a;
  }
  static void int_store(int *at, const ivec &a) {
    *reinterpret_cast<int *>(at) = a;
  }
  static void mask_store(int *at, const bvec &a) {
    *at = a;
  }
  static fvec min(const fvec &a, const fvec &b) {
    return a < b ? a : b;
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return mask;
  }
  static bool mask_testz(const bvec &mask) {
    return !mask;
  }
  static bvec mask_enable_lower(int n) {
    return n > 0;
  }
  static ivec int_load_vl(const int *a) {
    return *a;
  }
  static void int_clear_arr(int *a) {
    *a = 0;
  }
  static bvec full_mask() {
    return true;
  }
  static void int_print(const ivec &a) {}
};
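// The scalar specialization above can double as reference semantics when
// debugging a vectorized kernel: with VL = 1 every masked vector
// operation collapses to a plain conditional, e.g. mask_add(src, m, a, b)
// is exactly (m ? a + b : src), so a kernel templated on vector_ops can
// be run lane-by-lane and compared against a SIMD build.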
// Array notation implementation
template<class flt_t>
struct vector_ops<flt_t, AN> {
  static const int VL = 4;
  typedef flt_t fscal;
  typedef lmp_intel_an_fvec<VL, fscal> fvec;
  typedef lmp_intel_an_ivec<VL> ivec;
  typedef lmp_intel_an_bvec<VL> bvec;
  typedef flt_t farr[VL];
  typedef int iarr[VL];
  static fvec recip(const fvec &a) {
    fvec ret;
    ret.data[:] = ((fscal) 1.) / a.data[:];
    return ret;
  }
  template<int scale>
  static void gather_prefetch_t0(const ivec &idx, const bvec &mask,
                                 const void *base) {
    // nop
  }
  template<int scale>
  static fvec gather(const fvec &from, const bvec &mask, const ivec &idx,
                     const void *base) {
    fvec ret = from;
    if (mask.data[:])
      ret.data[:] = *reinterpret_cast<const fscal *>(
        reinterpret_cast<const char *>(base) + scale * idx.data[:]);
    return ret;
  }
  template<class T>
  static void gather_x(const ivec &idxs, const bvec &mask, const T *base,
                       fvec *x, fvec *y, fvec *z, ivec *w) {
    *x = gather<1>(*x, mask, idxs, &base->x);
    *y = gather<1>(*y, mask, idxs, &base->y);
    *z = gather<1>(*z, mask, idxs, &base->z);
    *w = int_gather<1>(*w, mask, idxs, &base->w);
  }
  static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3,
                       fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
    fvec a = zero(), b = zero(), c = zero(), d = zero();
    gather_4(idxs, mask, base, r0, r1, r2, r3);
    gather_4(idxs, mask, reinterpret_cast<const char *>(base) +
             4 * sizeof(fscal), r4, r5, r6, r7);
  }
  static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
                       fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
    *r0 = gather<4>(*r0, mask, idxs,
                    reinterpret_cast<const char *>(base) + 0 * sizeof(fscal));
    *r1 = gather<4>(*r1, mask, idxs,
                    reinterpret_cast<const char *>(base) + 1 * sizeof(fscal));
    *r2 = gather<4>(*r2, mask, idxs,
                    reinterpret_cast<const char *>(base) + 2 * sizeof(fscal));
    *r3 = gather<4>(*r3, mask, idxs,
                    reinterpret_cast<const char *>(base) + 3 * sizeof(fscal));
  }
  static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
    fvec ret = a;
    if (mask.data[:]) ret.data[:] = b.data[:];
    return ret;
  }
  static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
    ivec ret = a; // was fvec, which would have round-tripped ints via float
    if (mask.data[:]) ret.data[:] = b.data[:];
    return ret;
  }
  static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
    fvec ret;
    ret.data[:] = a.data[:] * b.data[:] + c.data[:];
    return ret;
  }
  static fvec zero() {
    return fvec(0.);
  }
  static bvec cmpeq(const fvec &a, const fvec &b) {
    bvec ret;
    ret.data[:] = a.data[:] == b.data[:];
    return ret;
  }
  static bvec cmpnle(const fvec &a, const fvec &b) {
    bvec ret;
    ret.data[:] = !(a.data[:] <= b.data[:]);
    return ret;
  }
  static bvec cmple(const fvec &a, const fvec &b) {
    bvec ret;
    ret.data[:] = a.data[:] <= b.data[:];
    return ret;
  }
  static bvec cmplt(const fvec &a, const fvec &b) {
    bvec ret;
    ret.data[:] = a.data[:] < b.data[:];
    return ret;
  }
  static bvec int_cmpneq(const ivec &a, const ivec &b) {
    bvec ret;
    ret.data[:] = a.data[:] != b.data[:];
    return ret;
  }
  static bvec int_cmplt(const ivec &a, const ivec &b) {
    bvec ret;
    ret.data[:] = a.data[:] < b.data[:];
    return ret;
  }
  static fvec invsqrt(const fvec &a) {
    fvec ret;
    ret.data[:] = ((fscal) 1.) / sqrt(a.data[:]);
    return ret;
  }
  static fvec sincos(fvec *c, const fvec &a) {
    c->data[:] = cos(a.data[:]);
    fvec ret;
    ret.data[:] = sin(a.data[:]);
    return ret;
  }
  static fscal reduce_add(const fvec &a) {
    return __sec_reduce_add(a.data[:]);
  }
  static ivec int_mullo(const ivec &a, const ivec &b) {
    ivec ret;
    ret.data[:] = a.data[:] * b.data[:];
    return ret;
  }
  static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a,
                           const ivec &b) {
    ivec ret = src;
    if (mask.data[:]) ret.data[:] = a.data[:] + b.data[:];
    return ret;
  }
  template<int scale>
  static ivec int_gather(const ivec &from, bvec mask, const ivec &idx,
                         const void *base) {
    ivec ret = from;
    if (mask.data[:])
      ret.data[:] = reinterpret_cast<const int *>(base)
        [scale * idx.data[:] / sizeof(int)];
    return ret;
  }
  static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a,
                       const fvec &b) {
    fvec ret = src;
    if (mask.data[:]) ret.data[:] = a.data[:] + b.data[:];
    return ret;
  }
  static void store(void *at, const fvec &a) {
    reinterpret_cast<fscal *>(at)[0:VL] = a.data[:];
  }
  static void int_store(int *at, const ivec &a) {
    reinterpret_cast<int *>(at)[0:VL] = a.data[:];
  }
  static void mask_store(int *at, const bvec &a) {
    at[0:VL] = a.data[:];
  }
  static fvec min(const fvec &a, const fvec &b) {
    fvec ret = b;
    if (a.data[:] < b.data[:]) ret.data[:] = a.data[:];
    return ret;
  }
  static bool mask_test_at(const bvec &mask, int at) {
    return mask.data[at];
  }
  static bool mask_testz(const bvec &mask) {
    return !__sec_reduce_or(mask.data[:]);
  }
  static bvec mask_enable_lower(int n) {
    bvec ret;
    ret.data[:] = __sec_implicit_index(0) < n;
    return ret;
  }
  static ivec int_load_vl(const int *a) {
    return ivec(a);
  }
  static void int_clear_arr(int *a) {
    a[0:VL] = 0;
  }
  static bvec full_mask() {
    return bvec(1);
  }
  static void int_print(const ivec &a) {}
};
// Mixins that implement the accumulator types: same-precision
// (single/single, double/double) and mixed-precision (single/double).
// This one covers the same-precision cases.
template<class BASE_flt_t, CalculationMode BASE_mic>
struct AccumulatorSameMixin {
  typedef vector_ops<BASE_flt_t, BASE_mic> BASE;
  typedef typename BASE::fvec avec;
  typedef typename BASE::farr aarr;
  static avec acc_mask_add(const avec &src, const typename BASE::bvec &m,
                           const avec &a, const typename BASE::fvec &b) {
    return BASE::mask_add(src, m, a, b);
  }
  static typename BASE::fscal acc_reduce_add(const avec &a) {
    return BASE::reduce_add(a);
  }
  static avec acc_zero() {
    return BASE::zero();
  }
  static void acc_store(aarr mem, const avec &a) {
    BASE::store(mem, a);
  }
};
// Mixed precision for cases where double vectors contain fewer elements
// than float vectors: the accumulator is a pair of double vectors that
// together cover one float vector.
template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
struct AccumulatorTwiceMixin {
  typedef vector_ops<BASE_flt_t, mic> BASE;
  typedef vector_ops<HIGH_flt_t, mic> HIGH;
  struct avec_t {
    typename HIGH::fvec lo, hi;
    avec_t(const typename HIGH::fvec &alo, const typename HIGH::fvec &ahi) :
      lo(alo), hi(ahi) {}
    avec_t(const typename BASE::fvec &a) {
      lo = BASE::cvtup_lo(a);
      hi = BASE::cvtup_hi(a);
    }
    friend avec_t operator +(const avec_t &a, const avec_t &b) {
      return avec_t(a.lo + b.lo, a.hi + b.hi);
    }
    friend avec_t operator -(const avec_t &a, const avec_t &b) {
      return avec_t(a.lo - b.lo, a.hi - b.hi);
    }
    friend avec_t operator *(const avec_t &a, const avec_t &b) {
      return avec_t(a.lo * b.lo, a.hi * b.hi);
    }
    operator typename BASE::fvec() const {
      return BASE::cvtdown(lo, hi);
    }
  };
  typedef avec_t avec;
  typedef typename HIGH::fscal aarr[BASE::VL]
    __attribute__((aligned(BASE::ALIGN)));
  static avec acc_mask_add(const avec &src, const typename BASE::bvec &m,
                           const avec &a, const typename BASE::fvec &b) {
    typename HIGH::fvec blo = BASE::cvtup_lo(b);
    typename HIGH::fvec bhi = BASE::cvtup_hi(b);
    typename HIGH::bvec mlo, mhi;
    BASE::mask_cvtup(m, &mlo, &mhi);
    return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo),
                HIGH::mask_add(src.hi, mhi, a.hi, bhi));
  }
  static typename HIGH::fscal acc_reduce_add(const avec &a) {
    return HIGH::reduce_add(a.lo + a.hi);
  }
  static avec acc_zero() {
    return avec(HIGH::zero(), HIGH::zero());
  }
  static void acc_store(aarr mem, const avec &a) {
    HIGH::store(mem, a.lo);
    HIGH::store(mem + BASE::VL / 2, a.hi);
  }
};
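// In mixed precision the per-lane values arrive in float but are
// accumulated in double; acc_mask_add() widens the float addend and its
// mask into two double-width halves before adding. A lane-wise sketch of
// the effect (illustrative only):
//
//   acc[i] = mask[i] ? acc[i] + (double) b[i] : acc[i];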
// For cases where vector_ops<float,x>::VL == vector_ops<double,x>::VL,
// i.e. scalar & AN
template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
struct AccumulatorTwiceMixinNone {
  typedef vector_ops<BASE_flt_t, mic> BASE;
  typedef vector_ops<HIGH_flt_t, mic> HIGH;
  typedef typename HIGH::fvec avec;
  typedef typename HIGH::fscal aarr[BASE::VL];
  static avec acc_mask_add(const avec &src, const typename BASE::bvec &m,
                           const avec &a, const typename BASE::fvec &b) {
    return HIGH::mask_add(src, m, a,
                          static_cast<typename HIGH::fvec>(b));
  }
  static typename HIGH::fscal acc_reduce_add(const avec &a) {
    return HIGH::reduce_add(a);
  }
  static avec acc_zero() {
    return HIGH::zero();
  }
  static void acc_store(aarr mem, const avec &a) {
    HIGH::store(mem, a);
  }
};
// This is the interface that the user ultimately sees.
template<class flt_t, class acc_t, CalculationMode mic>
struct vector_routines {};
template<CalculationMode mic>
struct vector_routines<double, double, mic> :
  public vector_ops<double, mic>,
  public AccumulatorSameMixin<double, mic> {};
template<CalculationMode mic>
struct vector_routines<float, float, mic> :
  public vector_ops<float, mic>,
  public AccumulatorSameMixin<float, mic> {};
template<CalculationMode mic>
struct vector_routines<float, double, mic> :
  public vector_ops<float, mic>,
  public AccumulatorTwiceMixin<float, double, mic> {};
// Specialize for AN and scalar
template<>
struct vector_routines<float, double, NONE> :
  public vector_ops<float, NONE>,
  public AccumulatorTwiceMixinNone<float, double, NONE> {};
template<>
struct vector_routines<float, double, AN> :
  public vector_ops<float, AN>,
  public AccumulatorTwiceMixinNone<float, double, AN> {};
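// A minimal usage sketch of the combined interface (illustrative only;
// the variable names are hypothetical, not used by this file):
//
//   typedef vector_routines<float, double, AVX2> v;
//   v::avec energy = v::acc_zero();
//   // inside the vectorized loop:
//   //   energy = v::acc_mask_add(energy, within_cutoff, energy, evdwl);
//   double total = v::acc_reduce_add(energy);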
} // namespace lmp_intel