/* -*- c++ -*- -----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */
// This file provides an intrinsics abstraction that allows access to the
// underlying SIMD registers.
// It allows for algorithms that are templated over the vector length in use.
// The final interface is provided by vector_routines, which adds support
// for the different precision modes.
// The vector_ops interface provides routines specific to one floating point
// data type, and is specialized for various architectures.
// The routines work best with AVX-512 and AVX2, as both support the gather
// instructions.
// For both AVX and SSE we miss some optimization opportunities in the gather
// implementations.
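// As an illustrative sketch only (not part of the interface itself), a kernel
// templated on vector_ops might accumulate a masked constant and reduce it as
// below; all names exist in this file, only the usage is hypothetical:
//
//   typedef lmp_intel::vector_ops<double, lmp_intel::mode> v;
//   v::fvec acc = v::zero();
//   v::bvec m = v::mask_enable_lower(3);          // enable the first 3 lanes
//   acc = v::mask_add(acc, m, acc, v::fvec(1.));  // add 1.0 where enabled
//   double total = v::reduce_add(acc);            // == 3.0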
// Vector classes provided with the intel compiler
#if defined(__MIC__) && !defined(__AVX512F__)
#include <mic/micvec.h>
#else
#include <dvec.h> // icc-mmic hates generating movq
#include <fvec.h>
#endif
// Vector classes for Cilk array notation
// This is experimental and doesn't yield good code yet
template<int VL, typename fscal>
struct lmp_intel_an_fvec {
fscal data[VL];
lmp_intel_an_fvec() {}
explicit lmp_intel_an_fvec(const fscal f) { data[:] = f; }
explicit lmp_intel_an_fvec(fscal f[VL]) { data[:] = f[:]; }
lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; }
lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; }
const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] += b.data[:];
return ret;
}
const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] -= b.data[:];
return ret;
}
const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] *= b.data[:];
return ret;
}
const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] /= b.data[:];
return ret;
}
lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) {
data[:] += b.data[:]; return *this;
}
lmp_intel_an_fvec& operator -=(const lmp_intel_an_fvec &b) {
data[:] -= b.data[:]; return *this;
}
lmp_intel_an_fvec& operator *=(const lmp_intel_an_fvec &b) {
data[:] *= b.data[:]; return *this;
}
lmp_intel_an_fvec& operator /=(const lmp_intel_an_fvec &b) {
data[:] /= b.data[:]; return *this;
}
friend lmp_intel_an_fvec sqrt(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = sqrt(a.data[:]); return ret;
}
friend lmp_intel_an_fvec exp(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = exp(a.data[:]); return ret;
}
friend lmp_intel_an_fvec sin(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = sin(a.data[:]); return ret;
}
friend lmp_intel_an_fvec invsqrt(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = ((fscal)1.) / sqrt(a.data[:]); return ret;
}
friend lmp_intel_an_fvec pow(const lmp_intel_an_fvec &a, const lmp_intel_an_fvec &b) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = pow(a.data[:], b.data[:]); return ret;
}
lmp_intel_an_fvec operator - () const {
lmp_intel_an_fvec ret; ret.data[:] = - data[:]; return ret;
}
};
template<int VL>
struct lmp_intel_an_ivec {
int data[VL];
lmp_intel_an_ivec() {}
explicit lmp_intel_an_ivec(int i) { data[:] = i; }
explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; }
const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this;
ret.data[:] &= b.data[:];
return ret;
}
const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this;
ret.data[:] |= b.data[:];
return ret;
}
const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this;
ret.data[:] += b.data[:];
return ret;
}
};
template<int VL>
struct lmp_intel_an_bvec {
bool data[VL];
lmp_intel_an_bvec() {}
lmp_intel_an_bvec(const lmp_intel_an_bvec &a) { data[:] = a.data[:]; }
lmp_intel_an_bvec& operator =(const lmp_intel_an_bvec &a) { data[:] = a.data[:]; return *this; }
explicit lmp_intel_an_bvec(int i) { data[:] = i; }
friend lmp_intel_an_bvec operator &(const lmp_intel_an_bvec &a, const lmp_intel_an_bvec &b) __attribute__((always_inline)) {
lmp_intel_an_bvec ret; ret.data[:] = a.data[:] & b.data[:]; return ret;
}
friend lmp_intel_an_bvec operator |(const lmp_intel_an_bvec &a, const lmp_intel_an_bvec &b) __attribute__((always_inline)) {
lmp_intel_an_bvec ret; ret.data[:] = a.data[:] | b.data[:]; return ret;
}
friend lmp_intel_an_bvec operator ~(const lmp_intel_an_bvec &a) __attribute__((always_inline)) {
lmp_intel_an_bvec ret; ret.data[:] = ! a.data[:]; return ret;
}
lmp_intel_an_bvec& operator &=(const lmp_intel_an_bvec &a) __attribute__((always_inline)) {
data[:] &= a.data[:]; return *this;
}
};
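// Illustrative sketch of the array-notation classes (requires icc with Cilk
// array notation; the values are hypothetical):
//
//   lmp_intel_an_fvec<8, double> a(1.), b(4.);
//   a += b;                                       // per-lane add, all lanes 5.0
//   lmp_intel_an_fvec<8, double> r = invsqrt(b);  // all lanes 0.5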
namespace lmp_intel {
// Mostly self-explanatory: KNC=IMCI and AVX-512, NONE=Scalar, AN=Array Notation.
enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN };
#ifdef __MIC__
#ifdef LMP_INTEL_VECTOR_MIC
static const CalculationMode mode = LMP_INTEL_VECTOR_MIC;
#else
static const CalculationMode mode = KNC;
#endif
#else
#ifdef LMP_INTEL_VECTOR_HOST
static const CalculationMode mode = LMP_INTEL_VECTOR_HOST;
#else
#ifdef __AVX512F__
static const CalculationMode mode = KNC;
#else
#ifdef __AVX2__
static const CalculationMode mode = AVX2;
#else
#ifdef __AVX__
static const CalculationMode mode = AVX;
#else
static const CalculationMode mode = SSE;
#endif
#endif
#endif
#endif
#endif
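// The defaults above can be overridden at compile time; e.g. building with
// -DLMP_INTEL_VECTOR_HOST=AVX forces the AVX code path on the host, and
// -DLMP_INTEL_VECTOR_MIC does the same for the coprocessor build.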
// This is used in the selection logic
template<CalculationMode mode>
struct vector_traits {
static const bool support_integer_and_gather_ops = true;
};
template<>
struct vector_traits<AVX> {
static const bool support_integer_and_gather_ops = false;
};
// This is the base template for all the different architectures
// It will get specialized
template<class flt_t, CalculationMode mode>
struct vector_ops {};
// Intrinsic routines for IMCI and AVX-512
#if defined(__MIC__) || defined(__AVX512F__)
// Integer vector class
#pragma pack(push,64)
struct ivec32x16 {
__m512i vec;
ivec32x16() {}
ivec32x16(__m512i m) { vec = m; }
ivec32x16(const int * a) {
vec = _mm512_load_epi32(reinterpret_cast<const int *>(a));
}
explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); }
operator __m512i() const { return vec; }
friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_and_epi32(a, b);
}
friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_or_epi32(a, b);
}
friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_add_epi32(a, b);
}
};
#pragma pack(pop)
// Double precision routines
template<>
struct vector_ops<double, KNC> {
static const int VL = 8;
typedef double fscal;
typedef F64vec8 fvec;
typedef ivec32x16 ivec;
typedef __mmask bvec;
typedef double farr[8] __attribute__((aligned(64)));
typedef int iarr[16] __attribute__((aligned(64)));
static fvec recip(const fvec &a) { return _mm512_recip_pd(a); }
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
_mm512_mask_prefetch_i32gather_ps(idx, mask, base, scale, _MM_HINT_T0);
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_blend_pd(mask, a, b);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return _mm512_fmadd_pd(a, b, c);
}
static fvec zero() {
return _mm512_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_NLE_US);
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_LE_OS);
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_LT_OS);
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return _mm512_cmpneq_epi32_mask(a, b);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm512_cmplt_epi32_mask(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm512_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
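// _mm512_sincos_pd is only available with icc >= 15.0; otherwise compute
// sin and cos separately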
#if __INTEL_COMPILER+0 < 1500
*reinterpret_cast<__m512d *>(cos) = _mm512_cos_pd(a);
return _mm512_sin_pd(a);
#else
return _mm512_sincos_pd(reinterpret_cast<__m512d *>(cos), a);
#endif
}
static fscal reduce_add(const fvec &a) {
return _mm512_reduce_add_pd(a);
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm512_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return _mm512_mask_add_epi32(src, mask, a, b);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32gather_epi32(from, mask, idx, base, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_add_pd(src, mask, a, b);
}
static void store(void *at, const fvec &a) {
_mm512_store_pd(at, a);
}
static void int_store(void *at, const ivec &a) {
_mm512_store_epi32(at, a);
}
static void mask_store(int *at, const bvec &a) {
for (int i = 0; i < 8; i++) {
at[i] = (a >> i) & 1;
}
}
static fvec min(const fvec &a, const fvec &b) {
return _mm512_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return mask & (1 << at);
}
static bool mask_testz(const bvec &mask) {
return mask == 0;
}
static bvec mask_enable_lower(int n) {
return 0xFF >> (VL - n);
}
static ivec int_load_vl(const int *a) {
return _mm512_load_epi32(a);
}
static void int_clear_arr(int *a) {
_mm512_store_epi32(a, ivec(0));
}
static void int_print(const ivec &a) {
iarr tmp;
_mm512_store_epi32(tmp, a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 32);
*r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 40);
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 24);
}
};
template<>
struct vector_ops<float, KNC> {
static const int VL = 16;
static const int ALIGN = 64;
typedef float fscal;
typedef F32vec16 fvec;
typedef ivec32x16 ivec;
typedef __mmask bvec;
typedef float farr[16] __attribute__((aligned(64)));
typedef int iarr[16] __attribute__((aligned(64)));
static const bvec full_mask = 0xFFFF;
static fvec recip(const fvec &a) { return _mm512_recip_ps(a); }
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
_mm512_mask_prefetch_i32gather_ps(idx, mask, base, scale, _MM_HINT_T0);
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32gather_ps(from, mask, idx, base, scale);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_blend_ps(mask, a, b);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return _mm512_fmadd_ps(a, b, c);
}
static fvec zero() {
return _mm512_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm512_cmpeq_ps_mask(a, b);
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm512_cmpnle_ps_mask(a, b);
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm512_cmple_ps_mask(a, b);
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm512_cmplt_ps_mask(a, b);
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return _mm512_cmpneq_epi32_mask(a, b);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm512_cmplt_epi32_mask(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm512_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
#if __INTEL_COMPILER+0 < 1500
*reinterpret_cast<__m512 *>(cos) = _mm512_cos_ps(a);
return _mm512_sin_ps(a);
#else
return _mm512_sincos_ps(reinterpret_cast<__m512 *>(cos), a);
#endif
}
static fscal reduce_add(const fvec &a) {
return _mm512_reduce_add_ps(a);
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm512_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return _mm512_mask_add_epi32(src, mask, a, b);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32gather_epi32(from, mask, idx, base, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_add_ps(src, mask, a, b);
}
static void store(void *at, const fvec &a) {
_mm512_store_ps(at, a);
}
static void int_store(void *at, const ivec &a) {
_mm512_store_epi32(at, a);
}
static void mask_store(int *at, const bvec &a) {
for (int i = 0; i < 16; i++) {
at[i] = (a >> i) & 1;
}
}
static fvec min(const fvec &a, const fvec &b) {
return _mm512_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return mask & (1 << at);
}
static bool mask_testz(const bvec &mask) {
return mask == 0;
}
static bvec mask_enable_lower(int n) {
return 0xFFFF >> (VL - n);
}
static ivec int_load_vl(const int *a) {
return _mm512_load_epi32(a);
}
static void int_clear_arr(int *a) {
_mm512_store_epi32(a, ivec(0));
}
static void int_print(const ivec &a) {
iarr tmp;
_mm512_store_epi32(tmp, a);
for (int i = 0; i < 16; i++) printf("%d ", tmp[i]);
printf("\n");
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
*r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 20);
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
}
// Additional routines needed for the implementation of mixed precision
static fvec cvtdown(const vector_ops<double,KNC>::fvec &lo, const vector_ops<double,KNC>::fvec &hi) {
__m512 t1 = _mm512_cvtpd_pslo(lo);
__m512 t2 = _mm512_cvtpd_pslo(hi);
return _mm512_mask_permute4f128_ps(t1, 0xFF00, t2, _MM_PERM_BADC);
}
static vector_ops<double,KNC>::fvec cvtup_lo(const fvec &a) {
return _mm512_cvtpslo_pd(a);
}
static vector_ops<double,KNC>::fvec cvtup_hi(const fvec &a) {
return _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a, _MM_PERM_BADC)); // permute DCBA -> BADC
}
static void mask_cvtup(const bvec &a, vector_ops<double,KNC>::bvec *blo, vector_ops<double,KNC>::bvec *bhi) {
*blo = a & 0xFF;
*bhi = a >> 8;
}
};
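// Illustrative mixed-precision round trip (sketch only, for some vf::fvec x):
// split a float vector into two double vectors, compute in double precision,
// and convert back down.
//
//   typedef vector_ops<float, KNC>  vf;
//   typedef vector_ops<double, KNC> vd;
//   vd::fvec lo = vf::cvtup_lo(x);                // lower 8 floats -> doubles
//   vd::fvec hi = vf::cvtup_hi(x);                // upper 8 floats -> doubles
//   vf::fvec y  = vf::cvtdown(vd::invsqrt(lo), vd::invsqrt(hi));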
#endif
//////////////////////////////////////////////////////////////////////////////
// AVX/SSE
//////////////////////////////////////////////////////////////////////////////
#ifndef __MIC__
// Class definitions for integer vectors and masks under AVX.
// Note that we have to lower a number of operations to SSE, notably the
// comparison and integer operations.
// The gather operations are emulated.
struct ivec32x8 {
__m256i vec;
ivec32x8() {}
ivec32x8(__m256i m) { vec = m; }
ivec32x8(const int * a) {
vec = _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
}
explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) {
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) {
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_add_epi32(alo, blo);
__m128i rhi = _mm_add_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
};
struct avx_bvec {
__m256i vec;
avx_bvec() {}
avx_bvec(__m256i m) { vec = m; }
explicit avx_bvec(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
operator ivec32x8() const { return vec; }
friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) {
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) {
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); }
avx_bvec& operator &=(const avx_bvec &a) { return *this = _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(vec), _mm256_castsi256_pd(a))); }
};
template<>
struct vector_ops<double, AVX> {
static const int VL = 4;
typedef double fscal;
typedef F64vec4 fvec;
typedef ivec32x8 ivec;
typedef avx_bvec bvec;
typedef double farr[4] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
// newton-raphson
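// the ~12-bit estimate from _mm_rcp_ps is refined with two Newton-Raphson
// steps b <- b*(2 - a*b), written as b + b - a*b*b; each step roughly
// doubles the number of correct bits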
fvec b = _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a)));
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
farr result;
farr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_pd(reinterpret_cast<double*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i])
: src[i];
}
return _mm256_load_pd(reinterpret_cast<double*>(result));
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
iarr i, m;
int_store(i, idxs);
mask_store(m, mask);
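// i[] holds byte offsets; the division by 32 assumes sizeof(T) == 32, so
// each enabled lane loads one whole 32-byte {x,y,z,w} record with a single
// 256-bit load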
__m256d a0 = m[0] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[0]/32])) : _mm256_setzero_pd();
__m256d a1 = m[2] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[1]/32])) : _mm256_setzero_pd();
__m256d b0 = _mm256_unpacklo_pd(a0, a1);
__m256d b1 = _mm256_unpackhi_pd(a0, a1);
__m256d a2 = m[4] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[2]/32])) : _mm256_setzero_pd();
__m256d a3 = m[6] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[3]/32])) : _mm256_setzero_pd();
__m256d b2 = _mm256_unpacklo_pd(a2, a3);
__m256d b3 = _mm256_unpackhi_pd(a2, a3);
__m256d c0 = _mm256_permute2f128_pd(b0, b2, 0x20);
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
*x = blend(mask, *x, c0);
*y = blend(mask, *y, c1);
*z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0)));
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
iarr i, m;
_mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs);
mask_store(m, mask);
__m256d z = _mm256_setzero_pd();
const char * m0 = reinterpret_cast<const char*>(base) + 4 * i[0];
const char * m1 = reinterpret_cast<const char*>(base) + 4 * i[2];
const char * m2 = reinterpret_cast<const char*>(base) + 4 * i[4];
const char * m3 = reinterpret_cast<const char*>(base) + 4 * i[6];
const double * e0 = reinterpret_cast<const double*>(m[0] == 0 ? reinterpret_cast<const char*>(&z) : m0);
const double * e1 = reinterpret_cast<const double*>(m[2] == 0 ? reinterpret_cast<const char*>(&z) : m1);
const double * e2 = reinterpret_cast<const double*>(m[4] == 0 ? reinterpret_cast<const char*>(&z) : m2);
const double * e3 = reinterpret_cast<const double*>(m[6] == 0 ? reinterpret_cast<const char*>(&z) : m3);
__m256d a0 = _mm256_load_pd(e0);
__m256d a1 = _mm256_load_pd(e1);
__m256d b0 = _mm256_unpacklo_pd(a0, a1);
__m256d b1 = _mm256_unpackhi_pd(a0, a1);
__m256d a2 = _mm256_load_pd(e2);
__m256d a3 = _mm256_load_pd(e3);
__m256d b2 = _mm256_unpacklo_pd(a2, a3);
__m256d b3 = _mm256_unpackhi_pd(a2, a3);
__m256d c0 = _mm256_permute2f128_pd(b0, b2, 0x20);
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
*r0 = blend(mask, *r0, c0);
*r1 = blend(mask, *r1, c1);
*r2 = blend(mask, *r2, c2);
*r3 = blend(mask, *r3, c3);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_andnot_si128(_mm_cmpeq_epi32(alo, blo), _mm_cmpeq_epi8(alo, alo));
__m128i rhi = _mm_andnot_si128(_mm_cmpeq_epi32(ahi, bhi), _mm_cmpeq_epi8(alo, alo));
return _mm256_setr_m128i(rlo, rhi);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_cmplt_epi32(alo, blo);
__m128i rhi = _mm_cmplt_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_pd(reinterpret_cast<__m256d *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256d t1 = _mm256_hadd_pd(a, a);
__m128d t2 = _mm256_extractf128_pd(t1, 1);
__m128d t3 = _mm256_castpd256_pd128(t1);
return _mm_cvtsd_f64(_mm_add_pd(t2, t3));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_mullo_epi32(alo, blo);
__m128i rhi = _mm_mullo_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
for (int i = 0; i < VL; i++) {
int tmp;
if (mask_test_at(mask, i)) {
tmp = *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]);
} else {
tmp = src[2*i];
}
result[2*i] = tmp;
result[2*i+1] = tmp;
}
return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_pd(reinterpret_cast<double*>(at), a);
}
static void int_store(int *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
at[1] = at[2];
at[2] = at[4];
at[3] = at[6];
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[2*at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
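// lane indices are duplicated because the double-precision AVX path keeps
// each 32-bit integer pairwise per 64-bit lane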
static const int base[8] __attribute__((aligned(64))) = {0,0,1,1,2,2,3,3};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
__m128i c = _mm_unpacklo_epi32(b, b);
__m128i d = _mm_unpackhi_epi32(b, b);
return _mm256_setr_m128i(c, d);
// stolen from http://stackoverflow.com/questions/6687535/unexpected-result-from-avx-m256-unpack-ps-unpack-intrinsic
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
};
template<>
struct vector_ops<float, AVX> {
static const int VL = 8;
static const int ALIGN = 32;
typedef float fscal;
typedef F32vec8 fvec;
typedef ivec32x8 ivec;
typedef avx_bvec bvec;
typedef float farr[8] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
fvec b = _mm256_rcp_ps(a);
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
farr result;
farr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_ps(reinterpret_cast<float*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i];
}
return _mm256_load_ps(reinterpret_cast<float*>(result));
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
iarr i, m;
int_store(i, idxs);
mask_store(m, mask);
farr zero_mem;
store(zero_mem, zero());
const float *e0 = m[0] ? reinterpret_cast<const float*>(&base[i[0]/16]) : zero_mem;
const float *e1 = m[1] ? reinterpret_cast<const float*>(&base[i[1]/16]) : zero_mem;
const float *e2 = m[2] ? reinterpret_cast<const float*>(&base[i[2]/16]) : zero_mem;
const float *e3 = m[3] ? reinterpret_cast<const float*>(&base[i[3]/16]) : zero_mem;
const float *e4 = m[4] ? reinterpret_cast<const float*>(&base[i[4]/16]) : zero_mem;
const float *e5 = m[5] ? reinterpret_cast<const float*>(&base[i[5]/16]) : zero_mem;
const float *e6 = m[6] ? reinterpret_cast<const float*>(&base[i[6]/16]) : zero_mem;
const float *e7 = m[7] ? reinterpret_cast<const float*>(&base[i[7]/16]) : zero_mem;
__m256 a0 = _mm256_loadu2_m128(e4, e0);
__m256 a1 = _mm256_loadu2_m128(e5, e1);
__m256 b0 = _mm256_unpacklo_ps(a0, a1);
__m256 b1 = _mm256_unpackhi_ps(a0, a1);
__m256 a2 = _mm256_loadu2_m128(e6, e2);
__m256 a3 = _mm256_loadu2_m128(e7, e3);
__m256 b2 = _mm256_unpacklo_ps(a2, a3);
__m256 b3 = _mm256_unpackhi_ps(a2, a3);
__m256 c0 = _mm256_shuffle_ps(b0, b2, 0x44);
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
*x = blend(mask, *x, c0);
*y = blend(mask, *y, c1);
*z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm256_castps_si256(c3));
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
iarr i, m;
int_store(i, idxs);
mask_store(m, mask);
farr zero_mem;
store(zero_mem, zero());
const float *e0 = m[0] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[0]]) : zero_mem;
const float *e1 = m[1] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[1]]) : zero_mem;
const float *e2 = m[2] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[2]]) : zero_mem;
const float *e3 = m[3] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[3]]) : zero_mem;
const float *e4 = m[4] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[4]]) : zero_mem;
const float *e5 = m[5] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[5]]) : zero_mem;
const float *e6 = m[6] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[6]]) : zero_mem;
const float *e7 = m[7] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[7]]) : zero_mem;
__m256 a0 = _mm256_loadu2_m128(e4, e0);
__m256 a1 = _mm256_loadu2_m128(e5, e1);
__m256 b0 = _mm256_unpacklo_ps(a0, a1);
__m256 b1 = _mm256_unpackhi_ps(a0, a1);
__m256 a2 = _mm256_loadu2_m128(e6, e2);
__m256 a3 = _mm256_loadu2_m128(e7, e3);
__m256 b2 = _mm256_unpacklo_ps(a2, a3);
__m256 b3 = _mm256_unpackhi_ps(a2, a3);
__m256 c0 = _mm256_shuffle_ps(b0, b2, 0x44);
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
*r0 = blend(mask, *r0, c0);
*r1 = blend(mask, *r1, c1);
*r2 = blend(mask, *r2, c2);
*r3 = blend(mask, *r3, c3);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_andnot_si128(_mm_cmpeq_epi32(alo, blo), _mm_cmpeq_epi8(alo, alo));
__m128i rhi = _mm_andnot_si128(_mm_cmpeq_epi32(ahi, bhi), _mm_cmpeq_epi8(alo, alo));
return _mm256_setr_m128i(rlo, rhi);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_cmplt_epi32(alo, blo);
__m128i rhi = _mm_cmplt_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_ps(reinterpret_cast<__m256 *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256 t1 = _mm256_hadd_ps(a, a);
__m128 t2 = _mm256_extractf128_ps(t1, 1);
__m128 t3 = _mm256_castps256_ps128(t1);
__m128 t4 = _mm_add_ps(t2, t3);
__m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse
return _mm_cvtss_f32(_mm_add_ps(t4, t5));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_mullo_epi32(alo, blo);
__m128i rhi = _mm_mullo_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i];
}
return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_ps(reinterpret_cast<float*>(at), a);
}
static void int_store(void *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[8] __attribute__((aligned(64))) = {0,1,2,3,4,5,6,7};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
return _mm256_load_si256(reinterpret_cast<const __m256i*>(a));
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
static fvec cvtdown(const vector_ops<double,AVX>::fvec &lo, const vector_ops<double,AVX>::fvec &hi) {
__m128 t1 = _mm256_cvtpd_ps(lo);
__m128 t2 = _mm256_cvtpd_ps(hi);
return _mm256_setr_m128(t1, t2);
}
static vector_ops<double,AVX>::fvec cvtup_lo(const fvec &a) {
return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
}
static vector_ops<double,AVX>::fvec cvtup_hi(const fvec &a) {
return _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1)); // convert the upper 128-bit half
}
static void mask_cvtup(const bvec &a, vector_ops<double,AVX>::bvec *blo, vector_ops<double,AVX>::bvec *bhi) {
__m256i t1 = _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
__m256i t2 = _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
*blo = _mm256_permute2f128_si256(t1, t2, 0x20);
*bhi = _mm256_permute2f128_si256(t1, t2, 0x31);
}
};
// AVX2
//
struct avx2_ivec32 {
__m256i vec;
avx2_ivec32() {}
avx2_ivec32(__m256i m) { vec = m; }
avx2_ivec32(const int * a) {
vec = _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
}
explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) {
return _mm256_and_si256(a, b);
}
friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) {
return _mm256_or_si256(a, b);
}
friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) {
return _mm256_add_epi32(a, b);
}
};
struct avx2_bvec {
__m256i vec;
avx2_bvec() {}
avx2_bvec(__m256i m) { vec = m; }
explicit avx2_bvec(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
operator __m256d() const { return _mm256_castsi256_pd(vec); }
operator __m256() const { return _mm256_castsi256_ps(vec); }
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
operator avx2_ivec32() const { return vec; }
friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) {
return _mm256_and_si256(a, b);
}
friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) {
return _mm256_or_si256(a, b);
}
friend avx2_bvec operator ~(const avx2_bvec &a) {
return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF));
}
avx2_bvec& operator &=(const avx2_bvec &a) { return *this = _mm256_and_si256(vec,a); }
};
template<>
struct vector_ops<double, AVX2> {
static const int VL = 4;
typedef double fscal;
typedef F64vec4 fvec;
typedef avx2_ivec32 ivec;
typedef avx2_bvec bvec;
typedef double farr[4] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
// newton-raphson
fvec b = _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a)));
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
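// compact the pairwise-duplicated indices [a,a,b,b,c,c,d,d] into [a,b,c,d]
// in the low 128 bits, as required by the four-lane double gather below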
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
return _mm256_mask_i32gather_pd(from, static_cast<const double*>(base), _mm256_castsi256_si128(idx1), mask, scale);
}
template<class T>
static void gather_x(const ivec &idx, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
*x = _mm256_mask_i32gather_pd(*x, &base->x, _mm256_castsi256_si128(idx1), mask, 1);
*y = _mm256_mask_i32gather_pd(*y, &base->y, _mm256_castsi256_si128(idx1), mask, 1);
*z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1);
*w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
}
static void gather_4(const ivec &idx, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
*r0 = _mm256_mask_i32gather_pd(*r0, static_cast<const double*>(base) + 0, _mm256_castsi256_si128(idx1), mask, 4);
*r1 = _mm256_mask_i32gather_pd(*r1, static_cast<const double*>(base) + 1, _mm256_castsi256_si128(idx1), mask, 4);
*r2 = _mm256_mask_i32gather_pd(*r2, static_cast<const double*>(base) + 2, _mm256_castsi256_si128(idx1), mask, 4);
*r3 = _mm256_mask_i32gather_pd(*r3, static_cast<const double*>(base) + 3, _mm256_castsi256_si128(idx1), mask, 4);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return ~bvec(_mm256_cmpeq_epi32(a, b));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm256_cmpgt_epi32(b, a);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_pd(reinterpret_cast<__m256d *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256d t1 = _mm256_hadd_pd(a, a);
__m128d t2 = _mm256_extractf128_pd(t1, 1);
__m128d t3 = _mm256_castpd256_pd128(t1);
return _mm_cvtsd_f64(_mm_add_pd(t2, t3));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm256_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm256_mask_i32gather_epi32(from, static_cast<const int*>(base), idx, mask, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_pd(reinterpret_cast<double*>(at), a);
}
static void int_store(int *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
at[1] = at[2];
at[2] = at[4];
at[3] = at[6];
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[2*at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[8] __attribute__((aligned(64))) = {0,0,1,1,2,2,3,3};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
__m128i c = _mm_unpacklo_epi32(b, b);
__m128i d = _mm_unpackhi_epi32(b, b);
return _mm256_setr_m128i(c, d);
// stolen from http://stackoverflow.com/questions/6687535/unexpected-result-from-avx-m256-unpack-ps-unpack-intrinsic
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
};
template<>
struct vector_ops<float, AVX2> {
static const int VL = 8;
static const int ALIGN = 32;
typedef float fscal;
typedef F32vec8 fvec;
typedef avx2_ivec32 ivec;
typedef avx2_bvec bvec;
typedef float farr[8] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
fvec b = _mm256_rcp_ps(a);
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm256_mask_i32gather_ps(from, static_cast<const float*>(base), idx, mask, scale);
}
template<class T>
static void gather_x(const ivec &idx, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = _mm256_mask_i32gather_ps(*x, reinterpret_cast<const float*>(base) + 0, idx, mask, 1);
*y = _mm256_mask_i32gather_ps(*y, reinterpret_cast<const float*>(base) + 1, idx, mask, 1);
*z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1);
*w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
*r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 20);
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return ~bvec(_mm256_cmpeq_epi32(a, b));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm256_cmpgt_epi32(b, a);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_ps(reinterpret_cast<__m256 *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256 t1 = _mm256_hadd_ps(a, a);
__m128 t2 = _mm256_extractf128_ps(t1, 1);
__m128 t3 = _mm256_castps256_ps128(t1);
__m128 t4 = _mm_add_ps(t2, t3);
__m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse
return _mm_cvtss_f32(_mm_add_ps(t4, t5));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm256_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm256_mask_i32gather_epi32(from, static_cast<const int*>(base), idx, mask, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_ps(reinterpret_cast<float*>(at), a);
}
static void int_store(void *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[8] __attribute__((aligned(64))) = {0,1,2,3,4,5,6,7};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
return _mm256_load_si256(reinterpret_cast<const __m256i*>(a));
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
static fvec cvtdown(const vector_ops<double,AVX2>::fvec &lo, const vector_ops<double,AVX2>::fvec &hi) {
__m128 t1 = _mm256_cvtpd_ps(lo);
__m128 t2 = _mm256_cvtpd_ps(hi);
return _mm256_setr_m128(t1, t2);
}
static vector_ops<double,AVX2>::fvec cvtup_lo(const fvec &a) {
return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
}
static vector_ops<double,AVX2>::fvec cvtup_hi(const fvec &a) {
return _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1)); // convert the upper 128-bit half
}
static void mask_cvtup(const bvec &a, vector_ops<double,AVX2>::bvec *blo, vector_ops<double,AVX2>::bvec *bhi) {
__m256i t1 = _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
__m256i t2 = _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
*blo = _mm256_permute2f128_si256(t1, t2, 0x20);
*bhi = _mm256_permute2f128_si256(t1, t2, 0x31);
}
};
//////////////////////////////////////////////////////////////////////////////
// SSE
//////////////////////////////////////////////////////////////////////////////
#pragma pack(push,16)
struct ivec32x4 {
__m128i vec;
ivec32x4() {}
ivec32x4(__m128i m) { vec = m; }
ivec32x4(const int * a) {
vec = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
}
explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); }
operator __m128i() const { return vec; }
friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) {
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) {
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) {
return _mm_add_epi32(a, b);
}
};
struct sse_bvecx4 {
__m128i vec;
sse_bvecx4() {}
sse_bvecx4(__m128i m) { vec = m; }
explicit sse_bvecx4(int i) { vec = _mm_set1_epi32(i); }
operator __m128i() const { return vec; }
operator F64vec2() const { return _mm_castsi128_pd(vec); }
operator ivec32x4() const { return vec; }
friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) {
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) {
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend sse_bvecx4 operator ~(const sse_bvecx4 &a) { return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF)))); }
sse_bvecx4& operator &=(const sse_bvecx4 &a) { return *this = _mm_and_si128(vec,a); }
};
#pragma pack(pop)
template<>
struct vector_ops<double, SSE> {
static const int VL = 2;
typedef double fscal;
typedef F64vec2 fvec;
typedef ivec32x4 ivec;
typedef sse_bvecx4 bvec;
typedef double farr[2] __attribute__((aligned(16)));
typedef int iarr[4] __attribute__((aligned(16)));
static fvec recip(const fvec &a) {
fvec b = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(a)));
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
fvec res = from;
if (_mm_extract_epi32(mask, 0)) {
res = _mm_loadl_pd(res, reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale*_mm_extract_epi32(idx, 0)));
}
if (_mm_extract_epi32(mask, 2)) {
res = _mm_loadh_pd(res, reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale*_mm_extract_epi32(idx, 2)));
}
return res;
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
__m128d a0lo, a0hi, a1lo, a1hi;
if (_mm_extract_epi32(mask, 0)) {
a0lo = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 0)/32]));
a0hi = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 0)/32].z));
}
if (_mm_extract_epi32(mask, 2)) {
a1lo = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 2)/32]));
a1hi = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 2)/32].z));
}
__m128d c0 = _mm_unpacklo_pd(a0lo, a1lo);
__m128d c1 = _mm_unpackhi_pd(a0lo, a1lo);
__m128d c2 = _mm_unpacklo_pd(a0hi, a1hi);
__m128d c3 = _mm_unpackhi_pd(a0hi, a1hi);
*x = blend(mask, *x, c0);
*y = blend(mask, *y, c1);
*z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0));
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 8);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 16);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 24);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & _mm_castsi128_pd(mask)) | _mm_andnot_pd(_mm_castsi128_pd(mask), a);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | _mm_andnot_si128(mask, a);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmpeq_pd(a, b));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmpnle_pd(a, b));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmple_pd(a, b));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmplt_pd(a, b));
}
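// int_cmpneq: _mm_cmpeq_epi8(we, we) yields all-ones regardless of the
// (undefined) contents of we; andnot with the equality mask gives "not equal".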
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i we = _mm_undefined_si128();
return _mm_andnot_si128(_mm_cmpeq_epi32(a, b), _mm_cmpeq_epi8(we, we));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm_cmplt_epi32(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm_sincos_pd(reinterpret_cast<__m128d *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m128d t1 = _mm_shuffle_pd(a, a, 1); // reverse vector
return _mm_cvtsd_f64(_mm_add_pd(t1, a));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | _mm_andnot_si128(mask, src);
}
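// int_gather: scalar fallback; each fetched value is written to both halves
// of its paired 32-bit lanes to preserve the duplicated integer layout.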
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs, m;
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
_mm_store_si128(reinterpret_cast<__m128i*>(src), from);
_mm_store_si128(reinterpret_cast<__m128i*>(m), mask);
for (int i = 0; i < VL; i++) {
int tmp;
if (m[2*i]) {
tmp = *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]);
} else {
tmp = src[2*i];
}
result[2*i] = tmp;
result[2*i+1] = tmp;
}
return _mm_load_si128(reinterpret_cast<__m128i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & _mm_castsi128_pd(mask)) | _mm_andnot_pd(_mm_castsi128_pd(mask), src);
}
static void store(void *at, const fvec &a) {
_mm_store_pd(reinterpret_cast<double*>(at), a);
}
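// int_store: the vector holds each logical value twice; after the store,
// at[1] = at[2] compacts the two logical values into at[0] and at[1].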
static void int_store(int *at, const ivec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
at[1] = at[2];
}
static void mask_store(int *at, const bvec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[2*at];
}
static bool mask_testz(const bvec &mask) {
return _mm_testz_si128(mask, mask);
}
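// mask_enable_lower: the lane indices {0,0,1,1} follow the duplicated layout;
// comparing them against n enables the first n logical lanes.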
static bvec mask_enable_lower(int n) {
static const int base[4] __attribute__((aligned(32))) = {0,0,1,1};
return int_cmplt(ivec(base), ivec(n));
}
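// int_load_vl: duplicate each of the two lower integers into its paired
// 32-bit lanes (unpacklo of the loaded vector with itself).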
static ivec int_load_vl(const int *a) {
__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
return _mm_unpacklo_epi32(b, b);
}
static void int_clear_arr(int *a) {
_mm_store_si128(reinterpret_cast<__m128i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm_store_si128(reinterpret_cast<__m128i *>(tmp), a);
for (int i = 0; i < 4; i++) printf("%d ", tmp[i]);
printf("\n");
}
};
template<>
struct vector_ops<float, SSE> {
static const int VL = 4;
static const int ALIGN = 16;
typedef float fscal;
typedef F32vec4 fvec;
typedef ivec32x4 ivec;
typedef sse_bvecx4 bvec;
typedef float farr[4] __attribute__((aligned(16)));
typedef int iarr[4] __attribute__((aligned(16)));
static fvec recip(const fvec &a) {
fvec b = _mm_rcp_ps(a);
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
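// gather: no SSE gather instruction; spill mask, indices and source to
// aligned arrays and gather element by element.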
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
farr result;
farr src;
iarr idxs, m;
mask_store(m, mask);
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
_mm_store_ps(reinterpret_cast<float*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = m[i]
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i];
}
return _mm_load_ps(reinterpret_cast<float*>(result));
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 12);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & _mm_castsi128_ps(mask)) | _mm_andnot_ps(_mm_castsi128_ps(mask), a);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | _mm_andnot_si128(mask, a);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmpeq_ps(a, b));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmpnle_ps(a, b));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmple_ps(a, b));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmplt_ps(a, b));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i we = _mm_undefined_si128();
return _mm_andnot_si128(_mm_cmpeq_epi32(a, b), _mm_cmpeq_epi8(we, we));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm_cmplt_epi32(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm_sincos_ps(reinterpret_cast<__m128 *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m128 t1 = _mm_hadd_ps(a, a);
__m128 t2 = _mm_shuffle_ps(t1, t1, 0x1B); // reverse
return _mm_cvtss_f32(_mm_add_ps(t1, t2));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | _mm_andnot_si128(mask, src);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs, m;
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
_mm_store_si128(reinterpret_cast<__m128i*>(src), from);
_mm_store_si128(reinterpret_cast<__m128i*>(m), mask);
for (int i = 0; i < VL; i++) {
int tmp;
if (m[i]) {
tmp = *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i]);
} else {
tmp = src[i];
}
result[i] = tmp;
}
return _mm_load_si128(reinterpret_cast<__m128i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & _mm_castsi128_ps(mask)) | _mm_andnot_ps(_mm_castsi128_ps(mask), src);
}
static void store(void *at, const fvec &a) {
_mm_store_ps(reinterpret_cast<float*>(at), a);
}
static void int_store(int *at, const ivec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
}
static void mask_store(int *at, const bvec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[at];
}
static bool mask_testz(const bvec &mask) {
return _mm_testz_si128(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[4] __attribute__((aligned(32))) = {0,1,2,3};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(a));
}
static void int_clear_arr(int *a) {
_mm_store_si128(reinterpret_cast<__m128i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm_store_si128(reinterpret_cast<__m128i *>(tmp), a);
for (int i = 0; i < 4; i++) printf("%d ", tmp[i]);
printf("\n");
}
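// The cvtdown/cvtup_*/mask_cvtup helpers convert between the 4-lane single
// precision and 2-lane double precision representations; they are used by
// AccumulatorTwiceMixin to accumulate in double while computing in single.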
static fvec cvtdown(const vector_ops<double,SSE>::fvec &lo, const vector_ops<double,SSE>::fvec &hi) {
__m128 t1 = _mm_cvtpd_ps(lo);
__m128 t2 = _mm_cvtpd_ps(hi);
return _mm_shuffle_ps(t1, t2, 0x44); // t2[1]t2[0]t1[1]t1[0]
}
static vector_ops<double,SSE>::fvec cvtup_lo(const fvec &a) {
return _mm_cvtps_pd(a);
}
static vector_ops<double,SSE>::fvec cvtup_hi(const fvec &a) {
return _mm_cvtps_pd(_mm_shuffle_ps(a, a, 0x4E)); // permute DCBA -> BADC
}
static void mask_cvtup(const bvec &a, vector_ops<double,SSE>::bvec *blo, vector_ops<double,SSE>::bvec *bhi) {
*blo = _mm_unpacklo_epi32(a, a);
*bhi = _mm_unpackhi_epi32(a, a);
}
};
#endif
// Scalar implementation
template<class flt_t>
struct vector_ops<flt_t, NONE> {
static const int VL = 1;
static const int ALIGN = 4;
typedef flt_t fscal;
typedef flt_t fvec;
typedef int ivec;
typedef bool bvec;
typedef flt_t farr[1];
typedef int iarr[1];
static fvec recip(const fvec &a) {
return ((flt_t) 1.) / a;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
return mask ? *reinterpret_cast<const flt_t*>(reinterpret_cast<const char*>(base) + scale * idx) : from;
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 2 * sizeof(fscal));
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 3 * sizeof(fscal));
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return mask ? b : a;
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return mask ? b : a;
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return 0.;
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return a == b;
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return !(a <= b);
}
static bvec cmple(const fvec &a, const fvec &b) {
return a <= b;
}
static bvec cmplt(const fvec &a, const fvec &b) {
return a < b;
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return a != b;
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return a < b;
}
static fvec invsqrt(const fvec &a) {
return 1. / sqrt(a);
}
static fvec sincos(fvec *c, const fvec &a) {
*c = cos(a);
return sin(a);
}
static fscal reduce_add(const fvec &a) {
return a;
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return a * b;
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return mask ? a + b : src;
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return mask ? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idx) : from;
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return mask ? a + b : src;
}
static void store(void *at, const fvec &a) {
*reinterpret_cast<flt_t*>(at) = a;
}
static void int_store(int *at, const ivec &a) {
*reinterpret_cast<int*>(at) = a;
}
static void mask_store(int *at, const bvec &a) {
*at = a;
}
static fvec min(const fvec &a, const fvec &b) {
return a < b ? a : b;
}
static bool mask_test_at(const bvec &mask, int at) {
return mask;
}
static bool mask_testz(const bvec &mask) {
return ! mask;
}
static bvec mask_enable_lower(int n) {
return n > 0;
}
static ivec int_load_vl(const int *a) {
return *a;
}
static void int_clear_arr(int *a) {
*a = 0;
}
static bvec full_mask() {
return true;
}
static void int_print(const ivec &a) {
}
};
// Array notation implementation
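// Builds on the lmp_intel_an_fvec/ivec/bvec wrapper classes defined earlier
// in this file; VL is fixed at 4 here.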
template<class flt_t>
struct vector_ops<flt_t, AN> {
static const int VL = 4;
typedef flt_t fscal;
typedef lmp_intel_an_fvec<VL, fscal> fvec;
typedef lmp_intel_an_ivec<VL> ivec;
typedef lmp_intel_an_bvec<VL> bvec;
typedef flt_t farr[VL];
typedef int iarr[VL];
static fvec recip(const fvec &a) {
fvec ret; ret.data[:] = ((fscal)1.) / a.data[:]; return ret;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
fvec ret = from;
if (mask.data[:]) ret.data[:] = *reinterpret_cast<const fscal *>(reinterpret_cast<const char*>(base) + scale * idx.data[:]);
return ret;
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 2 * sizeof(fscal));
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 3 * sizeof(fscal));
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
fvec ret = a;
if (mask.data[:]) ret.data[:] = b.data[:];
return ret;
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
ivec ret = a;
if (mask.data[:]) ret.data[:] = b.data[:];
return ret;
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
fvec ret; ret.data[:] = a.data[:] * b.data[:] + c.data[:]; return ret;
}
static fvec zero() {
return fvec(0.);
}
static bvec cmpeq(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = a.data[:] == b.data[:]; return ret;
}
static bvec cmpnle(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = !(a.data[:] <= b.data[:]); return ret;
}
static bvec cmple(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = a.data[:] <= b.data[:]; return ret;
}
static bvec cmplt(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = a.data[:] < b.data[:]; return ret;
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
bvec ret; ret.data[:] = a.data[:] != b.data[:]; return ret;
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
bvec ret; ret.data[:] = a.data[:] < b.data[:]; return ret;
}
static fvec invsqrt(const fvec &a) {
fvec ret; ret.data[:] = ((fscal)1.) / sqrt(a.data[:]); return ret;
}
static fvec sincos(fvec *c, const fvec &a) {
c->data[:] = cos(a.data[:]);
fvec ret; ret.data[:] = sin(a.data[:]); return ret;
}
static fscal reduce_add(const fvec &a) {
return __sec_reduce_add(a.data[:]);
}
static ivec int_mullo(const ivec &a, const ivec &b) {
ivec ret; ret.data[:] = a.data[:] * b.data[:]; return ret;
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
ivec ret = src;
if (mask.data[:]) ret.data[:] = a.data[:] + b.data[:];
return ret;
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
ivec ret = from;
if (mask.data[:]) ret.data[:] = reinterpret_cast<const int*>(base)[scale * idx.data[:] / sizeof(int)];
return ret;
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
fvec ret = src;
if (mask.data[:]) ret.data[:] = a.data[:] + b.data[:];
return ret;
}
static void store(void *at, const fvec &a) {
reinterpret_cast<fscal*>(at)[0:VL] = a.data[:];
}
static void int_store(int *at, const ivec &a) {
reinterpret_cast<int*>(at)[0:VL] = a.data[:];
}
static void mask_store(int *at, const bvec &a) {
at[0:VL] = a.data[:];
}
static fvec min(const fvec &a, const fvec &b) {
fvec ret = b;
if (a.data[:] < b.data[:]) ret.data[:] = a.data[:];
return ret;
}
static bool mask_test_at(const bvec &mask, int at) {
return mask.data[at];
}
static bool mask_testz(const bvec &mask) {
return ! __sec_reduce_or(mask.data[:]);
}
static bvec mask_enable_lower(int n) {
bvec ret; ret.data[:] = __sec_implicit_index(0) < n; return ret;
}
static ivec int_load_vl(const int *a) {
return ivec(a);
}
static void int_clear_arr(int *a) {
a[0:VL] = 0;
}
static bvec full_mask() {
return bvec(1);
}
static void int_print(const ivec &a) {
}
};
// Mixins to implement mixed precision and single/single and double/double
// This one is for single/single and double/double
template<class BASE_flt_t, CalculationMode BASE_mic>
struct AccumulatorSameMixin {
typedef vector_ops<BASE_flt_t, BASE_mic> BASE;
typedef typename BASE::fvec avec;
typedef typename BASE::farr aarr;
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
return BASE::mask_add(src, m, a, b);
}
static typename BASE::fscal acc_reduce_add(const avec &a) {
return BASE::reduce_add(a);
}
static avec acc_zero() {
return BASE::zero();
}
static void acc_store(aarr mem, const avec &a) {
BASE::store(mem, a);
}
};
// Mixed precision for cases where double vectors contain fewer elements
template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
struct AccumulatorTwiceMixin {
typedef vector_ops<BASE_flt_t, mic> BASE;
typedef vector_ops<HIGH_flt_t, mic> HIGH;
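// avec_t holds one accumulator as two double-precision halves (lo/hi), so a
// full single-precision vector can be accumulated without losing precision.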
struct avec_t {
typename HIGH::fvec lo, hi;
avec_t(const typename HIGH::fvec &alo, const typename HIGH::fvec &ahi) : lo(alo), hi(ahi) {}
avec_t(const typename BASE::fvec &a) {
lo = BASE::cvtup_lo(a);
hi = BASE::cvtup_hi(a);
}
friend avec_t operator +(const avec_t &a, const avec_t &b) {
return avec_t(a.lo + b.lo, a.hi + b.hi);
}
friend avec_t operator -(const avec_t &a, const avec_t &b) {
return avec_t(a.lo - b.lo, a.hi - b.hi);
}
friend avec_t operator *(const avec_t &a, const avec_t &b) {
return avec_t(a.lo * b.lo, a.hi * b.hi);
}
operator typename BASE::fvec() const {
return BASE::cvtdown(lo, hi);
}
};
typedef avec_t avec;
typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN)));
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
typename HIGH::fvec blo = BASE::cvtup_lo(b);
typename HIGH::fvec bhi = BASE::cvtup_hi(b);
typename HIGH::bvec mlo, mhi;
BASE::mask_cvtup(m, &mlo, &mhi);
return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi));
}
static typename HIGH::fscal acc_reduce_add(const avec &a) {
return HIGH::reduce_add(a.lo + a.hi);
}
static avec acc_zero() {
return avec(HIGH::zero(), HIGH::zero());
}
static void acc_store(aarr mem, const avec &a) {
HIGH::store(mem, a.lo);
HIGH::store(mem + BASE::VL / 2, a.hi);
}
};
// For cases where vector_ops<float,x>::VL == vector_ops<double,x>::VL
// i.e. scalar & AN
template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
struct AccumulatorTwiceMixinNone {
typedef vector_ops<BASE_flt_t, mic> BASE;
typedef vector_ops<HIGH_flt_t, mic> HIGH;
typedef typename HIGH::fvec avec;
typedef typename HIGH::fscal aarr[BASE::VL];
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
return HIGH::mask_add(src, m, a, static_cast<typename HIGH::fvec>(b));
}
static typename HIGH::fscal acc_reduce_add(const avec &a) {
return HIGH::reduce_add(a);
}
static avec acc_zero() {
return HIGH::zero();
}
static void acc_store(aarr mem, const avec &a) {
HIGH::store(mem, a);
}
};
// This is the interface that user code ultimately sees.
template<class flt_t, class acc_t, CalculationMode mic>
struct vector_routines {};
template<CalculationMode mic>
struct vector_routines<double,double,mic> : public vector_ops<double, mic>, public AccumulatorSameMixin<double, mic> {};
template<CalculationMode mic>
struct vector_routines<float,float,mic> : public vector_ops<float, mic>, public AccumulatorSameMixin<float, mic> {};
template<CalculationMode mic>
struct vector_routines<float,double,mic> : public vector_ops<float, mic>, public AccumulatorTwiceMixin<float,double, mic> {};
// Specialize for AN and scalar
template<>
struct vector_routines<float,double,NONE> : public vector_ops<float, NONE>, public AccumulatorTwiceMixinNone<float,double, NONE> {};
template<>
struct vector_routines<float,double,AN> : public vector_ops<float, AN>, public AccumulatorTwiceMixinNone<float,double, AN> {};
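// Usage sketch (illustrative only, not part of the interface): a kernel picks
// its routines by instantiating vector_routines with the desired precision
// pair and CalculationMode, e.g. for mixed single/double precision with SSE:
//
//   using namespace lmp_intel;
//   typedef vector_routines<float, double, SSE> v;
//   v::fvec x   = v::zero();                            // computation vector
//   v::avec acc = v::acc_zero();                        // accumulation vector
//   acc = v::acc_mask_add(acc, v::full_mask(), acc, x); // masked accumulate
//   double e    = v::acc_reduce_add(acc);               // horizontal sum
//
// The scalar NONE variant can serve as a reference implementation when
// debugging the vectorized paths.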
} // namespace lmp_intel