/* -*- c++ -*- -----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */
// This file provides an intrinsics abstraction that allows access to the
// underlying SIMD registers.
// It allows for algorithms that are templated over the vector length in use.
// The final interface is provided by vector_routines, which adds support
// for the different precision modes.
// The vector_ops interface provides routines specific to one floating point
// data type, and is specialized for various architectures.
// The routines work best with AVX-512 and AVX2, as both support the gather
// instructions.
// For both AVX and SSE we miss some optimization opportunities in the gather
// implementations.
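// As an illustrative sketch only (not part of the interface itself), a kernel
// templated on vector_ops might accumulate a masked constant and reduce it as
// below; all names exist in this file, only the usage is hypothetical:
//
//   typedef lmp_intel::vector_ops<double, lmp_intel::mode> v;
//   v::fvec acc = v::zero();
//   v::bvec m = v::mask_enable_lower(3);          // enable the first 3 lanes
//   acc = v::mask_add(acc, m, acc, v::fvec(1.));  // add 1.0 where enabled
//   double total = v::reduce_add(acc);            // == 3.0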
// Vector classes provided with the intel compiler
#if defined(__MIC__) && !defined(__AVX512F__)
#include <mic/micvec.h>
#else
#include <dvec.h> // icc-mmic hates generating movq
#include <fvec.h>
#endif
// Vector classes for Cilk array notation
// This is experimental and doesn't yield good code yet
template<int VL, typename fscal>
struct lmp_intel_an_fvec {
fscal data[VL];
lmp_intel_an_fvec() {}
explicit lmp_intel_an_fvec(const fscal f) { data[:] = f; }
explicit lmp_intel_an_fvec(fscal f[VL]) { data[:] = f[:]; }
lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; }
lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; }
const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] += b.data[:];
return ret;
}
const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] -= b.data[:];
return ret;
}
const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] *= b.data[:];
return ret;
}
const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const {
lmp_intel_an_fvec ret = *this;
ret.data[:] /= b.data[:];
return ret;
}
lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) {
data[:] += b.data[:]; return *this;
}
lmp_intel_an_fvec& operator -=(const lmp_intel_an_fvec &b) {
data[:] -= b.data[:]; return *this;
}
lmp_intel_an_fvec& operator *=(const lmp_intel_an_fvec &b) {
data[:] *= b.data[:]; return *this;
}
lmp_intel_an_fvec& operator /=(const lmp_intel_an_fvec &b) {
data[:] /= b.data[:]; return *this;
}
friend lmp_intel_an_fvec sqrt(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = sqrt(a.data[:]); return ret;
}
friend lmp_intel_an_fvec exp(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = exp(a.data[:]); return ret;
}
friend lmp_intel_an_fvec sin(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = sin(a.data[:]); return ret;
}
friend lmp_intel_an_fvec invsqrt(const lmp_intel_an_fvec &a) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = ((fscal)1.) / sqrt(a.data[:]); return ret;
}
friend lmp_intel_an_fvec pow(const lmp_intel_an_fvec &a, const lmp_intel_an_fvec &b) __attribute__((always_inline)) {
lmp_intel_an_fvec ret; ret.data[:] = pow(a.data[:], b.data[:]); return ret;
}
lmp_intel_an_fvec operator - () const {
lmp_intel_an_fvec ret; ret.data[:] = - data[:]; return ret;
}
};
template<int VL>
struct lmp_intel_an_ivec {
int data[VL];
lmp_intel_an_ivec() {}
explicit lmp_intel_an_ivec(int i) { data[:] = i; }
explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; }
const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this;
ret.data[:] &= b.data[:];
return ret;
}
const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this;
ret.data[:] |= b.data[:];
return ret;
}
const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) {
lmp_intel_an_ivec ret = *this;
ret.data[:] += b.data[:];
return ret;
}
};
template<int VL>
struct lmp_intel_an_bvec {
bool data[VL];
lmp_intel_an_bvec() {}
lmp_intel_an_bvec(const lmp_intel_an_bvec &a) { data[:] = a.data[:]; }
lmp_intel_an_bvec& operator =(const lmp_intel_an_bvec &a) { data[:] = a.data[:]; return *this; }
explicit lmp_intel_an_bvec(int i) { data[:] = i; }
friend lmp_intel_an_bvec operator &(const lmp_intel_an_bvec &a, const lmp_intel_an_bvec &b) __attribute__((always_inline)) {
lmp_intel_an_bvec ret; ret.data[:] = a.data[:] & b.data[:]; return ret;
}
friend lmp_intel_an_bvec operator |(const lmp_intel_an_bvec &a, const lmp_intel_an_bvec &b) __attribute__((always_inline)) {
lmp_intel_an_bvec ret; ret.data[:] = a.data[:] | b.data[:]; return ret;
}
friend lmp_intel_an_bvec operator ~(const lmp_intel_an_bvec &a) __attribute__((always_inline)) {
lmp_intel_an_bvec ret; ret.data[:] = ! a.data[:]; return ret;
}
lmp_intel_an_bvec& operator &=(const lmp_intel_an_bvec &a) __attribute__((always_inline)) {
data[:] &= a.data[:]; return *this;
}
};
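// Illustrative sketch of the array-notation classes (requires icc with Cilk
// array notation; the values are hypothetical):
//
//   lmp_intel_an_fvec<8, double> a(1.), b(4.);
//   a += b;                                       // per-lane add, all lanes 5.0
//   lmp_intel_an_fvec<8, double> r = invsqrt(b);  // all lanes 0.5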
namespace lmp_intel {
// Mostly self-explanatory: KNC=IMCI and AVX-512, NONE=Scalar, AN=Array Notation.
enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN };
#ifdef __MIC__
#ifdef LMP_INTEL_VECTOR_MIC
static const CalculationMode mode = LMP_INTEL_VECTOR_MIC;
#else
static const CalculationMode mode = KNC;
#endif
#else
#ifdef LMP_INTEL_VECTOR_HOST
static const CalculationMode mode = LMP_INTEL_VECTOR_HOST;
#else
#ifdef __AVX512F__
static const CalculationMode mode = KNC;
#else
#ifdef __AVX2__
static const CalculationMode mode = AVX2;
#else
#ifdef __AVX__
static const CalculationMode mode = AVX;
#else
static const CalculationMode mode = SSE;
#endif
#endif
#endif
#endif
#endif
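// The defaults above can be overridden at compile time; e.g. building with
// -DLMP_INTEL_VECTOR_HOST=AVX forces the AVX code path on the host, and
// -DLMP_INTEL_VECTOR_MIC does the same for the coprocessor build.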
// This is used in the selection logic
template<CalculationMode mode>
struct vector_traits {
static const bool support_integer_and_gather_ops = true;
};
template<>
struct vector_traits<AVX> {
static const bool support_integer_and_gather_ops = false;
};
// This is the base template for all the different architectures
// It will get specialized
template<class flt_t, CalculationMode mode>
struct vector_ops {};
// Intrinsic routines for IMCI and AVX-512
#if defined(__MIC__) || defined(__AVX512F__)
// Integer vector class
#pragma pack(push,64)
struct ivec32x16 {
__m512i vec;
ivec32x16() {}
ivec32x16(__m512i m) { vec = m; }
ivec32x16(const int * a) {
vec = _mm512_load_epi32(reinterpret_cast<const int *>(a));
}
explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); }
operator __m512i() const { return vec; }
friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_and_epi32(a, b);
}
friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_or_epi32(a, b);
}
friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) {
return _mm512_add_epi32(a, b);
}
};
#pragma pack(pop)
// Double precision routines
template<>
struct vector_ops<double, KNC> {
static const int VL = 8;
typedef double fscal;
typedef F64vec8 fvec;
typedef ivec32x16 ivec;
typedef __mmask bvec;
typedef double farr[8] __attribute__((aligned(64)));
typedef int iarr[16] __attribute__((aligned(64)));
static fvec recip(const fvec &a) { return _mm512_recip_pd(a); }
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
_mm512_mask_prefetch_i32gather_ps(idx, mask, base, scale, _MM_HINT_T0);
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32logather_pd(from, mask, idx, base, scale);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_blend_pd(mask, a, b);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return _mm512_fmadd_pd(a, b, c);
}
static fvec zero() {
return _mm512_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_NLE_US);
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_LE_OS);
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm512_cmp_pd_mask(a, b, _CMP_LT_OS);
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return _mm512_cmpneq_epi32_mask(a, b);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm512_cmplt_epi32_mask(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm512_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
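// _mm512_sincos_pd is only available with icc >= 15.0; otherwise compute
// sin and cos separately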
#if __INTEL_COMPILER+0 < 1500
*reinterpret_cast<__m512d *>(cos) = _mm512_cos_pd(a);
return _mm512_sin_pd(a);
#else
return _mm512_sincos_pd(reinterpret_cast<__m512d *>(cos), a);
#endif
}
static fscal reduce_add(const fvec &a) {
return _mm512_reduce_add_pd(a);
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm512_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return _mm512_mask_add_epi32(src, mask, a, b);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32gather_epi32(from, mask, idx, base, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_add_pd(src, mask, a, b);
}
static void store(void *at, const fvec &a) {
_mm512_store_pd(at, a);
}
static void int_store(void *at, const ivec &a) {
_mm512_store_epi32(at, a);
}
static void mask_store(int *at, const bvec &a) {
for (int i = 0; i < 8; i++) {
at[i] = (a >> i) & 1;
}
}
static fvec min(const fvec &a, const fvec &b) {
return _mm512_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return mask & (1 << at);
}
static bool mask_testz(const bvec &mask) {
return mask == 0;
}
static bvec mask_enable_lower(int n) {
return 0xFF >> (VL - n);
}
static ivec int_load_vl(const int *a) {
return _mm512_load_epi32(a);
}
static void int_clear_arr(int *a) {
_mm512_store_epi32(a, ivec(0));
}
static void int_print(const ivec &a) {
iarr tmp;
_mm512_store_epi32(tmp, a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 32);
*r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 40);
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 24);
}
};
template<>
struct vector_ops<float, KNC> {
static const int VL = 16;
static const int ALIGN = 64;
typedef float fscal;
typedef F32vec16 fvec;
typedef ivec32x16 ivec;
typedef __mmask bvec;
typedef float farr[16] __attribute__((aligned(64)));
typedef int iarr[16] __attribute__((aligned(64)));
static const bvec full_mask = 0xFFFF;
static fvec recip(const fvec &a) { return _mm512_recip_ps(a); }
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
_mm512_mask_prefetch_i32gather_ps(idx, mask, base, scale, _MM_HINT_T0);
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32gather_ps(from, mask, idx, base, scale);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_blend_ps(mask, a, b);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return _mm512_fmadd_ps(a, b, c);
}
static fvec zero() {
return _mm512_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm512_cmpeq_ps_mask(a, b);
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm512_cmpnle_ps_mask(a, b);
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm512_cmple_ps_mask(a, b);
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm512_cmplt_ps_mask(a, b);
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return _mm512_cmpneq_epi32_mask(a, b);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm512_cmplt_epi32_mask(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm512_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
#if __INTEL_COMPILER+0 < 1500
*reinterpret_cast<__m512 *>(cos) = _mm512_cos_ps(a);
return _mm512_sin_ps(a);
#else
return _mm512_sincos_ps(reinterpret_cast<__m512 *>(cos), a);
#endif
}
static fscal reduce_add(const fvec &a) {
return _mm512_reduce_add_ps(a);
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm512_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return _mm512_mask_add_epi32(src, mask, a, b);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm512_mask_i32gather_epi32(from, mask, idx, base, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return _mm512_mask_add_ps(src, mask, a, b);
}
static void store(void *at, const fvec &a) {
_mm512_store_ps(at, a);
}
static void int_store(void *at, const ivec &a) {
_mm512_store_epi32(at, a);
}
static void mask_store(int *at, const bvec &a) {
for (int i = 0; i < 16; i++) {
at[i] = (a >> i) & 1;
}
}
static fvec min(const fvec &a, const fvec &b) {
return _mm512_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return mask & (1 << at);
}
static bool mask_testz(const bvec &mask) {
return mask == 0;
}
static bvec mask_enable_lower(int n) {
return 0xFFFF >> (VL - n);
}
static ivec int_load_vl(const int *a) {
return _mm512_load_epi32(a);
}
static void int_clear_arr(int *a) {
_mm512_store_epi32(a, ivec(0));
}
static void int_print(const ivec &a) {
iarr tmp;
_mm512_store_epi32(tmp, a);
for (int i = 0; i < 16; i++) printf("%d ", tmp[i]);
printf("\n");
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
*r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 20);
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
}
// Additional routines needed for the implementation of mixed precision
static fvec cvtdown(const vector_ops<double,KNC>::fvec &lo, const vector_ops<double,KNC>::fvec &hi) {
__m512 t1 = _mm512_cvtpd_pslo(lo);
__m512 t2 = _mm512_cvtpd_pslo(hi);
return _mm512_mask_permute4f128_ps(t1, 0xFF00, t2, _MM_PERM_BADC);
}
static vector_ops<double,KNC>::fvec cvtup_lo(const fvec &a) {
return _mm512_cvtpslo_pd(a);
}
static vector_ops<double,KNC>::fvec cvtup_hi(const fvec &a) {
return _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a, _MM_PERM_BADC)); // permute DCBA -> BADC
}
static void mask_cvtup(const bvec &a, vector_ops<double,KNC>::bvec *blo, vector_ops<double,KNC>::bvec *bhi) {
*blo = a & 0xFF;
*bhi = a >> 8;
}
};
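// Illustrative mixed-precision round trip (sketch only, for some vf::fvec x):
// split a float vector into two double vectors, compute in double precision,
// and convert back down.
//
//   typedef vector_ops<float, KNC>  vf;
//   typedef vector_ops<double, KNC> vd;
//   vd::fvec lo = vf::cvtup_lo(x);                // lower 8 floats -> doubles
//   vd::fvec hi = vf::cvtup_hi(x);                // upper 8 floats -> doubles
//   vf::fvec y  = vf::cvtdown(vd::invsqrt(lo), vd::invsqrt(hi));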
#endif
//////////////////////////////////////////////////////////////////////////////
// AVX/SSE
//////////////////////////////////////////////////////////////////////////////
#ifndef __MIC__
// Class definitions for integer vectors and masks under AVX.
// Note that we have to lower a number of operations to SSE, notably the
// comparison and integer operations.
// The gather operations are emulated.
struct ivec32x8 {
__m256i vec;
ivec32x8() {}
ivec32x8(__m256i m) { vec = m; }
ivec32x8(const int * a) {
vec = _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
}
explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) {
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) {
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_add_epi32(alo, blo);
__m128i rhi = _mm_add_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
};
struct avx_bvec {
__m256i vec;
avx_bvec() {}
avx_bvec(__m256i m) { vec = m; }
explicit avx_bvec(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
operator ivec32x8() const { return vec; }
friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) {
return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) {
return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
}
friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); }
avx_bvec& operator &=(const avx_bvec &a) { return *this = _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(vec), _mm256_castsi256_pd(a))); }
};
template<>
struct vector_ops<double, AVX> {
static const int VL = 4;
typedef double fscal;
typedef F64vec4 fvec;
typedef ivec32x8 ivec;
typedef avx_bvec bvec;
typedef double farr[4] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
// newton-raphson
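// the ~12-bit estimate from _mm_rcp_ps is refined with two Newton-Raphson
// steps b <- b*(2 - a*b), written as b + b - a*b*b; each step roughly
// doubles the number of correct bits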
fvec b = _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a)));
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
farr result;
farr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_pd(reinterpret_cast<double*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i])
: src[i];
}
return _mm256_load_pd(reinterpret_cast<double*>(result));
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
iarr i, m;
int_store(i, idxs);
mask_store(m, mask);
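// i[] holds byte offsets; the division by 32 assumes sizeof(T) == 32, so
// each enabled lane loads one whole 32-byte {x,y,z,w} record with a single
// 256-bit load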
__m256d a0 = m[0] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[0]/32])) : _mm256_setzero_pd();
__m256d a1 = m[2] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[1]/32])) : _mm256_setzero_pd();
__m256d b0 = _mm256_unpacklo_pd(a0, a1);
__m256d b1 = _mm256_unpackhi_pd(a0, a1);
__m256d a2 = m[4] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[2]/32])) : _mm256_setzero_pd();
__m256d a3 = m[6] ? _mm256_load_pd(reinterpret_cast<const double*>(&base[i[3]/32])) : _mm256_setzero_pd();
__m256d b2 = _mm256_unpacklo_pd(a2, a3);
__m256d b3 = _mm256_unpackhi_pd(a2, a3);
__m256d c0 = _mm256_permute2f128_pd(b0, b2, 0x20);
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
*x = blend(mask, *x, c0);
*y = blend(mask, *y, c1);
*z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0)));
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
iarr i, m;
_mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs);
mask_store(m, mask);
__m256d z = _mm256_setzero_pd();
const char * m0 = reinterpret_cast<const char*>(base) + 4 * i[0];
const char * m1 = reinterpret_cast<const char*>(base) + 4 * i[2];
const char * m2 = reinterpret_cast<const char*>(base) + 4 * i[4];
const char * m3 = reinterpret_cast<const char*>(base) + 4 * i[6];
const double * e0 = reinterpret_cast<const double*>(m[0] == 0 ? reinterpret_cast<const char*>(&z) : m0);
const double * e1 = reinterpret_cast<const double*>(m[2] == 0 ? reinterpret_cast<const char*>(&z) : m1);
const double * e2 = reinterpret_cast<const double*>(m[4] == 0 ? reinterpret_cast<const char*>(&z) : m2);
const double * e3 = reinterpret_cast<const double*>(m[6] == 0 ? reinterpret_cast<const char*>(&z) : m3);
__m256d a0 = _mm256_load_pd(e0);
__m256d a1 = _mm256_load_pd(e1);
__m256d b0 = _mm256_unpacklo_pd(a0, a1);
__m256d b1 = _mm256_unpackhi_pd(a0, a1);
__m256d a2 = _mm256_load_pd(e2);
__m256d a3 = _mm256_load_pd(e3);
__m256d b2 = _mm256_unpacklo_pd(a2, a3);
__m256d b3 = _mm256_unpackhi_pd(a2, a3);
__m256d c0 = _mm256_permute2f128_pd(b0, b2, 0x20);
__m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
__m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
__m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
*r0 = blend(mask, *r0, c0);
*r1 = blend(mask, *r1, c1);
*r2 = blend(mask, *r2, c2);
*r3 = blend(mask, *r3, c3);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_andnot_si128(_mm_cmpeq_epi32(alo, blo), _mm_cmpeq_epi8(alo, alo));
__m128i rhi = _mm_andnot_si128(_mm_cmpeq_epi32(ahi, bhi), _mm_cmpeq_epi8(alo, alo));
return _mm256_setr_m128i(rlo, rhi);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_cmplt_epi32(alo, blo);
__m128i rhi = _mm_cmplt_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_pd(reinterpret_cast<__m256d *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256d t1 = _mm256_hadd_pd(a, a);
__m128d t2 = _mm256_extractf128_pd(t1, 1);
__m128d t3 = _mm256_castpd256_pd128(t1);
return _mm_cvtsd_f64(_mm_add_pd(t2, t3));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_mullo_epi32(alo, blo);
__m128i rhi = _mm_mullo_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
for (int i = 0; i < VL; i++) {
int tmp;
if (mask_test_at(mask, i)) {
tmp = *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]);
} else {
tmp = src[2*i];
}
result[2*i] = tmp;
result[2*i+1] = tmp;
}
return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_pd(reinterpret_cast<double*>(at), a);
}
static void int_store(int *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
at[1] = at[2];
at[2] = at[4];
at[3] = at[6];
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[2*at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
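// lane indices are duplicated because the double-precision AVX path keeps
// each 32-bit integer pairwise per 64-bit lane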
static const int base[8] __attribute__((aligned(64))) = {0,0,1,1,2,2,3,3};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
__m128i c = _mm_unpacklo_epi32(b, b);
__m128i d = _mm_unpackhi_epi32(b, b);
return _mm256_setr_m128i(c, d);
// stolen from http://stackoverflow.com/questions/6687535/unexpected-result-from-avx-m256-unpack-ps-unpack-intrinsic
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
};
template<>
struct vector_ops<float, AVX> {
static const int VL = 8;
static const int ALIGN = 32;
typedef float fscal;
typedef F32vec8 fvec;
typedef ivec32x8 ivec;
typedef avx_bvec bvec;
typedef float farr[8] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
fvec b = _mm256_rcp_ps(a);
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
farr result;
farr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_ps(reinterpret_cast<float*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i];
}
return _mm256_load_ps(reinterpret_cast<float*>(result));
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
iarr i, m;
int_store(i, idxs);
mask_store(m, mask);
farr zero_mem;
store(zero_mem, zero());
const float *e0 = m[0] ? reinterpret_cast<const float*>(&base[i[0]/16]) : zero_mem;
const float *e1 = m[1] ? reinterpret_cast<const float*>(&base[i[1]/16]) : zero_mem;
const float *e2 = m[2] ? reinterpret_cast<const float*>(&base[i[2]/16]) : zero_mem;
const float *e3 = m[3] ? reinterpret_cast<const float*>(&base[i[3]/16]) : zero_mem;
const float *e4 = m[4] ? reinterpret_cast<const float*>(&base[i[4]/16]) : zero_mem;
const float *e5 = m[5] ? reinterpret_cast<const float*>(&base[i[5]/16]) : zero_mem;
const float *e6 = m[6] ? reinterpret_cast<const float*>(&base[i[6]/16]) : zero_mem;
const float *e7 = m[7] ? reinterpret_cast<const float*>(&base[i[7]/16]) : zero_mem;
__m256 a0 = _mm256_loadu2_m128(e4, e0);
__m256 a1 = _mm256_loadu2_m128(e5, e1);
__m256 b0 = _mm256_unpacklo_ps(a0, a1);
__m256 b1 = _mm256_unpackhi_ps(a0, a1);
__m256 a2 = _mm256_loadu2_m128(e6, e2);
__m256 a3 = _mm256_loadu2_m128(e7, e3);
__m256 b2 = _mm256_unpacklo_ps(a2, a3);
__m256 b3 = _mm256_unpackhi_ps(a2, a3);
__m256 c0 = _mm256_shuffle_ps(b0, b2, 0x44);
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
*x = blend(mask, *x, c0);
*y = blend(mask, *y, c1);
*z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm256_castps_si256(c3));
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
iarr i, m;
int_store(i, idxs);
mask_store(m, mask);
farr zero_mem;
store(zero_mem, zero());
const float *e0 = m[0] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[0]]) : zero_mem;
const float *e1 = m[1] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[1]]) : zero_mem;
const float *e2 = m[2] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[2]]) : zero_mem;
const float *e3 = m[3] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[3]]) : zero_mem;
const float *e4 = m[4] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[4]]) : zero_mem;
const float *e5 = m[5] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[5]]) : zero_mem;
const float *e6 = m[6] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[6]]) : zero_mem;
const float *e7 = m[7] ? reinterpret_cast<const float*>(&reinterpret_cast<const char *>(base)[4*i[7]]) : zero_mem;
__m256 a0 = _mm256_loadu2_m128(e4, e0);
__m256 a1 = _mm256_loadu2_m128(e5, e1);
__m256 b0 = _mm256_unpacklo_ps(a0, a1);
__m256 b1 = _mm256_unpackhi_ps(a0, a1);
__m256 a2 = _mm256_loadu2_m128(e6, e2);
__m256 a3 = _mm256_loadu2_m128(e7, e3);
__m256 b2 = _mm256_unpacklo_ps(a2, a3);
__m256 b3 = _mm256_unpackhi_ps(a2, a3);
__m256 c0 = _mm256_shuffle_ps(b0, b2, 0x44);
__m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
__m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
__m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
*r0 = blend(mask, *r0, c0);
*r1 = blend(mask, *r1, c1);
*r2 = blend(mask, *r2, c2);
*r3 = blend(mask, *r3, c3);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_andnot_si128(_mm_cmpeq_epi32(alo, blo), _mm_cmpeq_epi8(alo, alo));
__m128i rhi = _mm_andnot_si128(_mm_cmpeq_epi32(ahi, bhi), _mm_cmpeq_epi8(alo, alo));
return _mm256_setr_m128i(rlo, rhi);
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_cmplt_epi32(alo, blo);
__m128i rhi = _mm_cmplt_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_ps(reinterpret_cast<__m256 *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256 t1 = _mm256_hadd_ps(a, a);
__m128 t2 = _mm256_extractf128_ps(t1, 1);
__m128 t3 = _mm256_castps256_ps128(t1);
__m128 t4 = _mm_add_ps(t2, t3);
__m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse
return _mm_cvtss_f32(_mm_add_ps(t4, t5));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
__m128i alo = _mm256_castsi256_si128(a);
__m128i ahi = _mm256_extractf128_si256(a, 1);
__m128i blo = _mm256_castsi256_si128(b);
__m128i bhi = _mm256_extractf128_si256(b, 1);
__m128i rlo = _mm_mullo_epi32(alo, blo);
__m128i rhi = _mm_mullo_epi32(ahi, bhi);
return _mm256_setr_m128i(rlo, rhi);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs;
_mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
_mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = mask_test_at(mask, i)
? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i];
}
return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_ps(reinterpret_cast<float*>(at), a);
}
static void int_store(void *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[8] __attribute__((aligned(64))) = {0,1,2,3,4,5,6,7};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
return _mm256_load_si256(reinterpret_cast<const __m256i*>(a));
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
static fvec cvtdown(const vector_ops<double,AVX>::fvec &lo, const vector_ops<double,AVX>::fvec &hi) {
__m128 t1 = _mm256_cvtpd_ps(lo);
__m128 t2 = _mm256_cvtpd_ps(hi);
return _mm256_setr_m128(t1, t2);
}
static vector_ops<double,AVX>::fvec cvtup_lo(const fvec &a) {
return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
}
static vector_ops<double,AVX>::fvec cvtup_hi(const fvec &a) {
return _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1)); // convert the upper 128-bit half
}
static void mask_cvtup(const bvec &a, vector_ops<double,AVX>::bvec *blo, vector_ops<double,AVX>::bvec *bhi) {
__m256i t1 = _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
__m256i t2 = _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
*blo = _mm256_permute2f128_si256(t1, t2, 0x20);
*bhi = _mm256_permute2f128_si256(t1, t2, 0x31);
}
};
// AVX2
//
struct avx2_ivec32 {
__m256i vec;
avx2_ivec32() {}
avx2_ivec32(__m256i m) { vec = m; }
avx2_ivec32(const int * a) {
vec = _mm256_load_si256(reinterpret_cast<const __m256i *>(a));
}
explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) {
return _mm256_and_si256(a, b);
}
friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) {
return _mm256_or_si256(a, b);
}
friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) {
return _mm256_add_epi32(a, b);
}
};
struct avx2_bvec {
__m256i vec;
avx2_bvec() {}
avx2_bvec(__m256i m) { vec = m; }
explicit avx2_bvec(int i) { vec = _mm256_set1_epi32(i); }
operator __m256i() const { return vec; }
operator __m256d() const { return _mm256_castsi256_pd(vec); }
operator __m256() const { return _mm256_castsi256_ps(vec); }
operator F64vec4() const { return _mm256_castsi256_pd(vec); }
operator F32vec8() const { return _mm256_castsi256_ps(vec); }
operator avx2_ivec32() const { return vec; }
friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) {
return _mm256_and_si256(a, b);
}
friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) {
return _mm256_or_si256(a, b);
}
friend avx2_bvec operator ~(const avx2_bvec &a) {
return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF));
}
avx2_bvec& operator &=(const avx2_bvec &a) { return *this = _mm256_and_si256(vec,a); }
};
template<>
struct vector_ops<double, AVX2> {
static const int VL = 4;
typedef double fscal;
typedef F64vec4 fvec;
typedef avx2_ivec32 ivec;
typedef avx2_bvec bvec;
typedef double farr[4] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
// newton-raphson
fvec b = _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a)));
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
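// compact the pairwise-duplicated indices [a,a,b,b,c,c,d,d] into [a,b,c,d]
// in the low 128 bits, as required by the four-lane double gather below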
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
return _mm256_mask_i32gather_pd(from, static_cast<const double*>(base), _mm256_castsi256_si128(idx1), mask, scale);
}
template<class T>
static void gather_x(const ivec &idx, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
*x = _mm256_mask_i32gather_pd(*x, &base->x, _mm256_castsi256_si128(idx1), mask, 1);
*y = _mm256_mask_i32gather_pd(*y, &base->y, _mm256_castsi256_si128(idx1), mask, 1);
*z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1);
*w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
}
static void gather_4(const ivec &idx, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
*r0 = _mm256_mask_i32gather_pd(*r0, static_cast<const double*>(base) + 0, _mm256_castsi256_si128(idx1), mask, 4);
*r1 = _mm256_mask_i32gather_pd(*r1, static_cast<const double*>(base) + 1, _mm256_castsi256_si128(idx1), mask, 4);
*r2 = _mm256_mask_i32gather_pd(*r2, static_cast<const double*>(base) + 2, _mm256_castsi256_si128(idx1), mask, 4);
*r3 = _mm256_mask_i32gather_pd(*r3, static_cast<const double*>(base) + 3, _mm256_castsi256_si128(idx1), mask, 4);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return ~bvec(_mm256_cmpeq_epi32(a, b));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm256_cmpgt_epi32(b, a);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_pd(reinterpret_cast<__m256d *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256d t1 = _mm256_hadd_pd(a, a);
__m128d t2 = _mm256_extractf128_pd(t1, 1);
__m128d t3 = _mm256_castpd256_pd128(t1);
return _mm_cvtsd_f64(_mm_add_pd(t2, t3));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm256_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm256_mask_i32gather_epi32(from, static_cast<const int*>(base), idx, mask, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_pd(reinterpret_cast<double*>(at), a);
}
static void int_store(int *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
at[1] = at[2];
at[2] = at[4];
at[3] = at[6];
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[2*at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[8] __attribute__((aligned(64))) = {0,0,1,1,2,2,3,3};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
__m128i c = _mm_unpacklo_epi32(b, b);
__m128i d = _mm_unpackhi_epi32(b, b);
return _mm256_setr_m128i(c, d);
// stolen from http://stackoverflow.com/questions/6687535/unexpected-result-from-avx-m256-unpack-ps-unpack-intrinsic
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
};
template<>
struct vector_ops<float, AVX2> {
static const int VL = 8;
static const int ALIGN = 32;
typedef float fscal;
typedef F32vec8 fvec;
typedef avx2_ivec32 ivec;
typedef avx2_bvec bvec;
typedef float farr[8] __attribute__((aligned(32)));
typedef int iarr[8] __attribute__((aligned(32)));
static fvec recip(const fvec &a) {
fvec b = _mm256_rcp_ps(a);
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, bvec mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, bvec mask, const ivec &idx, const void *base) {
return _mm256_mask_i32gather_ps(from, static_cast<const float*>(base), idx, mask, scale);
}
template<class T>
static void gather_x(const ivec &idx, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = _mm256_mask_i32gather_ps(*x, reinterpret_cast<const float*>(base) + 0, idx, mask, 1);
*y = _mm256_mask_i32gather_ps(*y, reinterpret_cast<const float*>(base) + 1, idx, mask, 1);
*z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1);
*w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
*r4 = gather<4>(*r4, mask, idxs, reinterpret_cast<const char *>(base) + 16);
*r5 = gather<4>(*r5, mask, idxs, reinterpret_cast<const char *>(base) + 20);
*r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
*r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char *>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char *>(base) + 12);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & mask) | (a & ~ mask);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | (a & ~ mask);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm256_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_EQ_OQ));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_NLE_US));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LE_OS));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return ~bvec(_mm256_cmpeq_epi32(a, b));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm256_cmpgt_epi32(b, a);
}
static fvec invsqrt(const fvec &a) {
return _mm256_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm256_sincos_ps(reinterpret_cast<__m256 *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m256 t1 = _mm256_hadd_ps(a, a);
__m128 t2 = _mm256_extractf128_ps(t1, 1);
__m128 t3 = _mm256_castps256_ps128(t1);
__m128 t4 = _mm_add_ps(t2, t3);
__m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse
return _mm_cvtss_f32(_mm_add_ps(t4, t5));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm256_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return _mm256_mask_i32gather_epi32(from, static_cast<const int*>(base), idx, mask, scale);
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & mask) | (src & ~ mask);
}
static void store(void *at, const fvec &a) {
_mm256_store_ps(reinterpret_cast<float*>(at), a);
}
static void int_store(void *at, const ivec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static void mask_store(int *at, const bvec &a) {
_mm256_store_si256(reinterpret_cast<__m256i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm256_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[at];
}
static bool mask_testz(const bvec &mask) {
return _mm256_testz_si256(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[8] __attribute__((aligned(64))) = {0,1,2,3,4,5,6,7};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
return _mm256_load_si256(reinterpret_cast<const __m256i*>(a));
}
static void int_clear_arr(int *a) {
_mm256_store_si256(reinterpret_cast<__m256i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm256_store_si256(reinterpret_cast<__m256i *>(tmp), a);
for (int i = 0; i < 8; i++) printf("%d ", tmp[i]);
printf("\n");
}
static fvec cvtdown(const vector_ops<double,AVX2>::fvec &lo, const vector_ops<double,AVX2>::fvec &hi) {
__m128 t1 = _mm256_cvtpd_ps(lo);
__m128 t2 = _mm256_cvtpd_ps(hi);
return _mm256_setr_m128(t1, t2);
}
static vector_ops<double,AVX2>::fvec cvtup_lo(const fvec &a) {
return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
}
static vector_ops<double,AVX2>::fvec cvtup_hi(const fvec &a) {
return _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1)); // convert the upper 128-bit half
}
static void mask_cvtup(const bvec &a, vector_ops<double,AVX2>::bvec *blo, vector_ops<double,AVX2>::bvec *bhi) {
__m256i t1 = _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
__m256i t2 = _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(a)));
*blo = _mm256_permute2f128_si256(t1, t2, 0x20);
*bhi = _mm256_permute2f128_si256(t1, t2, 0x31);
}
};
//////////////////////////////////////////////////////////////////////////////
// SSE
//////////////////////////////////////////////////////////////////////////////
#pragma pack(push,16)
struct ivec32x4 {
__m128i vec;
ivec32x4() {}
ivec32x4(__m128i m) { vec = m; }
ivec32x4(const int * a) {
vec = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
}
explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); }
operator __m128i() const { return vec; }
friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) {
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) {
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) {
return _mm_add_epi32(a, b);
}
};
struct sse_bvecx4 {
__m128i vec;
sse_bvecx4() {}
sse_bvecx4(__m128i m) { vec = m; }
explicit sse_bvecx4(int i) { vec = _mm_set1_epi32(i); }
operator __m128i() const { return vec; }
operator F64vec2() const { return _mm_castsi128_pd(vec); }
operator ivec32x4() const { return vec; }
friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) {
return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) {
return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
}
friend sse_bvecx4 operator ~(const sse_bvecx4 &a) { return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF)))); }
sse_bvecx4& operator &=(const sse_bvecx4 &a) { return *this = _mm_and_si128(vec,a); }
};
#pragma pack(pop)
template<>
struct vector_ops<double, SSE> {
static const int VL = 2;
typedef double fscal;
typedef F64vec2 fvec;
typedef ivec32x4 ivec;
typedef sse_bvecx4 bvec;
typedef double farr[2] __attribute__((aligned(16)));
typedef int iarr[4] __attribute__((aligned(16)));
static fvec recip(const fvec &a) {
fvec b = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(a)));
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
fvec res = from;
if (_mm_extract_epi32(mask, 0)) {
res = _mm_loadl_pd(res, reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale*_mm_extract_epi32(idx, 0)));
}
if (_mm_extract_epi32(mask, 2)) {
res = _mm_loadh_pd(res, reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale*_mm_extract_epi32(idx, 2)));
}
return res;
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
__m128d a0lo, a0hi, a1lo, a1hi;
if (_mm_extract_epi32(mask, 0)) {
a0lo = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 0)/32]));
a0hi = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 0)/32].z));
}
if (_mm_extract_epi32(mask, 2)) {
a1lo = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 2)/32]));
a1hi = _mm_load_pd(reinterpret_cast<const double*>(&base[_mm_extract_epi32(idxs, 2)/32].z));
}
__m128d c0 = _mm_unpacklo_pd(a0lo, a1lo);
__m128d c1 = _mm_unpackhi_pd(a0lo, a1lo);
__m128d c2 = _mm_unpacklo_pd(a0hi, a1hi);
__m128d c3 = _mm_unpackhi_pd(a0hi, a1hi);
*x = blend(mask, *x, c0);
*y = blend(mask, *y, c1);
*z = blend(mask, *z, c2);
*w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0));
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
fvec a = zero(), b = zero(), c = zero(), d = zero();
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 8);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 16);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 24);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & _mm_castsi128_pd(mask)) | _mm_andnot_pd(_mm_castsi128_pd(mask), a);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | _mm_andnot_si128(mask, a);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm_setzero_pd();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmpeq_pd(a, b));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmpnle_pd(a, b));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmple_pd(a, b));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm_castpd_si128(_mm_cmplt_pd(a, b));
}
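// int_cmpneq: _mm_cmpeq_epi8(we, we) yields all-ones regardless of the
// (undefined) contents of we; andnot with the equality mask gives "not equal".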
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i we = _mm_undefined_si128();
return _mm_andnot_si128(_mm_cmpeq_epi32(a, b), _mm_cmpeq_epi8(we, we));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm_cmplt_epi32(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm_sincos_pd(reinterpret_cast<__m128d *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m128d t1 = _mm_shuffle_pd(a, a, 1); // reverse vector
return _mm_cvtsd_f64(_mm_add_pd(t1, a));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | _mm_andnot_si128(mask, src);
}
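// int_gather: scalar fallback; each fetched value is written to both halves
// of its paired 32-bit lanes to preserve the duplicated integer layout.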
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs, m;
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
_mm_store_si128(reinterpret_cast<__m128i*>(src), from);
_mm_store_si128(reinterpret_cast<__m128i*>(m), mask);
for (int i = 0; i < VL; i++) {
int tmp;
if (m[2*i]) {
tmp = *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]);
} else {
tmp = src[2*i];
}
result[2*i] = tmp;
result[2*i+1] = tmp;
}
return _mm_load_si128(reinterpret_cast<__m128i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & _mm_castsi128_pd(mask)) | _mm_andnot_pd(_mm_castsi128_pd(mask), src);
}
static void store(void *at, const fvec &a) {
_mm_store_pd(reinterpret_cast<double*>(at), a);
}
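// int_store: the vector holds each logical value twice; after the store,
// at[1] = at[2] compacts the two logical values into at[0] and at[1].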
static void int_store(int *at, const ivec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
at[1] = at[2];
}
static void mask_store(int *at, const bvec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm_min_pd(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[2*at];
}
static bool mask_testz(const bvec &mask) {
return _mm_testz_si128(mask, mask);
}
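// mask_enable_lower: the lane indices {0,0,1,1} follow the duplicated layout;
// comparing them against n enables the first n logical lanes.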
static bvec mask_enable_lower(int n) {
static const int base[4] __attribute__((aligned(32))) = {0,0,1,1};
return int_cmplt(ivec(base), ivec(n));
}
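// int_load_vl: duplicate each of the two lower integers into its paired
// 32-bit lanes (unpacklo of the loaded vector with itself).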
static ivec int_load_vl(const int *a) {
__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
return _mm_unpacklo_epi32(b, b);
}
static void int_clear_arr(int *a) {
_mm_store_si128(reinterpret_cast<__m128i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm_store_si128(reinterpret_cast<__m128i *>(tmp), a);
for (int i = 0; i < 4; i++) printf("%d ", tmp[i]);
printf("\n");
}
};
template<>
struct vector_ops<float, SSE> {
static const int VL = 4;
static const int ALIGN = 16;
typedef float fscal;
typedef F32vec4 fvec;
typedef ivec32x4 ivec;
typedef sse_bvecx4 bvec;
typedef float farr[4] __attribute__((aligned(16)));
typedef int iarr[4] __attribute__((aligned(16)));
static fvec recip(const fvec &a) {
fvec b = _mm_rcp_ps(a);
b = b + b - a * b * b;
return b + b - a * b * b;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
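// gather: no SSE gather instruction; spill mask, indices and source to
// aligned arrays and gather element by element.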
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
farr result;
farr src;
iarr idxs, m;
mask_store(m, mask);
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
_mm_store_ps(reinterpret_cast<float*>(src), from);
for (int i = 0; i < VL; i++) {
result[i] = m[i]
? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
: src[i];
}
return _mm_load_ps(reinterpret_cast<float*>(result));
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0);
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 4);
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 8);
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 12);
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return (b & _mm_castsi128_ps(mask)) | _mm_andnot_ps(_mm_castsi128_ps(mask), a);
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return (b & mask) | _mm_andnot_si128(mask, a);
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return _mm_setzero_ps();
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmpeq_ps(a, b));
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmpnle_ps(a, b));
}
static bvec cmple(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmple_ps(a, b));
}
static bvec cmplt(const fvec &a, const fvec &b) {
return _mm_castps_si128(_mm_cmplt_ps(a, b));
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
__m128i we = _mm_undefined_si128();
return _mm_andnot_si128(_mm_cmpeq_epi32(a, b), _mm_cmpeq_epi8(we, we));
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return _mm_cmplt_epi32(a, b);
}
static fvec invsqrt(const fvec &a) {
return _mm_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
return _mm_sincos_ps(reinterpret_cast<__m128 *>(cos), a);
}
static fscal reduce_add(const fvec &a) {
__m128 t1 = _mm_hadd_ps(a, a);
__m128 t2 = _mm_shuffle_ps(t1, t1, 0x1B); // reverse
return _mm_cvtss_f32(_mm_add_ps(t1, t2));
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return _mm_mullo_epi32(a, b);
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return ((a + b) & mask) | _mm_andnot_si128(mask, src);
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
iarr result;
iarr src;
iarr idxs, m;
_mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
_mm_store_si128(reinterpret_cast<__m128i*>(src), from);
_mm_store_si128(reinterpret_cast<__m128i*>(m), mask);
for (int i = 0; i < VL; i++) {
int tmp;
if (m[i]) {
tmp = *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i]);
} else {
tmp = src[i];
}
result[i] = tmp;
}
return _mm_load_si128(reinterpret_cast<__m128i*>(result));
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return ((a + b) & _mm_castsi128_ps(mask)) | _mm_andnot_ps(_mm_castsi128_ps(mask), src);
}
static void store(void *at, const fvec &a) {
_mm_store_ps(reinterpret_cast<float*>(at), a);
}
static void int_store(int *at, const ivec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
}
static void mask_store(int *at, const bvec &a) {
_mm_store_si128(reinterpret_cast<__m128i*>(at), a);
}
static fvec min(const fvec &a, const fvec &b) {
return _mm_min_ps(a, b);
}
static bool mask_test_at(const bvec &mask, int at) {
return reinterpret_cast<const int*>(&mask)[at];
}
static bool mask_testz(const bvec &mask) {
return _mm_testz_si128(mask, mask);
}
static bvec mask_enable_lower(int n) {
static const int base[4] __attribute__((aligned(32))) = {0,1,2,3};
return int_cmplt(ivec(base), ivec(n));
}
static ivec int_load_vl(const int *a) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(a));
}
static void int_clear_arr(int *a) {
_mm_store_si128(reinterpret_cast<__m128i *>(a), ivec(0));
}
static bvec full_mask() {
return bvec(0xFFFFFFFF);
}
static void int_print(const ivec &a) {
iarr tmp;
_mm_store_si128(reinterpret_cast<__m128i *>(tmp), a);
for (int i = 0; i < 4; i++) printf("%d ", tmp[i]);
printf("\n");
}
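// The cvtdown/cvtup_*/mask_cvtup helpers convert between the 4-lane single
// precision and 2-lane double precision representations; they are used by
// AccumulatorTwiceMixin to accumulate in double while computing in single.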
static fvec cvtdown(const vector_ops<double,SSE>::fvec &lo, const vector_ops<double,SSE>::fvec &hi) {
__m128 t1 = _mm_cvtpd_ps(lo);
__m128 t2 = _mm_cvtpd_ps(hi);
return _mm_shuffle_ps(t1, t2, 0x44); // t2[1]t2[0]t1[1]t1[0]
}
static vector_ops<double,SSE>::fvec cvtup_lo(const fvec &a) {
return _mm_cvtps_pd(a);
}
static vector_ops<double,SSE>::fvec cvtup_hi(const fvec &a) {
return _mm_cvtps_pd(_mm_shuffle_ps(a, a, 0x4E)); // permute DCBA -> BADC
}
static void mask_cvtup(const bvec &a, vector_ops<double,SSE>::bvec *blo, vector_ops<double,SSE>::bvec *bhi) {
*blo = _mm_unpacklo_epi32(a, a);
*bhi = _mm_unpackhi_epi32(a, a);
}
};
#endif
// Scalar implementation
template<class flt_t>
struct vector_ops<flt_t, NONE> {
static const int VL = 1;
static const int ALIGN = 4;
typedef flt_t fscal;
typedef flt_t fvec;
typedef int ivec;
typedef bool bvec;
typedef flt_t farr[1];
typedef int iarr[1];
static fvec recip(const fvec &a) {
return ((flt_t) 1.) / a;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
return mask ? *reinterpret_cast<const flt_t*>(reinterpret_cast<const char*>(base) + scale * idx) : from;
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 2 * sizeof(fscal));
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 3 * sizeof(fscal));
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
return mask ? b : a;
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
return mask ? b : a;
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
return a*b + c;
}
static fvec zero() {
return 0.;
}
static bvec cmpeq(const fvec &a, const fvec &b) {
return a == b;
}
static bvec cmpnle(const fvec &a, const fvec &b) {
return !(a <= b);
}
static bvec cmple(const fvec &a, const fvec &b) {
return a <= b;
}
static bvec cmplt(const fvec &a, const fvec &b) {
return a < b;
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
return a != b;
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
return a < b;
}
static fvec invsqrt(const fvec &a) {
return 1. / sqrt(a);
}
static fvec sincos(fvec *c, const fvec &a) {
*c = cos(a);
return sin(a);
}
static fscal reduce_add(const fvec &a) {
return a;
}
static ivec int_mullo(const ivec &a, const ivec &b) {
return a * b;
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
return mask ? a + b : src;
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
return mask ? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idx) : from;
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
return mask ? a + b : src;
}
static void store(void *at, const fvec &a) {
*reinterpret_cast<flt_t*>(at) = a;
}
static void int_store(int *at, const ivec &a) {
*reinterpret_cast<int*>(at) = a;
}
static void mask_store(int *at, const bvec &a) {
*at = a;
}
static fvec min(const fvec &a, const fvec &b) {
return a < b ? a : b;
}
static bool mask_test_at(const bvec &mask, int at) {
return mask;
}
static bool mask_testz(const bvec &mask) {
return ! mask;
}
static bvec mask_enable_lower(int n) {
return n > 0;
}
static ivec int_load_vl(const int *a) {
return *a;
}
static void int_clear_arr(int *a) {
*a = 0;
}
static bvec full_mask() {
return true;
}
static void int_print(const ivec &a) {
}
};
// Array notation implementation
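// Builds on the lmp_intel_an_fvec/ivec/bvec wrapper classes defined earlier
// in this file; VL is fixed at 4 here.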
template<class flt_t>
struct vector_ops<flt_t, AN> {
static const int VL = 4;
typedef flt_t fscal;
typedef lmp_intel_an_fvec<VL, fscal> fvec;
typedef lmp_intel_an_ivec<VL> ivec;
typedef lmp_intel_an_bvec<VL> bvec;
typedef flt_t farr[VL];
typedef int iarr[VL];
static fvec recip(const fvec &a) {
fvec ret; ret.data[:] = ((fscal)1.) / a.data[:]; return ret;
}
template<int scale>
static void gather_prefetch_t0(const ivec &idx, const bvec &mask, const void *base) {
// nop
}
template<int scale>
static fvec gather(const fvec &from, const bvec &mask, const ivec &idx, const void *base) {
fvec ret = from;
if (mask.data[:]) ret.data[:] = *reinterpret_cast<const fscal *>(reinterpret_cast<const char*>(base) + scale * idx.data[:]);
return ret;
}
template<class T>
static void gather_x(const ivec &idxs, const bvec &mask, const T *base, fvec *x, fvec *y, fvec *z, ivec *w) {
*x = gather<1>(*x, mask, idxs, &base->x);
*y = gather<1>(*y, mask, idxs, &base->y);
*z = gather<1>(*z, mask, idxs, &base->z);
*w = int_gather<1>(*w, mask, idxs, &base->w);
}
static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
gather_4(idxs, mask, base, r0, r1, r2, r3);
gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
}
static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
*r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal));
*r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal));
*r2 = gather<4>(*r2, mask, idxs, reinterpret_cast<const char*>(base) + 2 * sizeof(fscal));
*r3 = gather<4>(*r3, mask, idxs, reinterpret_cast<const char*>(base) + 3 * sizeof(fscal));
}
static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
fvec ret = a;
if (mask.data[:]) ret.data[:] = b.data[:];
return ret;
}
static ivec int_blend(const bvec &mask, const ivec &a, const ivec &b) {
ivec ret = a;
if (mask.data[:]) ret.data[:] = b.data[:];
return ret;
}
static fvec fmadd(const fvec &a, const fvec &b, const fvec &c) {
fvec ret; ret.data[:] = a.data[:] * b.data[:] + c.data[:]; return ret;
}
static fvec zero() {
return fvec(0.);
}
static bvec cmpeq(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = a.data[:] == b.data[:]; return ret;
}
static bvec cmpnle(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = !(a.data[:] <= b.data[:]); return ret;
}
static bvec cmple(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = a.data[:] <= b.data[:]; return ret;
}
static bvec cmplt(const fvec &a, const fvec &b) {
bvec ret; ret.data[:] = a.data[:] < b.data[:]; return ret;
}
static bvec int_cmpneq(const ivec &a, const ivec &b) {
bvec ret; ret.data[:] = a.data[:] != b.data[:]; return ret;
}
static bvec int_cmplt(const ivec &a, const ivec &b) {
bvec ret; ret.data[:] = a.data[:] < b.data[:]; return ret;
}
static fvec invsqrt(const fvec &a) {
fvec ret; ret.data[:] = ((fscal)1.) / sqrt(a.data[:]); return ret;
}
static fvec sincos(fvec *c, const fvec &a) {
c->data[:] = cos(a.data[:]);
fvec ret; ret.data[:] = sin(a.data[:]); return ret;
}
static fscal reduce_add(const fvec &a) {
return __sec_reduce_add(a.data[:]);
}
static ivec int_mullo(const ivec &a, const ivec &b) {
ivec ret; ret.data[:] = a.data[:] * b.data[:]; return ret;
}
static ivec int_mask_add(const ivec &src, const bvec &mask, const ivec &a, const ivec &b) {
ivec ret = src;
if (mask.data[:]) ret.data[:] = a.data[:] + b.data[:];
return ret;
}
template<int scale>
static ivec int_gather(const ivec &from, bvec mask, const ivec &idx, const void *base) {
ivec ret = from;
if (mask.data[:]) ret.data[:] = reinterpret_cast<const int*>(base)[scale * idx.data[:] / sizeof(int)];
return ret;
}
static fvec mask_add(const fvec &src, const bvec &mask, const fvec &a, const fvec &b) {
fvec ret = src;
if (mask.data[:]) ret.data[:] = a.data[:] + b.data[:];
return ret;
}
static void store(void *at, const fvec &a) {
reinterpret_cast<fscal*>(at)[0:VL] = a.data[:];
}
static void int_store(int *at, const ivec &a) {
reinterpret_cast<int*>(at)[0:VL] = a.data[:];
}
static void mask_store(int *at, const bvec &a) {
at[0:VL] = a.data[:];
}
static fvec min(const fvec &a, const fvec &b) {
fvec ret = b;
if (a.data[:] < b.data[:]) ret.data[:] = a.data[:];
return ret;
}
static bool mask_test_at(const bvec &mask, int at) {
return mask.data[at];
}
static bool mask_testz(const bvec &mask) {
return ! __sec_reduce_or(mask.data[:]);
}
static bvec mask_enable_lower(int n) {
bvec ret; ret.data[:] = __sec_implicit_index(0) < n; return ret;
}
static ivec int_load_vl(const int *a) {
return ivec(a);
}
static void int_clear_arr(int *a) {
a[0:VL] = 0;
}
static bvec full_mask() {
return bvec(1);
}
static void int_print(const ivec &a) {
}
};
// Mixins to implement mixed precision and single/single and double/double
// This one is for single/single and double/double
template<class BASE_flt_t, CalculationMode BASE_mic>
struct AccumulatorSameMixin {
typedef vector_ops<BASE_flt_t, BASE_mic> BASE;
typedef typename BASE::fvec avec;
typedef typename BASE::farr aarr;
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
return BASE::mask_add(src, m, a, b);
}
static typename BASE::fscal acc_reduce_add(const avec &a) {
return BASE::reduce_add(a);
}
static avec acc_zero() {
return BASE::zero();
}
static void acc_store(aarr mem, const avec &a) {
BASE::store(mem, a);
}
};
// Mixed precision for cases where double vectors contain fewer elements
template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
struct AccumulatorTwiceMixin {
typedef vector_ops<BASE_flt_t, mic> BASE;
typedef vector_ops<HIGH_flt_t, mic> HIGH;
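// avec_t holds one accumulator as two double-precision halves (lo/hi), so a
// full single-precision vector can be accumulated without losing precision.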
struct avec_t {
typename HIGH::fvec lo, hi;
avec_t(const typename HIGH::fvec &alo, const typename HIGH::fvec &ahi) : lo(alo), hi(ahi) {}
avec_t(const typename BASE::fvec &a) {
lo = BASE::cvtup_lo(a);
hi = BASE::cvtup_hi(a);
}
friend avec_t operator +(const avec_t &a, const avec_t &b) {
return avec_t(a.lo + b.lo, a.hi + b.hi);
}
friend avec_t operator -(const avec_t &a, const avec_t &b) {
return avec_t(a.lo - b.lo, a.hi - b.hi);
}
friend avec_t operator *(const avec_t &a, const avec_t &b) {
return avec_t(a.lo * b.lo, a.hi * b.hi);
}
operator typename BASE::fvec() const {
return BASE::cvtdown(lo, hi);
}
};
typedef avec_t avec;
typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN)));
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
typename HIGH::fvec blo = BASE::cvtup_lo(b);
typename HIGH::fvec bhi = BASE::cvtup_hi(b);
typename HIGH::bvec mlo, mhi;
BASE::mask_cvtup(m, &mlo, &mhi);
return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi));
}
static typename HIGH::fscal acc_reduce_add(const avec &a) {
return HIGH::reduce_add(a.lo + a.hi);
}
static avec acc_zero() {
return avec(HIGH::zero(), HIGH::zero());
}
static void acc_store(aarr mem, const avec &a) {
HIGH::store(mem, a.lo);
HIGH::store(mem + BASE::VL / 2, a.hi);
}
};
// For cases where vector_ops<float,x>::VL == vector_ops<double,x>::VL
// i.e. scalar & AN
template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
struct AccumulatorTwiceMixinNone {
typedef vector_ops<BASE_flt_t, mic> BASE;
typedef vector_ops<HIGH_flt_t, mic> HIGH;
typedef typename HIGH::fvec avec;
typedef typename HIGH::fscal aarr[BASE::VL];
static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
return HIGH::mask_add(src, m, a, static_cast<typename HIGH::fvec>(b));
}
static typename HIGH::fscal acc_reduce_add(const avec &a) {
return HIGH::reduce_add(a);
}
static avec acc_zero() {
return HIGH::zero();
}
static void acc_store(aarr mem, const avec &a) {
HIGH::store(mem, a);
}
};
// This is the interface that user code ultimately sees.
template<class flt_t, class acc_t, CalculationMode mic>
struct vector_routines {};
template<CalculationMode mic>
struct vector_routines<double,double,mic> : public vector_ops<double, mic>, public AccumulatorSameMixin<double, mic> {};
template<CalculationMode mic>
struct vector_routines<float,float,mic> : public vector_ops<float, mic>, public AccumulatorSameMixin<float, mic> {};
template<CalculationMode mic>
struct vector_routines<float,double,mic> : public vector_ops<float, mic>, public AccumulatorTwiceMixin<float,double, mic> {};
// Specialize for AN and scalar
template<>
struct vector_routines<float,double,NONE> : public vector_ops<float, NONE>, public AccumulatorTwiceMixinNone<float,double, NONE> {};
template<>
struct vector_routines<float,double,AN> : public vector_ops<float, AN>, public AccumulatorTwiceMixinNone<float,double, AN> {};
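// Usage sketch (illustrative only, not part of the interface): a kernel picks
// its routines by instantiating vector_routines with the desired precision
// pair and CalculationMode, e.g. for mixed single/double precision with SSE:
//
//   using namespace lmp_intel;
//   typedef vector_routines<float, double, SSE> v;
//   v::fvec x   = v::zero();                            // computation vector
//   v::avec acc = v::acc_zero();                        // accumulation vector
//   acc = v::acc_mask_add(acc, v::full_mask(), acc, x); // masked accumulate
//   double e    = v::acc_reduce_add(acc);               // horizontal sum
//
// The scalar NONE variant can serve as a reference implementation when
// debugging the vectorized paths.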
} // namespace lmp_intel