gradient_avx.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Tue, Jun 11, 15:10

gradient_avx.cpp
View Options

	/**
	* @Author Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch), Gilles Fourestey (gilles.fourestey@epfl.ch)
	* @date July 2017
	* @version 0,1
	*
	*/

	#include <iostream>
	#include <string.h>
	//#include <cuda_runtime.h>
	#include <math.h>
	#include <sys/time.h>
	#include <fstream>
	#include <immintrin.h>
	//
	#include "structure_hpc.hpp"
	#include "simd_math_avx.h"
	#include "gradient.hpp"
	//
	//#include "iacaMarks.h"
	//
	//
	//
	void print256(__m256d __reg)
	{
	printf("%f\n", ((double*) &__reg)[0]);
	printf("%f\n", ((double*) &__reg)[1]);
	printf("%f\n", ((double*) &__reg)[2]);
	printf("%f\n", ((double*) &__reg)[3]);
	}
	//
	//
	//

	//
	//
	//
	#ifdef _double

	inline
	static
	void rotateCoordinateSystem_avx(__m256d Q_x, __m256d Q_y, const __m256d P_x, const __m256d P_y, double* angles)
	{
	//
	__m256d theta = _mm256_loadu_pd(angles);
	/positionning at the potential center/
	__m256d cos_theta = _mm256_cos_pd(theta);
	__m256d sin_theta = _mm256_sin_pd(theta);
	// rotation: 6 ops
	Q_x = P_xcos_theta + P_ysin_theta;
	Q_y = P_ycos_theta - P_xsin_theta;
	}
	//
	//
	//
	struct point module_potentialDerivatives_totalGradient_5_SOA_AVX(const struct point pImage, const struct Potential_SOA lens, int
	shalos, int nhalos)
	{
	asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
	//
	struct point grad, clumpgrad;
	grad.x = 0;
	grad.y = 0;
	//
	// smearing the image coordinates on registers
	__m256d image_x = _mm256_set1_pd(pImage->x);
	__m256d image_y = _mm256_set1_pd(pImage->y);
	//
	__m256d __grad_x = _mm256_set1_pd(0.);
	__m256d __grad_y = _mm256_set1_pd(0.);
	//
	__m256d __result_x = _mm256_set1_pd(0.);
	__m256d __result_y = _mm256_set1_pd(0.);
	//
	__m256d one = _mm256_set1_pd( 1. );
	//
	int i;
	int imax = shalos + nhalos;
	//#pragma unroll
	for(i = shalos; i < imax - imax%4; i = i + 4)
	{
	//
	struct point true_coord, true_coord_rotation;
	//
	__m256d b0 = _mm256_loadu_pd(&lens->b0[i]);
	//
	//true_coord.x = pImage->x - lens->position_x[i];
	//true_coord.y = pImage->y - lens->position_y[i];
	//
	__m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i]));
	__m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i]));
	//
	__m256d theta = _mm256_loadu_pd(&lens->ellipticity_angle[i]);
	/positionning at the potential center/
	__m256d cos_theta;
	__m256d sin_theta = _mm256_sincos_pd(&cos_theta, theta);
	//
	__m256d x = _mm256_add_pd(_mm256_mul_pd(true_coord_x, cos_theta), _mm256_mul_pd(true_coord_y, sin_theta));
	__m256d y = _mm256_sub_pd(_mm256_mul_pd(true_coord_y, cos_theta), _mm256_mul_pd(true_coord_x, sin_theta));
	//
	//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
	//
	__m256d ell_pot = _mm256_loadu_pd(&lens->ellipticity_potential[i]);
	//
	__m256d _R = __SQRT(ADD(MUL(x, MUL(x, SUB(one, ell_pot))), MUL(y, MUL(y, ADD(one, ell_pot)))));
	//
	__result_x = MUL(MUL(SUB(one, ell_pot), b0), MUL(x, __INV(_R)));
	__result_y = MUL(MUL(ADD(one, ell_pot), b0), MUL(y, __INV(_R)));
	//
	__grad_x = __grad_x + _mm256_sub_pd(_mm256_mul_pd(__result_x, cos_theta), _mm256_mul_pd(__result_y, sin_theta));
	__grad_y = __grad_y + _mm256_add_pd(_mm256_mul_pd(__result_y, cos_theta), _mm256_mul_pd(__result_x, sin_theta));
	//
	}
	//
	grad.x = ((double*) &__grad_x)[0];
	grad.x += ((double*) &__grad_x)[1];
	grad.x += ((double*) &__grad_x)[2];
	grad.x += ((double*) &__grad_x)[3];
	//
	grad.y = ((double*) &__grad_y)[0];
	grad.y += ((double*) &__grad_y)[1];
	grad.y += ((double*) &__grad_y)[2];
	grad.y += ((double*) &__grad_y)[3];
	//
	// end of peeling
	//
	if (nhalos%4 > 0)
	{
	struct point grad_peel;
	grad_peel = module_potentialDerivatives_totalGradient_5_SOA(pImage, lens, i, nhalos%4);
	//
	grad.x += grad_peel.x;
	grad.y += grad_peel.y;
	}
	//


	return grad;
	}
	//
	//
	//
	struct point module_potentialDerivatives_totalGradient_5_SOA_AVX_v2(const struct point pImage, const struct Potential_SOA lens, int shalos, int nhalos)
	{
	asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
	//
	struct point grad, clumpgrad;
	grad.x = 0;
	grad.y = 0;
	//
	// smearing the image coordinates on registers
	__m256d image_x = _mm256_set1_pd(pImage->x);
	__m256d image_y = _mm256_set1_pd(pImage->y);
	//
	__m256d __grad_x = _mm256_set1_pd(0.);
	__m256d __grad_y = _mm256_set1_pd(0.);
	//
	__m256d __result_x = _mm256_set1_pd(0.);
	__m256d __result_y = _mm256_set1_pd(0.);
	//
	__m256d one = _mm256_set1_pd( 1. );
	//
	int i;
	int imax = shalos + nhalos;
	//#pragma unroll
	for(i = shalos; i < imax - imax%4; i = i + 4)
	{
	//
	struct point true_coord, true_coord_rotation;
	//
	__m256d b0 = _mm256_loadu_pd(&lens->b0[i]);
	__m256d theta = _mm256_loadu_pd(&lens->ellipticity_angle[i]);
	__m256d ell_pot = _mm256_loadu_pd(&lens->ellipticity_potential[i]);
	//
	//true_coord.x = pImage->x - lens->position_x[i];
	//true_coord.y = pImage->y - lens->position_y[i];
	//
	__m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i]));
	__m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i]));
	//
	/positionning at the potential center/
	__m256d cos_theta = _mm256_loadu_pd(&lens->anglecos[i]);
	__m256d sin_theta = _mm256_loadu_pd(&lens->anglesin[i]);
	//
	__m256d x = _mm256_add_pd(_mm256_mul_pd(true_coord_x, cos_theta), _mm256_mul_pd(true_coord_y, sin_theta));
	__m256d y = _mm256_sub_pd(_mm256_mul_pd(true_coord_y, cos_theta), _mm256_mul_pd(true_coord_x, sin_theta));
	//
	//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
	//
	//
	__m256d _R = __INV(__SQRT(ADD(MUL(x, MUL(x, SUB(one, ell_pot))), MUL(y, MUL(y, ADD(one, ell_pot))))));
	//
	__result_x = MUL(MUL(SUB(one, ell_pot), b0), MUL(x, _R));
	__result_y = MUL(MUL(ADD(one, ell_pot), b0), MUL(y, _R));
	//
	__grad_x = __grad_x + _mm256_sub_pd(_mm256_mul_pd(__result_x, cos_theta), _mm256_mul_pd(__result_y, sin_theta));
	__grad_y = __grad_y + _mm256_add_pd(_mm256_mul_pd(__result_y, cos_theta), _mm256_mul_pd(__result_x, sin_theta));
	//
	}
	//
	grad.x = ((double*) &__grad_x)[0];
	grad.x += ((double*) &__grad_x)[1];
	grad.x += ((double*) &__grad_x)[2];
	grad.x += ((double*) &__grad_x)[3];
	//
	grad.y = ((double*) &__grad_y)[0];
	grad.y += ((double*) &__grad_y)[1];
	grad.y += ((double*) &__grad_y)[2];
	grad.y += ((double*) &__grad_y)[3];
	//
	// end of peeling
	//
	if (nhalos%4 > 0)
	{
	struct point grad_peel;
	grad_peel = module_potentialDerivatives_totalGradient_5_SOA(pImage, lens, i, nhalos%4);
	//
	grad.x += grad_peel.x;
	grad.y += grad_peel.y;
	}
	//


	return grad;
	}
	//
	//
	//
	#if 0
	struct point module_potentialDerivatives_totalGradient_5_SOA_AVX(const struct point pImage, const struct Potential_SOA lens, int shalos, int nhalos)
	{
	asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
	printf("# module_potentialDerivatives_totalGradient_SIS_SOA_AVX begins\n");
	//
	struct point grad, result;
	//
	grad.x = 0;
	grad.y = 0;
	//
	for(int i = shalos; i < shalos + nhalos; i++)
	{
	//
	struct point true_coord, true_coord_rotation;
	//
	true_coord.x = pImage->x - lens->position_x[i];
	true_coord.y = pImage->y - lens->position_y[i];
	//
	//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
	double cose = lens->anglecos[i];
	double sine = lens->anglesin[i];
	//
	double x = true_coord.xcose + true_coord.ysine;
	double y = true_coord.ycose - true_coord.xsine;
	//:
	double R = sqrt(xx(1 - lens->ellipticity[i]/3.) + yy(1 + lens->ellipticity[i]/3.));
	result.x = (1 - lens->ellipticity[i]/3.)lens->b0[i]x/R;
	result.y = (1 + lens->ellipticity[i]/3.)lens->b0[i]y/R;
	//
	grad.x += result.xcose - result.ysine;
	grad.y += result.ycose + result.xsine;
	}
	return grad;
	}
	#endif
	//
	//
	//
	struct point module_potentialDerivatives_totalGradient_8_SOA_AVX(const struct point pImage, const struct Potential_SOA lens, const int shalos, const int nhalos)
	{
	asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_AVX begins");
	//
	struct point grad, clumpgrad;
	grad.x = 0;
	grad.y = 0;
	//
	// smearing the image coordinates on registers
	__m256d image_x = _mm256_set1_pd(pImage->x);
	__m256d image_y = _mm256_set1_pd(pImage->y);
	//
	__m256d __grad_x = _mm256_set1_pd(0.);
	__m256d __grad_y = _mm256_set1_pd(0.);
	//
	int i;
	int imax = shalos + nhalos;
	//#pragma unroll
	for(i = shalos; i < imax - imax%4; i = i + 4)
	{
	//IACA_START;
	//
	__m256d two = _mm256_set1_pd( 2. );
	__m256d one = _mm256_set1_pd( 1. );
	__m256d zero = _mm256_set1_pd( 0. );
	__m256d half = _mm256_set1_pd( 0.5);
	__m256d mhalf = _mm256_set1_pd(-0.5);
	//
	// 2 loads
	__m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i]));
	__m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i]));
	// 2 loafs
	__m256d rc = _mm256_loadu_pd(&lens->rcore[i]);
	__m256d b0 = _mm256_loadu_pd(&lens->b0[i]);
	// 2 loads
	__m256d eps = _mm256_loadu_pd(&lens->ellipticity_potential[i]);
	//
	__m256d one_minus_eps = _mm256_sub_pd(one, eps);
	__m256d one_plus_eps = _mm256_add_pd(one, eps);
	__m256d one_plus_eps_rcp = __INV(one_plus_eps);
	// 1 load
	__m256d theta = _mm256_loadu_pd(&lens->ellipticity_angle[i]);
	/positionning at the potential center/
	__m256d cos_theta;
	__m256d sin_theta = _mm256_sincos_pd(&cos_theta, theta);
	// rotation: 6 ops
	__m256d x = _mm256_add_pd(_mm256_mul_pd(true_coord_x, cos_theta), _mm256_mul_pd(true_coord_y, sin_theta));
	__m256d y = _mm256_sub_pd(_mm256_mul_pd(true_coord_y, cos_theta), _mm256_mul_pd(true_coord_x, sin_theta));
	//
	__m256d sqe = __SQRT(eps);
	//
	__m256d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops
	__m256d cxro = one_plus_epsone_plus_eps; // (1. + eps)(1. + eps); 3 ops
	__m256d cyro = one_minus_epsone_minus_eps; // (1. - eps)(1. - eps); 3 ops
	__m256d rem2 = xx__INV(cxro) + yy__INV(cyro); // xx/(cxro) + yy/(cyro); ~5 ops
	//
	__m256d zci_re = zero;
	__m256d zci_im = mhalf(one - epseps)*__INV(sqe); // ~4 ops
	__m256d znum_re = cx1*x;
	__m256d znum_im = twosqe__SQRT(rcrc + rem2) - y__INV(cx1); // ~4 ops
	//
	__m256d zden_re = x;
	__m256d zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rc, sqe));
	//
	zden_im = _mm256_sub_pd(zden_im, y);
	__m256d norm = (zden_rezden_re + zden_imzden_im);
	__m256d zis_re = (znum_rezden_re + znum_imzden_im)*__INV(norm); // 3 ops
	__m256d zis_im = (znum_imzden_re - znum_rezden_im)*__INV(norm); // 3 ops
	//
	norm = zis_re;
	//
	zis_re = _mm256_log_pd(__SQRT(normnorm + zis_imzis_im)); // 3 ops// ln(zis) = ln(\|zis\|)+i.Arg(zis)
	zis_re = _mm256_log_pd(__SQRT(normnorm + zis_imzis_im)); // 3 ops// ln(zis) = ln(\|zis\|)+i.Arg(zis)
	zis_im = _mm256_atan2_pd(zis_im, norm); //
	//
	__m256d zres_re = zci_rezis_re - zci_imzis_im; // Re( zci*ln(zis) ) 3 ops
	__m256d zres_im = zci_imzis_re + zis_imzci_re; // Im( zci*ln(zis) ) 3 ops
	//
	zis_re = b0*zres_re;
	zis_im = b0*zres_im;
	//
	sin_theta = _mm256_sincos_pd(&cos_theta, zero - theta);
	// rotation: 6 ops
	__grad_x = __grad_x + _mm256_add_pd(_mm256_mul_pd(zis_re, cos_theta), _mm256_mul_pd(zis_im, sin_theta));
	__grad_y = __grad_y + _mm256_sub_pd(_mm256_mul_pd(zis_im, cos_theta), _mm256_mul_pd(zis_re, sin_theta));
	//
	}
	//
	grad.x = ((double*) &__grad_x)[0];
	grad.x += ((double*) &__grad_x)[1];
	grad.x += ((double*) &__grad_x)[2];
	grad.x += ((double*) &__grad_x)[3];
	//
	grad.y = ((double*) &__grad_y)[0];
	grad.y += ((double*) &__grad_y)[1];
	grad.y += ((double*) &__grad_y)[2];
	grad.y += ((double*) &__grad_y)[3];
	//
	// end of peeling
	//
	if (nhalos%4 > 0)
	{
	struct point grad_peel;
	grad_peel = module_potentialDerivatives_totalGradient_8_SOA(pImage, lens, i, nhalos%4);
	//
	grad.x += grad_peel.x;
	grad.y += grad_peel.y;
	}
	//
	return(grad);
	}
	//
	//
	//
	struct point module_potentialDerivatives_totalGradient_8_SOA_AVX_v2(const struct point pImage, const struct Potential_SOA lens, const int shalos, const int nhalos)
	{
	asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_AVX begins");
	//
	struct point grad, clumpgrad;
	grad.x = 0;
	grad.y = 0;
	//
	// smearing the image coordinates on registers
	__m256d image_x = _mm256_set1_pd(pImage->x);
	__m256d image_y = _mm256_set1_pd(pImage->y);
	//
	__m256d __grad_x = _mm256_set1_pd(0.);
	__m256d __grad_y = _mm256_set1_pd(0.);
	//
	__m256d two = _mm256_set1_pd( 2. );
	__m256d one = _mm256_set1_pd( 1. );
	__m256d zero = _mm256_set1_pd( 0. );
	__m256d half = _mm256_set1_pd( 0.5);
	__m256d mhalf = _mm256_set1_pd(-0.5);
	//
	int i;
	int imax = shalos + nhalos;
	//#pragma unroll
	for(i = shalos; i < imax - imax%4; i = i + 4)
	{
	//printf("i = %d lens = %p\n", i, &lens->anglecos[i]);fflush(stdout);
	//IACA_START;
	//
	//
	// 2 loads
	__m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i]));
	__m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i]));
	// 2 loafs
	__m256d rc = _mm256_loadu_pd(&lens->rcore[i]);
	__m256d b0 = _mm256_loadu_pd(&lens->b0[i]);
	// 2 loads
	__m256d eps = _mm256_loadu_pd(&lens->ellipticity_potential[i]);
	//
	__m256d one_minus_eps = _mm256_sub_pd(one, eps);
	__m256d one_plus_eps = _mm256_add_pd(one, eps);
	__m256d one_plus_eps_rcp = __INV(one_plus_eps);
	// 2 load
	__m256d cose = _mm256_loadu_pd(&lens->anglecos[i]);
	__m256d sine = _mm256_loadu_pd(&lens->anglesin[i]);
	//
	__m256d x = ADD(MUL(true_coord_x, cose), MUL(true_coord_y, sine));
	__m256d y = SUB(MUL(true_coord_y, cose), MUL(true_coord_x, sine));
	//
	__m256d sqe = __SQRT(eps);
	//
	__m256d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops
	__m256d cxro = one_plus_epsone_plus_eps; // (1. + eps)(1. + eps); 3 ops
	__m256d cyro = one_minus_epsone_minus_eps; // (1. - eps)(1. - eps); 3 ops
	__m256d rem2 = xx__INV(cxro) + yy__INV(cyro); // xx/(cxro) + yy/(cyro); ~5 ops
	//
	__m256d zci_re = zero;
	__m256d zci_im = mhalf(one - epseps)*__INV(sqe); // ~4 ops
	__m256d znum_re = cx1*x;
	__m256d znum_im = twosqe__SQRT(rcrc + rem2) - y__INV(cx1); // ~4 ops
	//
	__m256d zden_re = x;
	__m256d zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rc, sqe));
	//
	zden_im = _mm256_sub_pd(zden_im, y);
	//
	__m256d norm = (zden_rezden_re + zden_imzden_im);
	__m256d zis_re = (znum_rezden_re + znum_imzden_im)*__INV(norm); // 3 ops
	__m256d zis_im = (znum_imzden_re - znum_rezden_im)*__INV(norm); // 3 ops
	//
	norm = zis_re;
	//
	zis_re = _mm256_log_pd(__SQRT(normnorm + zis_imzis_im)); // 3 ops// ln(zis) = ln(\|zis\|)+i.Arg(zis)
	zis_re = _mm256_log_pd(__SQRT(normnorm + zis_imzis_im)); // 3 ops// ln(zis) = ln(\|zis\|)+i.Arg(zis)
	zis_im = _mm256_atan2_pd(zis_im, norm); //
	//
	__m256d zres_re = zci_rezis_re - zci_imzis_im; // Re( zci*ln(zis) ) 3 ops
	__m256d zres_im = zci_imzis_re + zis_imzci_re; // Im( zci*ln(zis) ) 3 ops
	//
	zis_re = b0*zres_re;
	zis_im = b0*zres_im;
	//
	__grad_x = __grad_x + SUB(MUL(zis_re, cose), MUL(zis_im, sine));
	__grad_y = __grad_y + ADD(MUL(zis_im, cose), MUL(zis_re, sine));
	//
	}
	//
	grad.x = ((double*) &__grad_x)[0];
	grad.x += ((double*) &__grad_x)[1];
	grad.x += ((double*) &__grad_x)[2];
	grad.x += ((double*) &__grad_x)[3];
	//
	grad.y = ((double*) &__grad_y)[0];
	grad.y += ((double*) &__grad_y)[1];
	grad.y += ((double*) &__grad_y)[2];
	grad.y += ((double*) &__grad_y)[3];
	//
	// end of peeling
	//
	if (nhalos%4 > 0)
	{
	struct point grad_peel;
	grad_peel = module_potentialDerivatives_totalGradient_8_SOA(pImage, lens, i, nhalos%4);
	//
	grad.x += grad_peel.x;
	grad.y += grad_peel.y;
	}
	//
	return(grad);
	}
	//
	//
	//
	struct point module_potentialDerivatives_totalGradient_81_SOA_AVX(const struct point pImage, const struct Potential_SOA lens, const int shalos, const int nhalos)
	{
	struct point grad, clumpgrad;
	grad.x = 0;
	grad.y = 0;
	//
	// smearing the image coordinates on registers
	__m256d image_x = _mm256_set1_pd(pImage->x);
	__m256d image_y = _mm256_set1_pd(pImage->y);
	//
	__m256d __grad_x = _mm256_set1_pd(0.);
	__m256d __grad_y = _mm256_set1_pd(0.);
	//
	int i;
	int imax = shalos + nhalos;
	//#pragma unroll
	for(i = shalos; i < imax - imax%4; i = i + 4)
	//for(i = 0; i < nhalos - nhalos%4; i = i + 4)
	{
	//IACA_START;
	//
	__m256d two = _mm256_set1_pd( 2. );
	__m256d one = _mm256_set1_pd( 1. );
	__m256d zero = _mm256_set1_pd( 0. );
	__m256d half = _mm256_set1_pd( 0.5);
	__m256d mhalf = _mm256_set1_pd(-0.5);
	//
	// 2 loads
	__m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i]));
	__m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i]));
	// 2 loads
	__m256d rc = _mm256_loadu_pd(&lens->rcore[i]);
	__m256d rcut = _mm256_loadu_pd(&lens->rcut[i]);
	__m256d b0 = _mm256_loadu_pd(&lens->b0[i]);
	__m256d t05 = b0rcut__INV(rcut - rc);
	//__m256d t05 = b0; //rcut__INV(rcut - rc);
	// 2 loads
	__m256d eps = _mm256_loadu_pd(&lens->ellipticity_potential[i]);
	//
	__m256d one_minus_eps = _mm256_sub_pd(one, eps);
	__m256d one_plus_eps = _mm256_add_pd(one, eps);
	__m256d one_plus_eps_rcp = __INV(one_plus_eps);
	// 1 load
	__m256d theta = _mm256_loadu_pd(&lens->ellipticity_angle[i]);
	/positionning at the potential center/
	//__m256d cos_theta = _mm256_cos_pd(theta);
	//__m256d sin_theta = _mm256_sin_pd(theta);
	__m256d cos_theta;
	__m256d sin_theta = _mm256_sincos_pd(&cos_theta, theta);
	// rotation: 6 ops
	__m256d x = _mm256_add_pd(_mm256_mul_pd(true_coord_x, cos_theta), _mm256_mul_pd(true_coord_y, sin_theta));
	__m256d y = _mm256_sub_pd(_mm256_mul_pd(true_coord_y, cos_theta), _mm256_mul_pd(true_coord_x, sin_theta));
	//
	__m256d sqe = __SQRT(eps);
	//
	__m256d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops
	__m256d cxro = one_plus_epsone_plus_eps; // (1. + eps)(1. + eps); 3 ops
	__m256d cyro = one_minus_epsone_minus_eps; // (1. - eps)(1. - eps); 3 ops
	__m256d rem2 = xx__INV(cxro) + yy__INV(cyro); // xx/(cxro) + yy/(cyro); ~5 ops
	//
	__m256d zci_re = zero;
	__m256d zci_im = mhalf(one - epseps)*__INV(sqe); // ~4 ops
	//
	__m256d znum_re, znum_im;
	__m256d zden_re = zero, zden_im = zero;
	__m256d norm;
	//
	__m256d zis_re = zero, zis_im = zero;
	__m256d zres_rc_re = zero, zres_rc_im = zero;
	__m256d zres_rcut_re = zero, zres_rcut_im = zero;
	//
	// step 1
	//
	{
	// 2.sqesqrt(rc*rc + rem2) - y/cx1, 7 ops
	znum_re = cx1*x;
	znum_im = twosqe__SQRT(rcrc + rem2) - y__INV(cx1); // ~4 ops
	//
	zden_re = x;
	zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rc, sqe));
	//
	zden_im = _mm256_sub_pd(zden_im, y);
	//norm = (zden.rezden.re + zden.imzden.im); 3 ops
	norm = (zden_rezden_re + zden_imzden_im);
	zis_re = (znum_rezden_re + znum_imzden_im)*__INV(norm); // 3 ops
	zis_im = (znum_imzden_re - znum_rezden_im)*__INV(norm); // 3 ops
	//
	norm = zis_re;
	//
	zis_re = _mm256_log_pd(__SQRT(normnorm + zis_imzis_im)); // 3 ops// ln(zis) = ln(\|zis\|)+i.Arg(zis)
	zis_im = _mm256_atan2_pd(zis_im, norm); //
	//
	zres_rc_re = zci_rezis_re - zci_imzis_im; // Re( zci*ln(zis) ) 3 ops
	zres_rc_im = zci_imzis_re + zis_imzci_re; // Im( zci*ln(zis) ) 3 ops
	}
	//
	// step 2
	//
	{
	// 2.sqesqrt(rcut*rcut + rem2) - y/cx1, 7 ops
	znum_re = cx1*x;
	znum_im = twosqe__SQRT(rcutrcut + rem2) - y__INV(cx1); // ~4 ops
	//
	zden_re = x;
	zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rcut, sqe));
	zden_im = _mm256_sub_pd(zden_im, y);
	//
	norm = (zden_rezden_re + zden_imzden_im); // 3 ops
	//
	zis_re = (znum_rezden_re + znum_imzden_im)*__INV(norm); // 3 ops
	zis_im = (znum_imzden_re - znum_rezden_im)*__INV(norm); // 3 ops
	//
	norm = zis_re;
	//
	zis_re = _mm256_log_pd(__SQRT(normnorm + zis_imzis_im)); // 3 ops// ln(zis) = ln(\|zis\|)+i.Arg(zis)
	zis_im = _mm256_atan2_pd(zis_im, norm); //
	//
	zres_rcut_re = (zci_rezis_re - zci_imzis_im); // Re( zci*ln(zis) ) 3 ops
	zres_rcut_im = (zci_imzis_re + zis_imzci_re); // Im( zci*ln(zis) ) 3 ops
	}
	//
	// postlogue
	//
	zis_re = t05*(zres_rc_re - zres_rcut_re);
	zis_im = t05*(zres_rc_im - zres_rcut_im);
	//
	//cos_theta = _mm256_cos_pd(zero - theta);
	//sin_theta = _mm256_sin_pd(zero - theta);
	sin_theta = _mm256_sincos_pd(&cos_theta, zero - theta);
	// rotation: 6 ops
	__grad_x = __grad_x + _mm256_add_pd(_mm256_mul_pd(zis_re, cos_theta), _mm256_mul_pd(zis_im, sin_theta));
	__grad_y = __grad_y + _mm256_sub_pd(_mm256_mul_pd(zis_im, cos_theta), _mm256_mul_pd(zis_re, sin_theta));
	}
	//
	//
	//
	grad.x = ((double*) &__grad_x)[0];
	grad.x += ((double*) &__grad_x)[1];
	grad.x += ((double*) &__grad_x)[2];
	grad.x += ((double*) &__grad_x)[3];
	//
	grad.y = ((double*) &__grad_y)[0];
	grad.y += ((double*) &__grad_y)[1];
	grad.y += ((double*) &__grad_y)[2];
	grad.y += ((double*) &__grad_y)[3];
	//
	// end of peeling
	/*
	if (nhalos%4 > 0)
	{
	struct point grad_peel;
	grad_peel = module_potentialDerivatives_totalGradient_SOA(pImage, lens, i, nhalos);
	//grad_peel = rotateCoordinateSystem(grad_peel, -theta);
	//
	grad.x += grad_peel.x;
	grad.y += grad_peel.y;
	}
	*/
	for (; i < nhalos; ++i)
	{
	struct point true_coord;
	true_coord.x = pImage->x - lens->position_x[i];
	true_coord.y = pImage->y - lens->position_y[i];
	//printf("x, y = %f, %f\n", lens->position.x, lens->position.y);
	struct point true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
	//
	complex zis, zis_cut;
	double t05 = lens->b0[i]*lens->rcut[i]/(lens->rcut[i] - lens->rcore[i]);
	zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential[i], lens->rcore[i]);
	//
	zis_cut = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential[i], lens->rcut[i]);

	struct point clumpgrad;
	clumpgrad.x = t05*(zis.re - zis_cut.re);
	clumpgrad.y = t05*(zis.im - zis_cut.im);
	clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]);
	//
	grad.x += clumpgrad.x;
	grad.y += clumpgrad.y;
	}
	//IACA_END;
	//
	return(grad);
	}


	typedef struct point (halo_func_avx_t) (const struct point pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
	halo_func_avx_t halo_func_avx[100] =
	{
	0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_AVX, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_AVX, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, module_potentialDerivatives_totalGradient_81_SOA_AVX, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
	//
	//
	//
	struct point module_potentialDerivatives_totalGradient_SOA_AVX(const struct point pImage, const struct Potential_SOA lens, int nhalos)
	{
	struct point grad, clumpgrad;
	//
	grad.x = clumpgrad.x = 0;
	grad.y = clumpgrad.y = 0;
	//
	int shalos = 0;
	//
	/*
	int* p_type = &(lens->type)[0];
	int* lens_type = (int) malloc(nhalossizeof(int));
	memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
	*/
	//
	while (shalos < nhalos)
	{
	int lens_type = lens->type[shalos];
	int count = 1;
	while (lens->type[shalos + count] == lens_type) count++;
	//std::cout << "type = " << lens_type << " " << count << " " << shalos << std::endl;
	//
	clumpgrad = (*halo_func_avx[lens_type])(pImage, lens, shalos, count);
	//
	grad.x += clumpgrad.x;
	grad.y += clumpgrad.y;
	shalos += count;
	}

	return(grad);
	}

	#endif

gradient_avx.cppNo OneTemporaryActions

File Metadata

gradient_avx.cppView Options

Event Timeline

gradient_avx.cpp
No OneTemporary
Actions

gradient_avx.cpp
View Options