diff --git a/src/gradient_avx.cpp b/src/gradient_avx.cpp index 54bdc6f..874d87a 100644 --- a/src/gradient_avx.cpp +++ b/src/gradient_avx.cpp @@ -1,514 +1,509 @@ #include <iostream> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <immintrin.h> // #include "structure_hpc.h" #include "simd_math_avx.h" #include "gradient.hpp" // //#include "iacaMarks.h" // // // void print256(__m256d __reg) { printf("%f\n", ((double*) &__reg)[0]); printf("%f\n", ((double*) &__reg)[1]); printf("%f\n", ((double*) &__reg)[2]); printf("%f\n", ((double*) &__reg)[3]); } // // // // // // inline static void rotateCoordinateSystem_avx(__m256d Q_x, __m256d Q_y, const __m256d P_x, const __m256d P_y, double* angles) { // __m256d theta = _mm256_loadu_pd(angles); /*positionning at the potential center*/ __m256d cos_theta = _mm256_cos_pd(theta); __m256d sin_theta = _mm256_sin_pd(theta); // rotation: 6 ops Q_x = P_x*cos_theta + P_y*sin_theta; Q_y = P_y*cos_theta - P_x*sin_theta; } // // // struct point module_potentialDerivatives_totalGradient_8_SOA_AVX(const struct point *pImage, const struct Potential_SOA *lens, const int shalos, const int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_AVX begins"); // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // // smearing the image coordinates on registers __m256d image_x = _mm256_set1_pd(pImage->x); __m256d image_y = _mm256_set1_pd(pImage->y); // __m256d __grad_x = _mm256_set1_pd(0.); __m256d __grad_y = _mm256_set1_pd(0.); // int i; int imax = shalos + nhalos; //#pragma unroll for(i = shalos; i < imax - imax%4; i = i + 4) { //IACA_START; // __m256d two = _mm256_set1_pd( 2. ); __m256d one = _mm256_set1_pd( 1. ); __m256d zero = _mm256_set1_pd( 0. ); __m256d half = _mm256_set1_pd( 0.5); __m256d mhalf = _mm256_set1_pd(-0.5); // // 2 loads __m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i])); __m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i])); // 2 loafs __m256d rc = _mm256_loadu_pd(&lens->rcore[i]); __m256d b0 = _mm256_loadu_pd(&lens->b0[i]); // 2 loads __m256d eps = _mm256_loadu_pd(&lens->ellipticity_potential[i]); // __m256d one_minus_eps = _mm256_sub_pd(one, eps); __m256d one_plus_eps = _mm256_add_pd(one, eps); __m256d one_plus_eps_rcp = __INV(one_plus_eps); // 1 load __m256d theta = _mm256_loadu_pd(&lens->ellipticity_angle[i]); /*positionning at the potential center*/ __m256d cos_theta; __m256d sin_theta = _mm256_sincos_pd(&cos_theta, theta); // rotation: 6 ops __m256d x = _mm256_add_pd(_mm256_mul_pd(true_coord_x, cos_theta), _mm256_mul_pd(true_coord_y, sin_theta)); __m256d y = _mm256_sub_pd(_mm256_mul_pd(true_coord_y, cos_theta), _mm256_mul_pd(true_coord_x, sin_theta)); // __m256d sqe = __SQRT(eps); // __m256d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops __m256d cxro = one_plus_eps*one_plus_eps; // (1. + eps)*(1. + eps); 3 ops __m256d cyro = one_minus_eps*one_minus_eps; // (1. - eps)*(1. - eps); 3 ops __m256d rem2 = x*x*__INV(cxro) + y*y*__INV(cyro); // x*x/(cxro) + y*y/(cyro); ~5 ops // __m256d zci_re = zero; __m256d zci_im = mhalf*(one - eps*eps)*__INV(sqe); // ~4 ops __m256d znum_re = cx1*x; __m256d znum_im = two*sqe*__SQRT(rc*rc + rem2) - y*__INV(cx1); // ~4 ops // __m256d zden_re = x; __m256d zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rc, sqe)); // zden_im = _mm256_sub_pd(zden_im, y); __m256d norm = (zden_re*zden_re + zden_im*zden_im); __m256d zis_re = (znum_re*zden_re + znum_im*zden_im)*__INV(norm); // 3 ops __m256d zis_im = (znum_im*zden_re - znum_re*zden_im)*__INV(norm); // 3 ops // norm = zis_re; // zis_re = _mm256_log_pd(__SQRT(norm*norm + zis_im*zis_im)); // 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis) zis_re = _mm256_log_pd(__SQRT(norm*norm + zis_im*zis_im)); // 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis) zis_im = _mm256_atan2_pd(zis_im, norm); // // __m256d zres_re = zci_re*zis_re - zci_im*zis_im; // Re( zci*ln(zis) ) 3 ops __m256d zres_im = zci_im*zis_re + zis_im*zci_re; // Im( zci*ln(zis) ) 3 ops // zis_re = b0*zres_re; zis_im = b0*zres_im; // sin_theta = _mm256_sincos_pd(&cos_theta, zero - theta); // rotation: 6 ops __grad_x = __grad_x + _mm256_add_pd(_mm256_mul_pd(zis_re, cos_theta), _mm256_mul_pd(zis_im, sin_theta)); __grad_y = __grad_y + _mm256_sub_pd(_mm256_mul_pd(zis_im, cos_theta), _mm256_mul_pd(zis_re, sin_theta)); // } // grad.x = ((double*) &__grad_x)[0]; grad.x += ((double*) &__grad_x)[1]; grad.x += ((double*) &__grad_x)[2]; grad.x += ((double*) &__grad_x)[3]; // grad.y = ((double*) &__grad_y)[0]; grad.y += ((double*) &__grad_y)[1]; grad.y += ((double*) &__grad_y)[2]; grad.y += ((double*) &__grad_y)[3]; // // end of peeling // if (nhalos%4 > 0) { struct point grad_peel; grad_peel = module_potentialDerivatives_totalGradient_8_SOA(pImage, lens, i, nhalos%4); // grad.x += grad_peel.x; grad.y += grad_peel.y; } // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_8_SOA_AVX_v2(const struct point *pImage, const struct Potential_SOA *lens, const int shalos, const int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_AVX begins"); // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // // smearing the image coordinates on registers __m256d image_x = _mm256_set1_pd(pImage->x); __m256d image_y = _mm256_set1_pd(pImage->y); // __m256d __grad_x = _mm256_set1_pd(0.); __m256d __grad_y = _mm256_set1_pd(0.); // __m256d two = _mm256_set1_pd( 2. ); __m256d one = _mm256_set1_pd( 1. ); __m256d zero = _mm256_set1_pd( 0. ); __m256d half = _mm256_set1_pd( 0.5); __m256d mhalf = _mm256_set1_pd(-0.5); // int i; int imax = shalos + nhalos; //#pragma unroll for(i = shalos; i < imax - imax%4; i = i + 4) { //printf("i = %d lens = %p\n", i, &lens->anglecos[i]);fflush(stdout); //IACA_START; // // // 2 loads __m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i])); __m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i])); // 2 loafs __m256d rc = _mm256_loadu_pd(&lens->rcore[i]); __m256d b0 = _mm256_loadu_pd(&lens->b0[i]); // 2 loads __m256d eps = _mm256_loadu_pd(&lens->ellipticity_potential[i]); // __m256d one_minus_eps = _mm256_sub_pd(one, eps); __m256d one_plus_eps = _mm256_add_pd(one, eps); __m256d one_plus_eps_rcp = __INV(one_plus_eps); - // 1 load + // 2 load __m256d cose = _mm256_loadu_pd(&lens->anglecos[i]); __m256d sine = _mm256_loadu_pd(&lens->anglesin[i]); // __m256d x = ADD(MUL(true_coord_x, cose), MUL(true_coord_y, sine)); - __m256d two = _mm256_set1_pd( 2. ); - __m256d one = _mm256_set1_pd( 1. ); - __m256d zero = _mm256_set1_pd( 0. ); - __m256d half = _mm256_set1_pd( 0.5); - __m256d mhalf = _mm256_set1_pd(-0.5); __m256d y = SUB(MUL(true_coord_y, cose), MUL(true_coord_x, sine)); // __m256d sqe = __SQRT(eps); // __m256d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops __m256d cxro = one_plus_eps*one_plus_eps; // (1. + eps)*(1. + eps); 3 ops __m256d cyro = one_minus_eps*one_minus_eps; // (1. - eps)*(1. - eps); 3 ops __m256d rem2 = x*x*__INV(cxro) + y*y*__INV(cyro); // x*x/(cxro) + y*y/(cyro); ~5 ops // __m256d zci_re = zero; __m256d zci_im = mhalf*(one - eps*eps)*__INV(sqe); // ~4 ops __m256d znum_re = cx1*x; __m256d znum_im = two*sqe*__SQRT(rc*rc + rem2) - y*__INV(cx1); // ~4 ops // __m256d zden_re = x; __m256d zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rc, sqe)); // zden_im = _mm256_sub_pd(zden_im, y); // __m256d norm = (zden_re*zden_re + zden_im*zden_im); __m256d zis_re = (znum_re*zden_re + znum_im*zden_im)*__INV(norm); // 3 ops __m256d zis_im = (znum_im*zden_re - znum_re*zden_im)*__INV(norm); // 3 ops // norm = zis_re; // zis_re = _mm256_log_pd(__SQRT(norm*norm + zis_im*zis_im)); // 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis) zis_re = _mm256_log_pd(__SQRT(norm*norm + zis_im*zis_im)); // 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis) zis_im = _mm256_atan2_pd(zis_im, norm); // // __m256d zres_re = zci_re*zis_re - zci_im*zis_im; // Re( zci*ln(zis) ) 3 ops __m256d zres_im = zci_im*zis_re + zis_im*zci_re; // Im( zci*ln(zis) ) 3 ops // zis_re = b0*zres_re; zis_im = b0*zres_im; // __grad_x = __grad_x + SUB(MUL(zis_re, cose), MUL(zis_im, sine)); __grad_y = __grad_y + ADD(MUL(zis_im, cose), MUL(zis_re, sine)); // } // grad.x = ((double*) &__grad_x)[0]; grad.x += ((double*) &__grad_x)[1]; grad.x += ((double*) &__grad_x)[2]; grad.x += ((double*) &__grad_x)[3]; // grad.y = ((double*) &__grad_y)[0]; grad.y += ((double*) &__grad_y)[1]; grad.y += ((double*) &__grad_y)[2]; grad.y += ((double*) &__grad_y)[3]; // // end of peeling // if (nhalos%4 > 0) { struct point grad_peel; grad_peel = module_potentialDerivatives_totalGradient_8_SOA(pImage, lens, i, nhalos%4); // grad.x += grad_peel.x; grad.y += grad_peel.y; } // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_81_SOA_AVX(const struct point *pImage, const struct Potential_SOA *lens, const int shalos, const int nhalos) { struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // // smearing the image coordinates on registers __m256d image_x = _mm256_set1_pd(pImage->x); __m256d image_y = _mm256_set1_pd(pImage->y); // __m256d __grad_x = _mm256_set1_pd(0.); __m256d __grad_y = _mm256_set1_pd(0.); // int i; int imax = shalos + nhalos; //#pragma unroll for(i = shalos; i < imax - imax%4; i = i + 4) //for(i = 0; i < nhalos - nhalos%4; i = i + 4) { //IACA_START; // __m256d two = _mm256_set1_pd( 2. ); __m256d one = _mm256_set1_pd( 1. ); __m256d zero = _mm256_set1_pd( 0. ); __m256d half = _mm256_set1_pd( 0.5); __m256d mhalf = _mm256_set1_pd(-0.5); // // 2 loads __m256d true_coord_x = _mm256_sub_pd(image_x, _mm256_loadu_pd(&lens->position_x[i])); __m256d true_coord_y = _mm256_sub_pd(image_y, _mm256_loadu_pd(&lens->position_y[i])); // 2 loads __m256d rc = _mm256_loadu_pd(&lens->rcore[i]); __m256d rcut = _mm256_loadu_pd(&lens->rcut[i]); __m256d b0 = _mm256_loadu_pd(&lens->b0[i]); __m256d t05 = b0*rcut*__INV(rcut - rc); //__m256d t05 = b0; //*rcut*__INV(rcut - rc); // 2 loads __m256d eps = _mm256_loadu_pd(&lens->ellipticity_potential[i]); // __m256d one_minus_eps = _mm256_sub_pd(one, eps); __m256d one_plus_eps = _mm256_add_pd(one, eps); __m256d one_plus_eps_rcp = __INV(one_plus_eps); // 1 load __m256d theta = _mm256_loadu_pd(&lens->ellipticity_angle[i]); /*positionning at the potential center*/ //__m256d cos_theta = _mm256_cos_pd(theta); //__m256d sin_theta = _mm256_sin_pd(theta); __m256d cos_theta; __m256d sin_theta = _mm256_sincos_pd(&cos_theta, theta); // rotation: 6 ops __m256d x = _mm256_add_pd(_mm256_mul_pd(true_coord_x, cos_theta), _mm256_mul_pd(true_coord_y, sin_theta)); __m256d y = _mm256_sub_pd(_mm256_mul_pd(true_coord_y, cos_theta), _mm256_mul_pd(true_coord_x, sin_theta)); // __m256d sqe = __SQRT(eps); // __m256d cx1 = one_minus_eps*one_plus_eps_rcp; // (1. - eps)/(1. + eps); 3 ops __m256d cxro = one_plus_eps*one_plus_eps; // (1. + eps)*(1. + eps); 3 ops __m256d cyro = one_minus_eps*one_minus_eps; // (1. - eps)*(1. - eps); 3 ops __m256d rem2 = x*x*__INV(cxro) + y*y*__INV(cyro); // x*x/(cxro) + y*y/(cyro); ~5 ops // __m256d zci_re = zero; __m256d zci_im = mhalf*(one - eps*eps)*__INV(sqe); // ~4 ops // __m256d znum_re, znum_im; __m256d zden_re = zero, zden_im = zero; __m256d norm; // __m256d zis_re = zero, zis_im = zero; __m256d zres_rc_re = zero, zres_rc_im = zero; __m256d zres_rcut_re = zero, zres_rcut_im = zero; // // step 1 // { // 2.*sqe*sqrt(rc*rc + rem2) - y/cx1, 7 ops znum_re = cx1*x; znum_im = two*sqe*__SQRT(rc*rc + rem2) - y*__INV(cx1); // ~4 ops // zden_re = x; zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rc, sqe)); // zden_im = _mm256_sub_pd(zden_im, y); //norm = (zden.re*zden.re + zden.im*zden.im); 3 ops norm = (zden_re*zden_re + zden_im*zden_im); zis_re = (znum_re*zden_re + znum_im*zden_im)*__INV(norm); // 3 ops zis_im = (znum_im*zden_re - znum_re*zden_im)*__INV(norm); // 3 ops // norm = zis_re; // zis_re = _mm256_log_pd(__SQRT(norm*norm + zis_im*zis_im)); // 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis) zis_im = _mm256_atan2_pd(zis_im, norm); // // zres_rc_re = zci_re*zis_re - zci_im*zis_im; // Re( zci*ln(zis) ) 3 ops zres_rc_im = zci_im*zis_re + zis_im*zci_re; // Im( zci*ln(zis) ) 3 ops } // // step 2 // { // 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1, 7 ops znum_re = cx1*x; znum_im = two*sqe*__SQRT(rcut*rcut + rem2) - y*__INV(cx1); // ~4 ops // zden_re = x; zden_im = _mm256_mul_pd(_mm256_set1_pd(2.), _mm256_mul_pd(rcut, sqe)); zden_im = _mm256_sub_pd(zden_im, y); // norm = (zden_re*zden_re + zden_im*zden_im); // 3 ops // zis_re = (znum_re*zden_re + znum_im*zden_im)*__INV(norm); // 3 ops zis_im = (znum_im*zden_re - znum_re*zden_im)*__INV(norm); // 3 ops // norm = zis_re; // zis_re = _mm256_log_pd(__SQRT(norm*norm + zis_im*zis_im)); // 3 ops// ln(zis) = ln(|zis|)+i.Arg(zis) zis_im = _mm256_atan2_pd(zis_im, norm); // // zres_rcut_re = (zci_re*zis_re - zci_im*zis_im); // Re( zci*ln(zis) ) 3 ops zres_rcut_im = (zci_im*zis_re + zis_im*zci_re); // Im( zci*ln(zis) ) 3 ops } // // postlogue // zis_re = t05*(zres_rc_re - zres_rcut_re); zis_im = t05*(zres_rc_im - zres_rcut_im); // //cos_theta = _mm256_cos_pd(zero - theta); //sin_theta = _mm256_sin_pd(zero - theta); sin_theta = _mm256_sincos_pd(&cos_theta, zero - theta); // rotation: 6 ops __grad_x = __grad_x + _mm256_add_pd(_mm256_mul_pd(zis_re, cos_theta), _mm256_mul_pd(zis_im, sin_theta)); __grad_y = __grad_y + _mm256_sub_pd(_mm256_mul_pd(zis_im, cos_theta), _mm256_mul_pd(zis_re, sin_theta)); } // // // grad.x = ((double*) &__grad_x)[0]; grad.x += ((double*) &__grad_x)[1]; grad.x += ((double*) &__grad_x)[2]; grad.x += ((double*) &__grad_x)[3]; // grad.y = ((double*) &__grad_y)[0]; grad.y += ((double*) &__grad_y)[1]; grad.y += ((double*) &__grad_y)[2]; grad.y += ((double*) &__grad_y)[3]; // // end of peeling /* if (nhalos%4 > 0) { struct point grad_peel; grad_peel = module_potentialDerivatives_totalGradient_SOA(pImage, lens, i, nhalos); //grad_peel = rotateCoordinateSystem(grad_peel, -theta); // grad.x += grad_peel.x; grad.y += grad_peel.y; } */ for (; i < nhalos; ++i) { struct point true_coord; true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //printf("x, y = %f, %f\n", lens->position.x, lens->position.y); struct point true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // complex zis, zis_cut; double t05 = lens->b0[i]*lens->rcut[i]/(lens->rcut[i] - lens->rcore[i]); zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential[i], lens->rcore[i]); // zis_cut = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential[i], lens->rcut[i]); struct point clumpgrad; clumpgrad.x = t05*(zis.re - zis_cut.re); clumpgrad.y = t05*(zis.im - zis_cut.im); clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; } //IACA_END; // return(grad); } typedef struct point (*halo_func_avx_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); halo_func_avx_t halo_func_avx[100] = { -0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_AVX_v2, 0, +0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_AVX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_AVX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // struct point module_potentialDerivatives_totalGradient_SOA_AVX(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; // int shalos = 0; // /* int* p_type = &(lens->type)[0]; int* lens_type = (int*) malloc(nhalos*sizeof(int)); memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); */ // while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; //std::cout << "type = " << lens_type << " " << count << " " << shalos << std::endl; // clumpgrad = (*halo_func_avx[lens_type])(pImage, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); } diff --git a/src/gradient_avx.hpp b/src/gradient_avx.hpp index 107ee89..627e350 100644 --- a/src/gradient_avx.hpp +++ b/src/gradient_avx.hpp @@ -1,15 +1,17 @@ #ifndef __GRADAVX_HPP__ #define __GRADAVX_HPP__ // /** for both gradient and second derivatives **/ //static struct point rotateCoordinateSystem(struct point P, double theta); #include "gradient.hpp" /** gradient **/ struct point module_potentialDerivatives_totalGradient_8_SOA_AVX(const struct point *pImage, const struct Potential_SOA *lens, const int shalos, const int nhalos); // struct point module_potentialDerivatives_totalGradient_81_SOA_AVX(const struct point *pImage, const struct Potential_SOA *lens, const int shalos, const int nhalos); // struct point module_potentialDerivatives_totalGradient_SOA_AVX(const struct point *pImage, const struct Potential_SOA *lens, int nhalos); // +struct point module_potentialDerivatives_totalGradient_SOA_novec(const struct point *pImage, const struct Potential_SOA *lens, int nhalos); +// #endif