diff --git a/src/chi_CPU.cpp b/src/chi_CPU.cpp index 3863483..23b9c0f 100644 --- a/src/chi_CPU.cpp +++ b/src/chi_CPU.cpp @@ -1,541 +1,540 @@ #include <iostream> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <unistd.h> #include <assert.h> // #ifndef __xlC__ #warning "gnu compilers" -#include <immintrin.h> #endif // //#include "simd_math.h" #include "chi_CPU.hpp" #ifdef __WITH_GPU #include <cuda_runtime.h> #include "cudafunctions.cuh" #include "grid_gradient_GPU.cuh" #endif ////#endif //#include "gradient_GPU.cuh" #ifdef _OPENMP #include <omp.h> #endif #ifdef __WITH_MPI #warning "MPI enabled" #include <mpi.h> #include "mpi_check.h" #endif #include "delense_CPU_utils.hpp" #include "delense_CPU.hpp" // #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) // extern double myseconds(); #if 0 { struct timeval tp; struct timezone tzp; int i; i = gettimeofday(&tp,&tzp); return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } #endif // void mychi_bruteforce_SOA_CPU_grid_gradient(double *chi, int *error, runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images) { // //double dx, dy; //, x_pos, y_pos; //pixelsize // // double im_dist[MAXIMPERSOURCE]; // distance to a real image for an "theoretical" image found from a theoretical source // int im_index; // index of the closest real image for an image found from a theoretical source // int second_closest_id; // index of the second closest real image for an image found from a theoretical source // int thread_found_image = 0; // at each iteration in the lens plane, turn to 1 whether the thread find an image // struct point im_position, temp; // position of the image found from a theoretical source + temp variable for comparison // struct triplet Tsup, Tinf, Tsupsource, Tinfsource;// triangles for the computation of the images created by the theoretical sources unsigned int nsets = runmode->nsets; unsigned int nimagestot = runmode->nimagestot; // int MAXIMPERSOURCE; int numImages = 0; // int maxImagesPerSource = 0; for( int source_id = 0; source_id < nsets; source_id ++) { numImages += nimages_strongLensing[source_id]; maxImagesPerSource = MAX(nimages_strongLensing[source_id], maxImagesPerSource); } MAXIMPERSOURCE = maxImagesPerSource; // // struct galaxy sources [nsets][nimagestot]; // theoretical sources (common for a set) int nimagesfound [nsets][nimagestot][MAXIMPERSOURCE]; // number of images found from the theoretical sources int locimagesfound [nsets][nimagestot]; struct point image_pos [nsets][nimagestot][MAXIMPERSOURCE]; //double image_dist [nsets][nimagestot][MAXIMPERSOURCE]; struct point tim [nimagestot][MAXIMPERSOURCE]; // theoretical images (computed from sources) // int world_size = 1; int world_rank = 0; #ifdef __WITH_MPI MPI_Comm_size(MPI_COMM_WORLD, &world_size); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Barrier(MPI_COMM_WORLD); #endif unsigned int verbose = (world_rank == 0); // int grid_size = runmode->nbgridcells; int loc_grid_size = runmode->nbgridcells/world_size; // double y_len = fabs(frame->ymax - frame->ymin); int y_len_loc = runmode->nbgridcells/world_size; int y_pos_loc = (int) world_rank*y_len_loc; int y_bound = y_len_loc; if ((world_rank + 1) != world_size) y_bound++; // // const double dx = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1); const double dy = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1); // double *grid_gradient_x, *grid_gradient_y; // //grid_gradient_x = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double)); //grid_gradient_y = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double)); #ifdef __WITH_GPU cudasafe(cudaMallocHost(&grid_gradient_x, grid_size*y_bound*sizeof(double)), "mychi_bruteforce_SOA_CPU_grid_gradient: allocating grid_gradient_x"); cudasafe(cudaMallocHost(&grid_gradient_y, grid_size*y_bound*sizeof(double)), "mychi_bruteforce_SOA_CPU_grid_gradient: allocating grid_gradient_x"); #else grid_gradient_x = (double *)malloc((int) grid_size*y_bound*sizeof(double)); grid_gradient_y = (double *)malloc((int) grid_size*y_bound*sizeof(double)); #endif // //uais //if (verbose) printf("@@%d: nsets = %d nimagestot = %d maximgpersource = %d, grid_size = %d, loc_grid_size = %d, y_pos_loc = %d\n", world_rank, nsets, nimagestot, MAXIMPERSOURCE, grid_size, loc_grid_size, y_pos_loc); // const int grid_dim = runmode->nbgridcells; //Packaging the image to sourceplane conversion double time = -myseconds(); double grad_time = -myseconds(); //gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_dim); int zero = 0; #ifdef __WITH_GPU #warning "Using GPUs..." #ifdef __WITH_UM #warning "Chi computation using UNIFIED MEMORY" //gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc); gradient_grid_GPU_UM(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc); #else gradient_grid_GPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc); #endif cudasafe(cudaGetLastError(), "after gradient_grid_GPU"); //gradient_grid_GPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_size); #else gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc); #endif // #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif grad_time += myseconds(); // int index = 0; // index tracks the image within the total image array in the image plane *chi = 0; // double chi2_time = 0.; double delense_time = 0.; double image_time = 0.; // int images_found = 0; // long int images_total = 0; // printf("@@nsets = %d nx = %d ny = %d, xmin = %f, dx = %f, ymin = %f, dy = %f\n", nsets, runmode->nbgridcells, runmode->nbgridcells, frame->xmin, dx, frame->ymin, dy ); // int numsets = 0; // for( int source_id = 0; source_id < nsets; source_id ++) numsets += nimages_strongLensing[source_id]; printf("@@Total numsets = %d\n", numsets); // // // nsets : number of images in the source plane // nimagestot: number of images in the image plane // image_time -= myseconds(); // for( int source_id = 0; source_id < nsets; source_id ++) { // number of images in the image plane for the specific image (1,3,5...) unsigned short int nimages = nimages_strongLensing[source_id]; //@@printf("@@ source_id = %d, nimages = %d\n", source_id, nimages_strongLensing[source_id]); //____________________________ image (constrains) loop ________________________________ for(unsigned short int image_id = 0; image_id < nimages; image_id++) { //printf("@@ nimages = %d\n", nimages_strongLensing[source_id]); //________ computation of theoretical sources _________ // output: sources[source_id].center //printf("Image = %f %f\n", images[index + image_id].center.x, images[index + image_id].center.y); mychi_transformImageToSourcePlane_SOA(runmode->nhalos, &images[index + image_id].center, images[index + image_id].dr, lens, &sources[source_id][image_id].center); // // void mychi_transformImageToSourcePlane_SOA(const int Nlens, const struct point *image_point, double dlsds, const struct Potential_SOA *lens, struct point *source_point) // struct point Grad = module_potentialDerivatives_totalGradient_SOA(&images[index + image_id].center, lens, runmode->nhalos); //printf(" image %d, %d (%d) = (%.15f, %.15f) -> (%.15f, %.15f)\n", source_id, image_id, nimages, images[index + image_id].center.x, images[index + image_id].center.y, sources[source_id][image_id].center.x, sources[source_id][image_id].center.y ); //printf("Grad.x = %f, %f\n", Grad.x, grid_gradient_x[images[index + image_id].center.x/dx]); // // find the position of the constrain in the source plane sources[source_id][image_id].center.x = images[index + image_id].center.x - images[index + image_id].dr*Grad.x; sources[source_id][image_id].center.y = images[index + image_id].center.y - images[index + image_id].dr*Grad.y; // //time += myseconds(); // sources[source_id][image_id].redshift = images[index + image_id].redshift; // sources[source_id][image_id].dr = images[index + image_id].dr; sources[source_id][image_id].dls = images[index + image_id].dls; sources[source_id][image_id].dos = images[index + image_id].dos; //#if 1 } index += nimages; } #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif image_time += myseconds(); // // main loop // //double y_len = fabs(frame->ymax - frame->ymin); //int y_len_loc = runmode->nbgridcells/world_size; //int y_pos_loc = (int) world_rank*y_len_loc; //printf("%d out of %d: y_id = %d to %d\n", world_rank, world_size, y_pos_loc, y_pos_loc + y_len_loc - 1); //fflush(stdout); //MPI_Barrier(MPI_COMM_WORLD); // delense_time -= myseconds(); index = 0; int numimg = 0; // delense(/*nsets, nimagestot,*/ &image_pos[0][0][0], MAXIMPERSOURCE, &locimagesfound[0][0], &numimg, runmode, lens, frame, nimages_strongLensing, &sources[0][0], grid_gradient_x, grid_gradient_y); printf("%d: numimg = %d\n", world_rank, numimg); // // local delensing done, moving on to the reduction // #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif delense_time += myseconds(); // double comm_time = -myseconds(); // int numimagesfound [nsets][nimagestot]; memset(&numimagesfound, 0, nsets*nimagestot*sizeof(int)); struct point imagesposition [nsets][nimagestot][MAXIMPERSOURCE]; memset(&imagesposition, 0, nsets*nimagestot*MAXIMPERSOURCE*sizeof(point)); // int numimagesfound_tmp[nsets][nimagestot]; struct point imagesposition_tmp[nsets][nimagestot][MAXIMPERSOURCE]; // memset(numimagesfound_tmp, 0, nsets*nimagestot*sizeof(int)); memset(imagesposition_tmp, 0, nsets*nimagestot*sizeof(int)); // #ifdef __WITH_MPI int total = 0; //MPI_Reduce(&locimagesfound, &imagesfound, nsets*nimagestot, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); //MPI_Reduce(&locimagesfound, &imagesfound, nsets*nimagestot, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); // if (!verbose) { MPI_CHECK(MPI_Send( &numimg , 1 , MPI_INT , 0, 666 + world_rank, MPI_COMM_WORLD )); if (numimg != 0) { MPI_CHECK(MPI_Send( &locimagesfound, nsets*nimagestot , MPI_INT , 0, 666 + world_rank, MPI_COMM_WORLD )); //MPI_CHECK(MPI_Send( &image_pos, nsets*nimagestot*MAXIMPERSOURCE, MPI_points, 0, 667 + world_rank, MPI_COMM_WORLD )); MPI_CHECK(MPI_Send( &image_pos, nsets*nimagestot*MAXIMPERSOURCE*2, MPI_DOUBLE, 0, 667 + world_rank, MPI_COMM_WORLD )); } } // if (verbose) { int image_sum = 0; // for (int ipe = 0; ipe < world_size; ++ipe) { MPI_Status status; // if (ipe == 0) { memcpy(&numimagesfound_tmp, &locimagesfound, nsets*nimagestot*sizeof(int)); memcpy(&imagesposition_tmp, &image_pos, nsets*nimagestot*MAXIMPERSOURCE*sizeof(point)); } else { MPI_CHECK(MPI_Recv(&numimg , 1 , MPI_INT , ipe, 666 + ipe, MPI_COMM_WORLD, &status)); if (numimg != 0) { MPI_CHECK(MPI_Recv(&numimagesfound_tmp, nsets*nimagestot , MPI_INT , ipe, 666 + ipe, MPI_COMM_WORLD, &status)); MPI_CHECK(MPI_Recv(&imagesposition_tmp, nsets*nimagestot*MAXIMPERSOURCE*2, MPI_DOUBLE, ipe, 667 + ipe, MPI_COMM_WORLD, &status)); } } // //MPI_Reduce(&imagesfound_tmp, &total, ipe, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); // if (numimg != 0) for (int jj = 0; jj < nimagestot; ++jj) { for (int ii = 0; ii < nsets; ++ii) { //int img_len = numimagesfound[ii][jj]; int img_len = numimagesfound_tmp[ii][jj]; //printf("%d: %d %d, img_len = %d\n", ipe, ii, jj, img_len); image_sum += img_len; if (img_len != 0) { //int loc_length = numimagesfound[ii][jj]; int loc_length = numimagesfound[ii][jj]; memcpy(&imagesposition[ii][jj][loc_length], &imagesposition_tmp[ii][jj], img_len*sizeof(point)); numimagesfound[ii][jj] += img_len; numimg += img_len; } } } } } //MPI_Barrier(MPI_COMM_WORLD); #endif comm_time += myseconds(); // // image extraction to compute // chi2_time = -myseconds(); index = 0; // if (verbose) for( int source_id = 0; source_id < nsets; source_id ++) { unsigned short int nimages = nimages_strongLensing[source_id]; // //______________________Initialisation______________________ // for (unsigned short int i = 0; i < nimages; ++i) for (unsigned short int j = 0; j < nimages; ++j) nimagesfound[source_id][i][j] = 0; // for( unsigned short int image_id = 0; image_id < nimages; image_id++) { //#endif // double image_dist[MAXIMPERSOURCE]; // //printf(" Image %d, number of sources found %d\n", image_id, loc_images_found); // struct point image_position; int image_index; // for (int ii = 0; ii < /*loc*/numimagesfound[source_id][image_id]; ++ii) { // // int image_index = 0; // image_position = imagesposition[source_id][image_id][ii]; image_dist[0] = mychi_dist(image_position, images[index + 0].center); // get the distance to the real image for(int i = 1; i < nimages_strongLensing[source_id]; i++) { // get the distance to each real image and keep the index of the closest real image image_dist[i] = mychi_dist(image_position, images[index + i].center); //printf(" *** image %d = %f %f, distance = %f\n", index + i, images[index + i].center.x, images[index + i].center.y, image_dist[i]); if (image_dist[i] < image_dist[image_index]) { image_index = i; } } // // we should exit loops here // // p1_time += myseconds(); // int skip_image = 0; // Sometimes due to the numerical errors at the centerpoint, // for SIE potentials an additional image will appear at the center of the Potential. // This is due to the fact that it is not possible to simulate an infinity value // at the center correctly, Check that sis correspond to Nlens[0] /* for (int iterator = 0; iterator < runmode->Nlens[0]; ++iterator) { if ( fabs(image_position.x - lens[0].position_x[iterator]) <= dx/2. and fabs(image_position.y - lens[0].position_y[iterator]) <= dx/2.) { skip_image = 1; printf("WARNING: You are using SIE potentials. An image to close to one of the potential centers has been classified as numerical error and removed \n"); } } */ //printf("%d %d %d %d\n", x_id, y_id,thread_found_image, skip_image); if (!skip_image) { //#pragma omp atomic images_found++; struct point temp; //printf(" source %d, image %d, index %d, Images found: %d\n", source_id, image_id, image_index, nimagesfound[source_id][image_id][image_index]); //checking whether a closest image has already been found if (nimagesfound[source_id][image_id][image_index] == 0) { // if no image found up to now //image position is allocated to theoretical image //#pragma omp critical tim[image_id][image_index] = image_position; //#pragma omp atomic nimagesfound[source_id][image_id][image_index]++; } else if (nimagesfound[source_id][image_id][image_index] > 0) { // if we have already found an image // If the new image we found is closer than the previous image //printf(" tim2: %f %f\n", image_dist[image_index], mychi_dist(images[index + image_index].center, tim[image_id][image_index])); if (image_dist[image_index] < mychi_dist(images[index + image_index].center, tim[image_id][image_index])) { temp = tim[image_id][image_index]; // we store the position of the old image in temp //#pragma omp critical tim[image_id][image_index] = image_position; // we link the observed image with the image we just found //printf("tim2 %d %d = %f %f\n", image_id, image_index, image_position.x, image_position.y); } else { temp = image_position; // we store the position of the image we just found in temp } // initialising second_closest_id to the highest value // Loop over all images in the set except the closest one // and initialize to the furthest away image int second_closest_id = 0; for (int i = 1; i < nimages_strongLensing[source_id] && (i != image_index); i++) { if(image_dist[i] > image_dist[second_closest_id]) second_closest_id=i; } /////////////////////////////////////////////////////////////// // Loop over all images in the set that are not yet allocated to a theoretical image // and allocate the closest one // we search for an observed image not already linked (nimagesfound=0) for(int i = 0; i < nimages_strongLensing[source_id] && nimagesfound[source_id][image_id][i] == 0; i++) { if(image_dist[i] < image_dist[second_closest_id]) { second_closest_id = i; // im_index value changes only if we found a not linked yet image image_index = i; //printf("tim3 %d %d = %f %f\n", image_id, image_index, temp.x, temp.y); //#pragma omp critical tim[image_id][image_index] = temp; // if we found an observed and not already linked image, we allocate the theoretical image temp } } // increasing the total number of images found (If we find more than 1 theoretical image linked to 1 real image, // these theoretical //#pragma omp atomic nimagesfound[source_id][image_id][image_index]++; // images are included in this number) } } //#pragma omp atomic //loc_images_found++; //thread_found_image = 0; // for next iteration } // } //#pragma omp barrier //____________________________ end of image loop // //____________________________ computing the local chi square // double chiimage; // int _nimages = nimages_strongLensing[source_id]; // for( int iter = 0; iter < _nimages*_nimages; iter++) { int i=iter/nimages_strongLensing[source_id]; int j=iter%nimages_strongLensing[source_id]; //printf("nimagesfound %d %d = %d\n", i, j, nimagesfound[i][j]); if(i != j) { // In the current method, we get the source in the source plane by ray tracing image in nimagesfound[i][i]. If we ray trace back, // we arrive again at the same position and thus the chi2 from it is 0. Thus we do not calculate the chi2 (-> if i!=j) if(nimagesfound[source_id][i][j] > 0) { double pow1 = images[index + j].center.x - tim[i][j].x; double pow2 = images[index + j].center.y - tim[i][j].y; // //chiimage = pow(images[index + j].center.x - tim[i][j].x, 2) + pow(images[index + j].center.y - tim[i][j].y, 2); // compute the chi2 chiimage = pow1*pow1 + pow2*pow2; // compute the chi2 //printf("%d %d = %.15f\n", i, j, chiimage); *chi += chiimage; } else if(nimagesfound[source_id][i][j] == 0) { // If we do not find a correpsonding image, we add a big value to the chi2 to disfavor the model *chi += 100.*nimages_strongLensing[source_id]; } } } // //____________________________ end of computing the local chi square // //printf("%d: chi = %.15f\n", source_id, *chi); /* for (int i=0; i < nimages_strongLensing[source_id]; ++i){ for (int j=0; j < nimages_strongLensing[source_id]; ++j){ printf(" %d",nimagesfound[i][j]); } printf("\n"); }*/ //Incrementing Index: Images already treated by previous source_id index += nimages_strongLensing[source_id]; } #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif // chi2_time += myseconds(); time += myseconds(); // if (verbose) { // // int nthreads = 1; // //#pragma omp parallel // nthreads = omp_get_num_threads(); // printf(" overall time = %f s.\n", time); printf(" - grad time = %f s.\n", grad_time); printf(" - image time = %f s.\n", image_time); printf(" - delense time = %f s.\n", delense_time); printf(" - comm time = %f s.\n", comm_time); printf(" - chi2 time = %f s.\n", chi2_time); // printf(" images found: %d out of %ld\n", images_found, images_total); } // #ifdef __WITH_GPU #ifndef __WITH_UM cudasafe(cudaFree(grid_gradient_x), "deallocating grid_gradient_x"); cudasafe(cudaFree(grid_gradient_y), "deallocating grid_gradient_y"); #endif #else free(grid_gradient_x); free(grid_gradient_y); #endif } diff --git a/src/chi_CPU_barycenter.cpp b/src/chi_CPU_barycenter.cpp index bbf74d7..60d1047 100644 --- a/src/chi_CPU_barycenter.cpp +++ b/src/chi_CPU_barycenter.cpp @@ -1,283 +1,282 @@ #include <iostream> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <unistd.h> #include <assert.h> #ifndef __xlC__ #warning "gnu compilers" -#include <immintrin.h> #endif //#include "simd_math.h" #include "chi_CPU.hpp" //#ifdef __WITH_GPU #include "grid_gradient_GPU.cuh" ////#endif //#include "gradient_GPU.cuh" #ifdef _OPENMP #include <omp.h> #endif #ifdef __WITH_MPI #warning "MPI enabled" #include <mpi.h> #include "mpi_check.h" #endif #include "delense_CPU_utils.hpp" #include "delense_CPU.hpp" #include "delense_GPU.cuh" #include "chi_comm.hpp" #include "chi_computation.hpp" #ifdef __WITH_GPU #include <cuda_runtime.h> #include "cudafunctions.cuh" #endif #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) // // // double myseconds() { struct timeval tp; struct timezone tzp; int i; i = gettimeofday(&tp,&tzp); return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } void mychi_bruteforce_SOA_CPU_grid_gradient_barycentersource(double *chi, int *error, runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images) { // //double dx, dy; //, x_pos, y_pos; //pixelsize // // double im_dist[MAXIMPERSOURCE]; // distance to a real image for an "theoretical" image found from a theoretical source // int im_index; // index of the closest real image for an image found from a theoretical source // int second_closest_id; // index of the second closest real image for an image found from a theoretical source // int thread_found_image = 0; // at each iteration in the lens plane, turn to 1 whether the thread find an image // struct point im_position, temp; // position of the image found from a theoretical source + temp variable for comparison // struct triplet Tsup, Tinf, Tsupsource, Tinfsource;// triangles for the computation of the images created by the theoretical sources // unsigned int nsets = runmode->nsets; unsigned int nimagestot = runmode->nimagestot; // int MAXIMPERSOURCE; int numImages = 0; // int maxImagesPerSource = 0; for( int source_id = 0; source_id < nsets; source_id ++) { numImages += nimages_strongLensing[source_id]; maxImagesPerSource = MAX(nimages_strongLensing[source_id], maxImagesPerSource); } MAXIMPERSOURCE = maxImagesPerSource; // struct galaxy sources[nsets]; // theoretical sources (common for a set) int nimagesfound [nsets][MAXIMPERSOURCE]; // number of images found from the theoretical sources int locimagesfound[nsets]; //int* locimagesfound = (int*) malloc(nsets*sizeof(int)); struct point image_pos [nsets][MAXIMPERSOURCE]; //struct point* image_pos = (struct point*) malloc(nsets*MAXIMPERSOURCE*sizeof(struct point)); //double image_dist [nsets][runmode->nimagestot][MAXIMPERSOURCE]; struct point tim [MAXIMPERSOURCE]; // theoretical images (computed from sources) // int world_size = 1; int world_rank = 0; #ifdef __WITH_MPI MPI_Comm_size(MPI_COMM_WORLD, &world_size); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Barrier(MPI_COMM_WORLD); #endif unsigned int verbose = (world_rank == 0); // int grid_size = runmode->nbgridcells; int loc_grid_size = runmode->nbgridcells/world_size; // double y_len = fabs(frame->ymax - frame->ymin); int y_len_loc = runmode->nbgridcells/world_size; int y_pos_loc = (int) world_rank*y_len_loc; int y_bound = y_len_loc; // if ((world_rank + 1) != world_size) y_bound++; printf("@@nsets = %d nx = %d ny = %d, xmin = %f, ymin = %f\n", nsets, runmode->nbgridcells, y_bound, frame->xmin, frame->ymin ); // // const double dx = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1); const double dy = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1); // double *grid_gradient_x, *grid_gradient_y; // //grid_gradient_x = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double)); //grid_gradient_y = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double)); //grid_gradient_x = (double *)malloc((int) grid_size*y_bound*sizeof(double)); //grid_gradient_y = (double *)malloc((int) grid_size*y_bound*sizeof(double)); grid_gradient_x = (double *)malloc((int) grid_size*y_bound*sizeof(double)); grid_gradient_y = (double *)malloc((int) grid_size*y_bound*sizeof(double)); // const int grid_dim = runmode->nbgridcells; //Packaging the image to sourceplane conversion double time = -myseconds(); double grad_time = -myseconds(); //gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_dim); int zero = 0; gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc); //gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_size, loc_grid_size); #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif grad_time += myseconds(); // int index = 0; // index tracks the image within the total image array in the image plane *chi = 0; // double chi2_time = 0.; double delense_time = 0.; double image_time = 0.; // int images_found = 0; long int images_total = 0; // //printf("@@nsets = %d nx = %d ny = %d, xmin = %f, dx = %f, ymin = %f, dy = %f\n", nsets, runmode->nbgridcells, runmode->nbgridcells, frame->xmin, dx, frame->ymin, dy ); // int numsets = 0; // for( int source_id = 0; source_id < nsets; source_id ++) numsets += nimages_strongLensing[source_id]; // time = -myseconds(); // // nsets : number of images in the source plane // nimagestot: number of images in the image plane // // // Image Lensing // image_time -= myseconds(); // //unsigned int nsets = nsets; // for( int source_id = 0; source_id < nsets; source_id ++) { // number of images in the image plane for the specific image (1,3,5...) unsigned short int nimages = nimages_strongLensing[source_id]; //@@printf("@@ source_id = %d, nimages = %d\n", source_id, nimages_strongLensing[source_id]); //Init sources sources[source_id].center.x = sources[source_id].center.y = 0.; //____________________________ image (constrains) loop ________________________________ for(unsigned short int image_id = 0; image_id < nimages; image_id++) { // struct galaxy sources_temp; struct point Grad = module_potentialDerivatives_totalGradient_SOA(&images[index + image_id].center, lens, runmode->nhalos); // // find the position of the constrain in the source plane sources_temp.center.x = images[index + image_id].center.x - images[index + image_id].dr*Grad.x; sources_temp.center.y = images[index + image_id].center.y - images[index + image_id].dr*Grad.y; //________ Adding up for barycenter comp _________ sources[source_id].center.x += sources_temp.center.x; sources[source_id].center.y += sources_temp.center.y; //time += myseconds(); sources[source_id].redshift = 0.; sources[source_id].shape.a = sources[source_id].shape.b = sources[source_id].shape.theta = (type_t) 0.; sources[source_id].mag = 0.; // } //________ Dividing by nimages for final barycenter result _________ sources[source_id].center.x /= nimages; sources[source_id].center.y /= nimages; sources[source_id].redshift = images[index].redshift; sources[source_id].dr = images[index].dr; sources[source_id].dls = images[index].dls; sources[source_id].dos = images[index].dos; //#if 1 index += nimages; } #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif image_time += myseconds(); // // Delensing // delense_time -= myseconds(); index = 0; int numimg = 0; #ifndef __WITH_GPU delense_barycenter(&image_pos[0][0], MAXIMPERSOURCE, &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y); #else //delense_barycenter_GPU(&image_pos[0][0], &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y); //struct point* ip[MAXIMPERSOURCE] = image_pos; delense_barycenter_GPU(&image_pos[0][0], MAXIMPERSOURCE, &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y); printf("done. numimg = %d\n", numimg);fflush(stdout); #endif // // Communications // #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif delense_time += myseconds(); // double comm_time = -myseconds(); // int numimagesfound [nsets]; struct point imagesposition [nsets][MAXIMPERSOURCE]; // memset(&numimagesfound, 0, nsets*sizeof(int)); memset(&imagesposition, 0, nsets*MAXIMPERSOURCE*sizeof(point)); // delense_comm(&numimagesfound[0], MAXIMPERSOURCE, &imagesposition[0][0], &numimg, nimages_strongLensing, &locimagesfound[0], &image_pos[0][0], nsets, world_rank, world_size); // comm_time += myseconds(); // // image extraction to compute // chi2_time = -myseconds(); if (verbose) chi_computation(chi, &images_found, &numimagesfound[0], &imagesposition[0][0], MAXIMPERSOURCE, nimages_strongLensing, images, nsets); #ifdef __WITH_GPU MPI_Barrier(MPI_COMM_WORLD); #endif // chi2_time += myseconds(); time += myseconds(); // if (verbose) { // // int nthreads = 1; // //#pragma omp parallel // nthreads = omp_get_num_threads(); // printf(" overall time = %f s.\n", time); printf(" - grad time = %f s.\n", grad_time); printf(" - image time = %f s.\n", image_time); printf(" - delense time = %f s.\n", delense_time); printf(" - comm time = %f s.\n", comm_time); printf(" - chi2 time = %f s.\n", chi2_time); // printf(" images found: %d out of %ld\n", images_found, images_total); } // #ifdef __WITH_GPU cudaFree(grid_gradient_x); cudaFree(grid_gradient_y); #else free(grid_gradient_x); free(grid_gradient_y); #endif } diff --git a/src/chi_barycenter_GPU.cu b/src/chi_barycenter_GPU.cu index 4028cce..5b3e4a5 100644 --- a/src/chi_barycenter_GPU.cu +++ b/src/chi_barycenter_GPU.cu @@ -1,252 +1,251 @@ #include <iostream> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <unistd.h> #include <assert.h> #ifndef __xlC__ #warning "gnu compilers" -#include <immintrin.h> #endif //#include "simd_math.h" #include "chi_CPU.hpp" //#ifdef __WITH_GPU #include "grid_gradient_GPU.cuh" ////#endif //#include "gradient_GPU.cuh" #ifdef _OPENMP #include <omp.h> #endif #ifdef __WITH_MPI #warning "MPI enabled" #include <mpi.h> #include "mpi_check.h" #include "chi_comm.hpp" #endif #include "delense_CPU_utils.hpp" #include "delense_CPU.hpp" #include "delense_GPU.cuh" #include "chi_comm.hpp" #include "chi_computation.hpp" #ifdef __WITH_GPU #include <cuda_runtime.h> #include "nvml_GPU.cuh" #include "cudafunctions.cuh" #endif #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) int MAXIMPERSOURCE; // // // double myseconds() { struct timeval tp; struct timezone tzp; int i; i = gettimeofday(&tp,&tzp); return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } void mychi_bruteforce_SOA_GPU_grid_gradient_barycentersource(type_t* grid_gradient_x, type_t* grid_gradient_y, double *chi, int *error, runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images, double dx, double dy, const int y_pos_loc, const int y_bound) { // PUSH_RANGE("chi barycenter computation", 7); unsigned int nsets = runmode->nsets; // int numImages = 0; // int maxImagesPerSource = 0; for( int source_id = 0; source_id < nsets; source_id ++) { numImages += nimages_strongLensing[source_id]; maxImagesPerSource = MAX(nimages_strongLensing[source_id], maxImagesPerSource); } MAXIMPERSOURCE = maxImagesPerSource + 2; struct galaxy* sources; cudaMallocManaged(&sources, nsets*sizeof(struct galaxy)); // theoretical sources (common for a set) cudaMemPrefetchAsync(sources, nsets*sizeof(struct galaxy), cudaCpuDeviceId); // int nimagesfound [nsets][MAXIMPERSOURCE]; // number of images found from the theoretical sources int locimagesfound[nsets]; //int* locimagesfound = (int*) malloc(nsets*sizeof(int)); struct point image_pos [nsets][MAXIMPERSOURCE]; //struct point* image_pos = (struct point*) malloc(nsets*MAXIMPERSOURCE*sizeof(struct point)); //double image_dist [nsets][runmode->nimagestot][MAXIMPERSOURCE]; struct point tim [MAXIMPERSOURCE]; // theoretical images (computed from sources) int grid_size = runmode->nbgridcells; int world_size = 1; int world_rank = 0; #ifdef __WITH_MPI #warning "using MPI..." MPI_Comm_size(MPI_COMM_WORLD, &world_size); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Barrier(MPI_COMM_WORLD); #endif unsigned int verbose = (world_rank == 0); // //const double dx = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1); //const double dy = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1); // const int grid_dim = runmode->nbgridcells; double time = -myseconds(); double grad_time = -myseconds(); PUSH_RANGE("gradient_grid_GPU_UM", 1); gradient_grid_GPU_UM(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, 0, y_pos_loc); //#ifdef __WITH_MPI // MPI_Barrier(MPI_COMM_WORLD); //#endif // int index = 0; // index tracks the image within the total image array in the image plane *chi = 0; // double chi2_time = 0.; double delense_time = 0.; double image_time = 0.; // int images_found = 0; long int images_total = 0; // //printf("@@nsets = %d nx = %d ny = %d, xmin = %f, dx = %f, ymin = %f, dy = %f\n", nsets, runmode->nbgridcells, runmode->nbgridcells, frame->xmin, dx, frame->ymin, dy ); // // time = -myseconds(); // // nsets : number of images in the source plane // nimagestot: number of images in the image plane // // // Image Lensing // PUSH_RANGE("image construction", 2); image_time -= myseconds(); // //unsigned int nsets = nsets; // for( int source_id = 0; source_id < nsets; source_id ++) { // number of images in the image plane for the specific image (1,3,5...) unsigned short int nimages = nimages_strongLensing[source_id]; //printf("@@ source_id = %d, nimages = %d\n", source_id, nimages_strongLensing[source_id]); //Init sources sources[source_id].center.x = sources[source_id].center.y = 0.; //____________________________ image (constrains) loop ________________________________ for(unsigned short int image_id = 0; image_id < nimages; image_id++) { // struct galaxy sources_temp; struct point Grad = module_potentialDerivatives_totalGradient_SOA(&images[index + image_id].center, lens, runmode->nhalos); // // find the position of the constrain in the source plane sources_temp.center.x = images[index + image_id].center.x - images[index + image_id].dr*Grad.x; sources_temp.center.y = images[index + image_id].center.y - images[index + image_id].dr*Grad.y; //std::cerr << "Image " << index + image_id << "x: " << images[index + image_id].center.x << " y: "<<images[index + image_id].center.y << "Grad.x: " << Grad.x << " Grad.y: "<<Grad.y << std::endl; //________ Adding up for barycenter comp _________ sources[source_id].center.x += sources_temp.center.x; sources[source_id].center.y += sources_temp.center.y; //time += myseconds(); sources[source_id].redshift = 0.; sources[source_id].shape.a = sources[source_id].shape.b = sources[source_id].shape.theta = (type_t) 0.; sources[source_id].mag = 0.; // } //________ Dividing by nimages for final barycenter result _________ sources[source_id].center.x /= nimages; sources[source_id].center.y /= nimages; sources[source_id].redshift = images[index].redshift; sources[source_id].dr = images[index].dr; sources[source_id].dls = images[index].dls; sources[source_id].dos = images[index].dos; index += nimages; } cudaMemPrefetchAsync(&sources[0], MAXIMPERSOURCE*sizeof(struct galaxy), 0); #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif image_time += myseconds(); POP_RANGE; // image construction cudaDeviceSynchronize(); grad_time += myseconds(); POP_RANGE; // gradient computation // // Delensing // PUSH_RANGE("delense barycenter", 3); delense_time -= myseconds(); index = 0; int numimg = 0; delense_barycenter_GPU(&image_pos[0][0], MAXIMPERSOURCE, &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y, y_pos_loc, y_bound); #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif delense_time += myseconds(); POP_RANGE; // // Communications // PUSH_RANGE("delense communication", 4); double comm_time = -myseconds(); // int numimagesfound [nsets]; struct point imagesposition [nsets][MAXIMPERSOURCE]; // memset(&numimagesfound, 0, nsets*sizeof(int)); memset(&imagesposition, 0, nsets*MAXIMPERSOURCE*sizeof(point)); // delense_comm(&numimagesfound[0], MAXIMPERSOURCE, &imagesposition[0][0], &numimg, nimages_strongLensing, &locimagesfound[0], &image_pos[0][0], nsets, world_rank, world_size); // comm_time += myseconds(); POP_RANGE; // // image extraction to compute // PUSH_RANGE("chi computation", 5); chi2_time = -myseconds(); if (verbose) chi_computation(chi, &images_found, &numimagesfound[0], &imagesposition[0][0], MAXIMPERSOURCE, nimages_strongLensing, images, nsets); #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif // chi2_time += myseconds(); POP_RANGE; // // All done // time += myseconds(); // if (verbose) { // // int nthreads = 1; // //#pragma omp parallel // nthreads = omp_get_num_threads(); // printf(" overall time = %f s.\n", time); printf(" - grad time = %f s.\n", grad_time); printf(" - image time = %f s.\n", image_time); printf(" - delense time = %f s.\n", delense_time); printf(" - comm time = %f s.\n", comm_time); printf(" - chi2 time = %f s.\n", chi2_time); // printf(" images found: %d out of %ld\n", images_found, images_total); } // PUSH_RANGE("Memory deallocation", 6); POP_RANGE; POP_RANGE; } diff --git a/src/gradient.cpp b/src/gradient.cpp index 75a29ab..c51e745 100644 --- a/src/gradient.cpp +++ b/src/gradient.cpp @@ -1,993 +1,992 @@ /** Lenstool-HPC: HPC based massmodeling software and Lens-map generation Copyright (C) 2017 Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch), Gilles Fourestey (gilles.fourestey@epfl.ch) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. @brief: Function for single order computation on CPUs */ #include <iostream> #include <iomanip> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> -#include <immintrin.h> #include <map> /* #ifdef __AVX__ #include "simd_math_avx.h" #endif #ifdef __AVX512F__ #include "simd_math_avx512f.h" #endif */ #include "structure_hpc.hpp" #include "gradient.hpp" #include "utils.hpp" #ifdef __WITH_MARLA #warning "using marla" #include <exp.h> #include <log.h> #include <atan.h> #define exp marla_exp #define log marla_log #define atan2 marla_atan2 #endif //#include "iacaMarks.h" // // // /**@brief Return the gradient of the projected lens potential for one clump * !!! You have to multiply by dlsds to obtain the true gradient * for the expressions, see the papers : JP Kneib & P Natarajan, Cluster Lenses, The Astronomy and Astrophysics Review (2011) for 1 and 2 * and JP Kneib PhD (1993) for 3 * * @param pImage point where the result is computed in the lens plane * @param lens mass distribution */ // /// Useful functions // complex piemd_1derivatives_ci05(type_t x, type_t y, type_t eps, type_t rc) { type_t sqe, cx1, cxro, cyro, rem2; complex zci, znum, zden, zis, zres; type_t norm; // //std::cout << "piemd_lderivatives" << std::endl; sqe = sqrt(eps); cx1 = ((type_t) 1. - eps) / ((type_t) 1. + eps); cxro = ((type_t) 1. + eps) * ((type_t) 1. + eps); cyro = ((type_t) 1. - eps) * ((type_t) 1. - eps); // rem2 = x * x / cxro + y * y / cyro; /*zci=cpx(0.,-0.5*(1.-eps*eps)/sqe); znum=cpx(cx1*x,(2.*sqe*sqrt(rc*rc+rem2)-y/cx1)); zden=cpx(x,(2.*rc*sqe-y)); zis=pcpx(zci,lncpx(dcpx(znum,zden))); zres=pcpxflt(zis,b0);*/ // --> optimized code zci.re = (type_t) 0.; zci.im = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe; znum.re = cx1 * x; znum.im = (type_t) 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; zden.re = x; zden.im = (type_t) 2.*rc * sqe - y; norm = zden.re * zden.re + zden.im * zden.im; // zis = znum/zden zis.re = (znum.re * zden.re + znum.im * zden.im) / norm; zis.im = (znum.im * zden.re - znum.re * zden.im) / norm; norm = zis.re; zis.re = log(sqrt(norm * norm + zis.im * zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres.re = zci.re * zis.re - zci.im * zis.im; // Re( zci*ln(zis) ) zres.im = zci.im * zis.re + zis.im * zci.re; // Im( zci*ln(zis) ) // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // return zres; } // //// changes the coordinates of point P into a new basis (rotation of angle theta) //// y' y x' //// * | / //// * | / theta //// * | / //// *|--------->x /* inline struct point rotateCoordinateSystem(struct point P, double theta) { struct point Q; Q.x = P.x*cos(theta) + P.y*sin(theta); Q.y = P.y*cos(theta) - P.x*sin(theta); return(Q); } */ // // struct point grad_halo(const struct point *pImage, const struct Potential *lens) { struct point true_coord, true_coord_rot, result; type_t X, Y, R, angular_deviation, u; complex zis; // result.x = result.y = (type_t) 0.; // /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord.x = pImage->x - lens->position.x; true_coord.y = pImage->y - lens->position.y; //printf("x, y = %f, %f\n", lens->position.x, lens->position.y); // true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle); switch (lens->type) { case 5 : /*Elliptical Isothermal Sphere*/ /*rotation of the coordiante axes to match the potential axes*/ //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle); // R=sqrt(true_coord_rot.x*true_coord_rot.x*((type_t) 1. - lens->ellipticity_potential) + true_coord_rot.y*true_coord_rot.y*((type_t) 1. + lens->ellipticity_potential)); //ellippot = ellipmass/3 result.x = ((type_t) 1. - lens->ellipticity_potential)*lens->b0*true_coord_rot.x/(R); result.y = ((type_t) 1. + lens->ellipticity_potential)*lens->b0*true_coord_rot.y/(R); break; case 8 : /* PIEMD */ /*rotation of the coordiante axes to match the potential axes*/ /*Doing something....*/ complex zis; zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore); // result.x = lens->b0*zis.re; result.y = lens->b0*zis.im; break; case 81 : //PIEMD Kassiola & Kovner,1993 I0.5c-I0.5cut // type_t t05; if ( lens->ellipticity_potential > (type_t) 2E-4 ) { t05 = lens->b0*lens->rcut/(lens->rcut - lens->rcore); complex zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore); complex zis_cut = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcut); result.x = t05 * (zis.re - zis_cut.re); result.y = t05 * (zis.im - zis_cut.im); //printf(" g = %f %f\n", result.x, result.y); } else if (( u = true_coord_rot.x*true_coord_rot.x + true_coord_rot.y*true_coord_rot.y) > (type_t) 0. ) { // Circular dPIE Elliasdottir 2007 Eq A23 slighly modified for t05 X = lens->rcore; Y = lens->rcut; t05 = sqrt(u + X * X) - X - sqrt(u + Y * Y) + Y; // Faster and equiv to Elliasdottir (see Golse PhD) t05 *= lens->b0 * Y / (Y - X) / u; // 1/u because t05/sqrt(u) and normalised Q/sqrt(u) result.x = t05*true_coord_rot.x; result.y = t05*true_coord_rot.y; } else { result.x = (type_t) 0.; result.y = (type_t) 0.; } // break; default: std::cout << "ERROR: Grad 1 profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl; break; }; result = rotateCoordinateSystem(result, -lens->ellipticity_angle); return result; } // // // struct point module_potentialDerivatives_totalGradient(const int nhalos, const struct point* pImage, const struct Potential* lens) { struct point grad, clumpgrad; // grad.x = (type_t) 0.; grad.y = (type_t) 0.; // for(int i = 0; i < nhalos; i++) { clumpgrad = grad_halo(pImage, &lens[i]); //compute gradient for each clump separately //std::cout << clumpgrad.x << " " << clumpgrad.y << std::endl; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) { // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; } } // return(grad); } // // SOA versions, vectorizable // struct point module_potentialDerivatives_totalGradient_5_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA begins\n"); // struct point grad, result; grad.x = (type_t) 0.; grad.y = (type_t) 0.; for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); type_t ell_pot = lens->ellipticity_potential[i]; // type_t R = sqrt(true_coord_rotation.x*true_coord_rotation.x*((type_t) 1. - ell_pot) + true_coord_rotation.y*true_coord_rotation.y*((type_t) 1. + ell_pot)); // result.x = ((type_t) 1. - ell_pot)*lens->b0[i]*true_coord_rotation.x/R; result.y = ((type_t) 1. + ell_pot)*lens->b0[i]*true_coord_rotation.y/R; // result = rotateCoordinateSystem(result, -lens->ellipticity_angle[i]); // grad.x += result.x; grad.y += result.y; } return grad; } // // // struct point module_potentialDerivatives_totalGradient_5_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point grad, result; grad.x = (type_t) 0.; grad.y = (type_t) 0.; for(int i = shalos; i < shalos + nhalos; i++) { struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); type_t b0 = lens->b0[i]; // type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t ell_pot = lens->ellipticity_potential[i]; // type_t val = x*x*((type_t) 1. - ell_pot) + y*y*((type_t) 1. + ell_pot); type_t R = 1./sqrtf(val); R = R*(1.5 - 0.5*val*R*R); result.x = ((type_t) 1. - ell_pot)*b0*x*R; result.y = ((type_t) 1. + ell_pot)*b0*y*R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; //grad.x = x/R; //grad.y = y/R; } return grad; } #if 0 struct point module_potentialDerivatives_totalGradient_5_SOA_print(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int index) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point grad, result; std::ofstream myfile; grad.x = (type_t) 0.; grad.y = (type_t) 0.; #ifdef _double std::string name = "Double_"; #else std::string name = "Float_"; #endif for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //// /* myfile.open (name + "pImage->x_1.txt", std::ios_base::app); myfile << index << " " << pImage->x << std::setprecision(7) << " " << std::endl; myfile.close(); // myfile.open (name + "pImage->y_1.txt", std::ios_base::app); myfile << index << " " << pImage->y << std::setprecision(7) << " " << std::endl; myfile.close(); // myfile.open (name + "true_coord.x_2.txt", std::ios_base::app); myfile << index << " " << true_coord.x << std::setprecision(7) << " " << std::endl; myfile.close(); // myfile.open (name + "true_coord.y_2.txt", std::ios_base::app); myfile << index << " " << true_coord.y << std::setprecision(7) << " " << std::endl; myfile.close(); */// type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; //// myfile.open (name + "lens->anglecos[i]_3.txt", std::ios_base::app); myfile << index << " " << lens->anglecos[i] << std::setprecision(7) << " " << std::endl; myfile.close(); // myfile.open (name + "lens->anglesin[i]_3.txt", std::ios_base::app); myfile << index << " " << lens->anglesin[i] << std::setprecision(7) << " " << std::endl; myfile.close(); //// type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; //// /* myfile.open (name + "x_4.txt", std::ios_base::app); myfile << index << " " << x << std::setprecision(7) << " " << std::endl; myfile.close(); / myfile.open (name + "y_4.txt", std::ios_base::app); myfile << index << " " << y << std::setprecision(7) << " " << std::endl; myfile.close(); *//// type_t ell_pot = lens->ellipticity_potential[i]; // type_t R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot)); // result.x = (1 - ell_pot)*lens->b0[i]*x/R; result.y = (1 + ell_pot)*lens->b0[i]*y/R; //// /* myfile.open (name + "ell_pot_5.txt", std::ios_base::app); myfile << index << " " << ell_pot << std::setprecision(7) << " " << std::endl; myfile.close(); // myfile.open (name + "R_5.txt", std::ios_base::app); myfile << index << " " << R << std::setprecision(7) << " " << std::endl; myfile.close(); myfile.open (name + "x_6.txt", std::ios_base::app); myfile << index << " " << x << std::setprecision(7) << " " << std::endl; myfile.close(); myfile.open (name + "y_6.txt", std::ios_base::app); myfile << index << " " << y << std::setprecision(7) << " " << std::endl; myfile.close(); */// grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; //// /* myfile.open (name + "grad.x_7.txt", std::ios_base::app); myfile << index << " " << grad.x << std::setprecision(7) << " " << std::endl; myfile.close(); myfile.open (name + "grad.y_7.txt", std::ios_base::app); myfile << index << " " << grad.y << std::setprecision(7) << " " << std::endl; myfile.close(); *//// } return grad; } #endif // // // struct point module_potentialDerivatives_totalGradient_5_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins"); // struct point grad, result; grad.x = (type_t) 0.; grad.y = (type_t) 0.; #pragma novector for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; //: type_t R = sqrt(x*x*((type_t) 1. - lens->ellipticity[i]/(type_t) 3.) + y*y*((type_t) 1. + lens->ellipticity[i]/(type_t) 3.)); result.x = ((type_t) 1. - lens->ellipticity[i]/(type_t) 3.)*lens->b0[i]*x/R; result.y = ((type_t) 1. + lens->ellipticity[i]/(type_t) 3.)*lens->b0[i]*y/R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; } return grad; } // // // struct point module_potentialDerivatives_totalGradient_8_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = (type_t) 0.; grad.y = (type_t) 0.; // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // //@@printf("image_x = %f image_y = %f\n", pImage->x, pImage->y); true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //printf("x = %f y = %f\n", true_coord.x, true_coord.y); /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t x = true_coord_rot.x; type_t y = true_coord_rot.y; //@@printf("x = %f y = %f\n", x, y); type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; // //std::cout << "piemd_lderivatives" << std::endl; // type_t sqe = sqrt(eps); // type_t cx1 = ((type_t) 1. - eps)/((type_t) 1. + eps); type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps); type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; type_t norm; // zci.re = (type_t) 0.; zci.im = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe; //@@printf("zci = %f %f\n", zci.re, zci.im); // znum.re = cx1*x; znum.im = (type_t) 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = (type_t) 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden //@@printf("norm = %f\n", norm); // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; //@@printf("zis = %f %f\n", zis.re, zis.im); norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); //@@printf("y,x = %f %f\n", zis.im, norm); // norm = zis.re; zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // //@@printf("zres: %f %f\n", zres.re, zres.im); // zis.re = zres.re; zis.im = zres.im; // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // rotation clumpgrad.x = zis.re; clumpgrad.y = zis.im; clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]); // clumpgrad.x = lens->b0[i]*clumpgrad.x; clumpgrad.y = lens->b0[i]*clumpgrad.y; // //clumpgrad.x = lens->b0[i]*zis.re; //clumpgrad.y = lens->b0[i]*zis.im; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) //{ // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; //@@printf("grad: %f %f\n", grad.x, grad.y); //@@std::cout << "grad.x = " << grad.x << " grad.y = " << grad.y << std::endl; //} } //IACA_END; // return(grad); } // // struct point module_potentialDerivatives_totalGradient_8_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins"); //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl; // // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; //printf("%d %d\n", shalos, nhalos); // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; complex zis; type_t b0 = lens->b0[i]; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; // type_t sqe = sqrt(eps); // type_t cx1 = ((type_t) 1. - eps)/((type_t) 1. + eps); type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps); type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; type_t norm; // zci.re = (type_t) 0.; zci.im = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe; // KERNEL(rc, zres) // grad.x += b0*(zres.re*cose - zres.im*sine); grad.y += b0*(zres.im*cose + zres.re*sine); // } //IACA_END; // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_8_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens , int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins"); //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl; // // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = (type_t) 0.; grad.y = (type_t) 0.; //printf("%d %d\n", shalos, nhalos); // #pragma novector for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; complex zis; type_t b0 = lens->b0[i]; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; // type_t sqe = sqrt(eps); // type_t cx1 = ((type_t) 1. - eps)/((type_t) 1. + eps); type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps); type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; type_t norm; // zci.re = (type_t) 0.; zci.im = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe; // KERNEL(rc, zres) // grad.x += b0*(zres.re*cose - zres.im*sine); grad.y += b0*(zres.im*cose + zres.re*sine); // } //IACA_END; // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_81_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = (type_t) 0.; grad.y = (type_t) 0.; for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t x = true_coord_rot.x; type_t y = true_coord_rot.y; type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; type_t rcut = lens->rcut[i]; type_t b0 = lens->b0[i]; type_t t05 = b0*rcut/(rcut - rc); // type_t sqe = sqrt(eps); // type_t cx1 = ((type_t) 1. - eps)/((type_t) 1. + eps); type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps); type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; type_t norm; // zci.re = (type_t) 0.; zci.im = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe; // step 1 { KERNEL(rc, zres_rc) } // step 2 { KERNEL(rcut, zres_rcut) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); // rotation clumpgrad.x = zis.re; clumpgrad.y = zis.im; clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; //} } //IACA_END; // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_81_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = (type_t) 0.; grad.y = (type_t) 0.; for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; type_t rcut = lens->rcut[i]; type_t b0 = lens->b0[i]; type_t t05 = b0*rcut/(rcut - rc); // type_t sqe = sqrt(eps); // type_t cx1 = ((type_t) 1. - eps)/((type_t) 1. + eps); type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps); type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; type_t norm; // zci.re = (type_t) 0.; zci.im = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe; // // step 1 { KERNEL(rc, zres_rc) } // step 2 { KERNEL(rcut, zres_rcut) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); // rotation grad.x += (zis.re*cose - zis.im*sine); grad.y += (zis.im*cose + zis.re*sine); // } // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_81_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 struct point grad, clumpgrad; grad.x = 0.; grad.y = 0.; #pragma novector for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; type_t rcut = lens->rcut[i]; type_t b0 = lens->b0[i]; type_t t05 = b0*rcut/(rcut - rc); // type_t sqe = sqrt(eps); // type_t cx1 = ((type_t) 1. - eps)/((type_t) 1. + eps); type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps); type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; type_t norm; // zci.re = (type_t) 0.; zci.im = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe; // // step 1 { KERNEL(rc, zres_rc) } // step 2 { KERNEL(rcut, zres_rcut) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); // rotation grad.x += (zis.re*cose - zis.im*sine); grad.y += (zis.im*cose + zis.re*sine); // } // return(grad); } // // // typedef struct point (*halo_func_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); halo_func_t halo_func[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // struct point module_potentialDerivatives_totalGradient_SOA(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = (type_t) 0.; grad.y = clumpgrad.y = (type_t) 0.; // int shalos = 0; while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type and shalos + count < nhalos){ //int i = count; //std::cerr << lens->position_x[i] << lens->position_y[i] << lens->anglecos[i]<< lens->anglesin[i]<< lens->ellipticity_potential[i] << lens->rcore[i] << lens->rcut[i] << lens->b0[i] << std::endl; //std::cerr << "lens->type[shalos + count] = " << lens->type[shalos + count] << " " << lens_type << " " << lens_type << " " << " count = " << count << std::endl; count++; } // clumpgrad = (*halo_func[lens_type])(pImage, lens, shalos, count); // //std::cerr << lens->position_x[i] << lens->position_y[i] << lens->anglecos[i]<< lens->anglesin[i]<< lens->ellipticity_potential[i] << lens->rcore[i] << lens->rcut[i] << lens->b0[i] << std::endl; //std::cerr << "type = " << lens_type << " " << count << " " << nhalos << " " << " grad.x = " << clumpgrad.x << " grad.y = " << clumpgrad.y << std::endl; grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); } // typedef struct point (*halo_func_t_novec) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); // halo_func_t_novec halo_func_novec[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2_novec, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2_novec, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_v2_novec, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // struct point module_potentialDerivatives_totalGradient_SOA_novec(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; // int shalos = 0; // //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); //return; /* int* p_type = &(lens->type)[0]; int* lens_type = (int*) malloc(nhalos*sizeof(int)); memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); */ //quicksort(lens_type, nhalos); // while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; // clumpgrad = (*halo_func_novec[lens_type])(pImage, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); } diff --git a/src/gradient2.cpp b/src/gradient2.cpp index b728ed2..17d9dcb 100644 --- a/src/gradient2.cpp +++ b/src/gradient2.cpp @@ -1,258 +1,257 @@ /** Lenstool-HPC: HPC based massmodeling software and Lens-map generation Copyright (C) 2017 Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch), Gilles Fourestey (gilles.fourestey@epfl.ch) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. @brief: Function for second order computation on CPU to compare with GPUs */ #include <iostream> #include <iomanip> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> -#include <immintrin.h> #include <map> /* #ifdef __AVX__ #include "simd_math_avx.h" #endif #ifdef __AVX512F__ #include "simd_math_avx512f.h" #endif */ #include "structure_hpc.hpp" #include "gradient2.hpp" //lenstool fct /* * derivates of I0.5 KK * Parameters : * - (x,y) is the computation position of the potential * - eps is the ellepticity (a-b)/(a+b) * - rc is the core radius * - b0 asymptotic Einstein radius E0. (6pi*vdisp^2/c^2) * * Return a the 4 second derivatives of the PIEMD potential */ inline void mdci05_hpc(type_t x, type_t y, type_t eps, type_t rc, type_t b0, struct matrix *res) { type_t ci, sqe, cx1, cxro, cyro, wrem; type_t didyre, didyim, didxre; // didxim; type_t cx1inv, den1, num2, den2; // sqe = sqrt(eps); cx1 = (1. - eps)/(1. + eps); cx1inv = 1./cx1; // cxro = (1. + eps)*(1. + eps); /* rem^2=x^2/(1+e^2) + y^2/(1-e^2) Eq 2.3.6*/ cyro = (1. - eps)*(1. - eps); ci = 0.5*(1. - eps * eps) / sqe; wrem = sqrt(rc*rc + x*x/cxro + y*y/cyro); /*wrem^2=w^2+rem^2 with w core radius*/ den1 = 2.*sqe*wrem - y*cx1inv; den1 = cx1*cx1*x*x + den1*den1; num2 = 2.*rc*sqe - y; den2 = x*x + num2*num2; // didxre = ci*( cx1*(2.*sqe*x*x/cxro/wrem - 2.*sqe*wrem + y*cx1inv)/den1 + num2/den2 ); didyre = ci*( (2.*sqe*x*y*cx1/cyro/wrem - x)/den1 + x/den2 ); didyim = ci * ( (2.* sqe * wrem * cx1inv - y * cx1inv * cx1inv - 4.*eps*y/cyro + 2.* sqe * y * y / cyro / wrem * cx1inv)/den1 - num2/den2 ); // res->a = b0 * didxre; res->b = res->d = b0 * didyre; //(didyre+didxim)/2.; res->c = b0 * didyim; // return(res); } // void printmat(matrix A){ std::cerr << A.a << " " << A.b << " " << A.c << " " << A.d << std::endl; } // // SOA versions, vectorizable // // struct matrix module_potentialDerivatives_totalGradient2_81_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient2_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // type_t t05; type_t RR; // struct matrix grad2, clump, clumpcore, clumpcut; struct point true_coord, true_coord_rotation; // grad2.a = clump.a = 0; grad2.b = clump.b = 0; grad2.c = clump.c = 0; grad2.d = clump.d = 0; // for(int i = shalos; i < shalos + nhalos; i++) { //if (lens->ellipticity_potential[i] < 0.0001) printf("halo = %d, ellipticity_potential = %f\n", i, lens->ellipticity_potential[i]); //if(lens->ellipticity_potential[i] > 0.0001) if (true) { //True coord true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //std::cerr << "start" << std::endl; //Rotation type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // 81 comput t05 = lens->rcut[i] / (lens->rcut[i] - lens->rcore[i]); mdci05_hpc(x, y, lens->ellipticity_potential[i], lens->rcore[i], lens->b0[i], &clumpcore); mdci05_hpc(x, y, lens->ellipticity_potential[i], lens->rcut[i], lens->b0[i], &clumpcut); //printf("X %f Y %f Ga: %f %f\n",true_coord.x ,true_coord.y , clumpcore.a, clumpcut.a); //printf("X %f Y %f Gb: %f %f\n",true_coord.x ,true_coord.y, clumpcore.b, clumpcut.b); //printf("X %f Y %f Gc: %f %f\n", true_coord.x ,true_coord.y,clumpcore.c, clumpcut.c); //printf("X %f Y %f Gd: %f %f\n",true_coord.x ,true_coord.y, clumpcore.d, clumpcut.d); // clumpcore.a = t05 * (clumpcore.a - clumpcut.a); clumpcore.b = t05 * (clumpcore.b - clumpcut.b); clumpcore.c = t05 * (clumpcore.c - clumpcut.c); clumpcore.d = t05 * (clumpcore.d - clumpcut.d); //printmat(clumpcore); //printf("X %f Y %f Ga: %f %f\n",pImage->x ,pImage->y , clumpcore.a, t05); //printf("X %f Y %f Gb: %f %f\n",pImage->x ,pImage->y, clumpcore.b, clumpcut.b); //printf("X %f Y %f Gc: %f %f\n",pImage->x ,pImage->y, clumpcore.c, clumpcut.c); //printf("X %f Y %f Gd: %f %f\n",pImage->x ,pImage->y, clumpcore.d, clumpcut.d); //rotation matrix 1 clumpcut.a = clumpcore.a * cose + clumpcore.b * -sine; clumpcut.b = clumpcore.a * sine + clumpcore.b * cose; clumpcut.c = clumpcore.d * sine + clumpcore.c * cose; clumpcut.d = clumpcore.d * cose + clumpcore.c * -sine; //printf("X %f Y %f theta: %f %f %f\n",pImage->x ,pImage->y,lens->ellipticity_angle[i], cose, sine); //printmat(clumpcut); //rotation matrix 2 clump.a = cose*clumpcut.a - sine*clumpcut.d; clump.b = cose*clumpcut.b - sine*clumpcut.c; clump.c = sine*clumpcut.b + cose*clumpcut.c; clump.d = sine*clumpcut.a + cose*clumpcut.d; // //printf("X %f Y %f Ga: %f \n",pImage->x ,pImage->y , clump.a ); //printf("X %f Y %f Gb: %f \n",pImage->x ,pImage->y, clump.b ); //printf("X %f Y %f Gc: %f \n",pImage->x ,pImage->y, clump.c); //printf("X %f Y %f Gd: %f \n",pImage->x ,pImage->y, clump.d ); //printmat(clump); // grad2.a += clump.a; grad2.b += clump.b; grad2.c += clump.c; grad2.d += clump.d; } else if((RR = true_coord.x * true_coord.x + true_coord.y * true_coord.y) > 0.) { // Circular dPIE Elliasdottir 2007 Eq A23 slighly modified for t05 type_t X,Y,z,p,t05; X = lens->rcore[i]; Y = lens->rcut[i]; t05 = lens->b0[i] * Y / (Y - X); // 1/u because t05/sqrt(u) and normalised Q/sqrt(u) z = sqrt(RR + X * X) - X - sqrt(RR + Y * Y) + Y; // R*dphi/dR X = RR / X; Y = RR / Y; p = (1. - 1. / sqrt(1. + X / lens->rcore[i])) / X - (1. - 1. / sqrt(1. + Y / lens->rcut[i])) / Y; // d2phi/dR2 X = true_coord.x * true_coord.x / RR; Y = true_coord.y * true_coord.y / RR; clump.a = t05 * (p * X + z * Y / RR); clump.c = t05 * (p * Y + z * X / RR); X = true_coord.x * true_coord.y / RR; clump.b = clump.d = t05 * (p * X - z * X / RR); grad2.a += clump.a; grad2.b += clump.b; grad2.c += clump.c; grad2.d += clump.d; } else { clump.a = clump.c = lens->b0[i] / lens->rcore[i]/ 2.; clump.b = clump.d = 0.; grad2.a += clump.a; grad2.b += clump.b; grad2.c += clump.c; grad2.d += clump.d; } } // return(grad2); } // //This natrix handles the calling of the gradient functions for the different type without losing time to switch or if condition // typedef struct matrix (*halo_g2_func_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); halo_g2_func_t halo_g2_func[100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient2_81_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // struct matrix module_potentialDerivatives_totalGradient2_SOA(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct matrix grad2, clumpgrad; // grad2.a = clumpgrad.a = 0; grad2.b = clumpgrad.b = 0; grad2.c = clumpgrad.c = 0; grad2.d = clumpgrad.d = 0; // int shalos = 0; while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type and shalos + count < nhalos) count++; //std::cerr << "type = " << lens_type << " " << count << " " << shalos << " " << nhalos << " " << " func = " << halo_g2_func[lens_type] << std::endl; // clumpgrad = (*halo_g2_func[lens_type])(pImage, lens, shalos, count); // grad2.a += clumpgrad.a; grad2.b += clumpgrad.b; grad2.c += clumpgrad.c; grad2.d += clumpgrad.d; shalos += count; } return(grad2); } //