diff --git a/Makefile b/Makefile
index 9017960..77160b3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,21 @@
 CXX=mpic++
+#CXX=g++
 LD=${CXX}
-CXXFLAGS+=-Wall -Wextra -Werror -pedantic -std=c++11 -O3
+#VERIFY=-fsanitize=thread
+VERIFY= -D_DEBUG_MPI #-D_DEBUG
+OPENMP=-fopenmp
+NOOPENMP=-fno-openmp -Wno-unknown-pragmas
+#OPTIM+=-O3 -D_MPI $(NOOPENMP)
+OPTIM+=-O3 -D_MPI $(OPENMP) -march=native
+CXXFLAGS+=-Wall -Wextra -Werror -pedantic -std=c++14 -g -fPIE $(OPTIM) $(VERIFY)
 LDFLAGS+=-lm $(CXXFLAGS)
 
 OBJS=poisson.o grid.o double_buffer.o simulation.o dumpers.o
 
 all: poisson
 
 poisson: $(OBJS)
 	$(LD) -o $@ $(OBJS) $(LDFLAGS)
 
 clean:
 	rm -f hello poisson *.o *~
diff --git a/double_buffer.cc b/double_buffer.cc
index 67749d0..978eda5 100644
--- a/double_buffer.cc
+++ b/double_buffer.cc
@@ -1,25 +1,31 @@
 /* -------------------------------------------------------------------------- */
 #include "double_buffer.hh"
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 DoubleBuffer::DoubleBuffer(int m, int n)
     : m_current(new Grid(m, n)), m_old(new Grid(m, n)) {}
 
 /* -------------------------------------------------------------------------- */
 Grid & DoubleBuffer::current() { return *m_current; }
 
 /* -------------------------------------------------------------------------- */
 Grid & DoubleBuffer::old() { return *m_old; }
 
+/* -------------------------------------------------------------------------- */
+const Grid & DoubleBuffer::current() const { return *m_current; }
+
+/* -------------------------------------------------------------------------- */
+const Grid & DoubleBuffer::old() const { return *m_old; }
+
 /* -------------------------------------------------------------------------- */
 void DoubleBuffer::swap() { m_current.swap(m_old); }
 
 /* -------------------------------------------------------------------------- */
 void DoubleBuffer::resize(int m, int n) {
   m_current->resize(m, n);
   m_old->resize(m, n);
 }
diff --git a/double_buffer.hh b/double_buffer.hh
index a05410b..1d21b5c 100644
--- a/double_buffer.hh
+++ b/double_buffer.hh
@@ -1,25 +1,28 @@
 #ifndef DOUBLE_BUFFER
 #define DOUBLE_BUFFER
 
 /* -------------------------------------------------------------------------- */
 #include <memory>
 /* -------------------------------------------------------------------------- */
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 
 class DoubleBuffer {
 public:
   DoubleBuffer(int m = 0, int n = 0);
 
   void resize(int m, int n);
 
   Grid & current();
   Grid & old();
 
+  const Grid & current() const;
+  const Grid & old() const;
+
   void swap();
 
 private:
   std::unique_ptr<Grid> m_current;
   std::unique_ptr<Grid> m_old;
 };
 
 #endif /* DOUBLE_BUFFER */
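Note on the DoubleBuffer changes above: swap() exchanges the two std::unique_ptr owners, so flipping the "current" and "old" grids between iterations is a constant-time pointer swap rather than a copy of the grid data. A minimal standalone sketch of the same idiom (the Buffer type and the names are illustrative, not part of the patch):

#include <cstddef>
#include <memory>
#include <vector>

struct Buffer {
  std::vector<float> data;
};

class FlipFlop {
public:
  explicit FlipFlop(std::size_t n)
      : m_current(new Buffer{std::vector<float>(n)}),
        m_old(new Buffer{std::vector<float>(n)}) {}

  Buffer & current() { return *m_current; }
  Buffer & old() { return *m_old; }

  // O(1): only the owning pointers are exchanged, the payloads stay in place.
  void swap() { m_current.swap(m_old); }

private:
  std::unique_ptr<Buffer> m_current, m_old;
};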
diff --git a/dumpers.cc b/dumpers.cc
index 5236b4c..ec03fe4 100644
--- a/dumpers.cc
+++ b/dumpers.cc
@@ -1,160 +1,214 @@
-
 /* -------------------------------------------------------------------------- */
 #include "dumpers.hh"
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 #include <array>
 #include <fstream>
 #include <iomanip>
 #include <sstream>
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 void Dumper::set_min(float min) { m_min = min; }
 
 void Dumper::set_max(float max) { m_max = max; }
 
 float Dumper::min() const { return m_min; }
 
 float Dumper::max() const { return m_max; }
 
 /* -------------------------------------------------------------------------- */
 void DumperASCII::dump(const Grid & grid, int step) {
   std::ofstream fout;
   std::stringstream sfilename;
 
   sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".pgm";
   fout.open(sfilename.str());
 
   int m = grid.m();
   int n = grid.n();
 
   fout << "P2" << std::endl << "# CREATOR: Poisson program" << std::endl;
   fout << m << " " << n << std::endl;
   fout << 255 << std::endl;
 
   for (int i = 0; i < m; i++) {
     for (int j = 0; j < n; j++) {
       int v = 255. * (grid(i, j) - m_min) / (m_max - m_min);
       v = std::min(v, 255);
       fout << v << std::endl;
     }
   }
 }
 
 /* -------------------------------------------------------------------------- */
 void DumperBinary::dump(const Grid & grid, int step) {
   std::stringstream sfilename;
 
-  int prank, psize;
+  std::array<int, 2> coords{0, 0}, dims{0, 0};
+#if defined(_MPI)
+  int prank{0}, psize{1};
   MPI_Comm_rank(m_communicator, &prank);
   MPI_Comm_size(m_communicator, &psize);
 
+  std::array<int, 2> periods{0, 0};
+  MPI_Cart_get(m_communicator, dims.size(), dims.data(), periods.data(), coords.data());
+#else
+  dims = {1, 1};
+#endif
+
   sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".bmp";
 
   int h = grid.m();
   int w = grid.n();
+  int global_w = w;
+
+  std::array<int, 2> starts = {0, 0};
+  std::array<int, 2> local_sizes = {h, w};
+  std::array<int, 2> global_sizes = local_sizes;
 
-  int start = 0;
+#if defined(_MPI)
+  // removing the ghosts
+  if (prank == 0) std::cout << dims[0] << " " << dims[1] << "\n";
 
-  // removing the ghosts from the height
-  if (psize > 1) {
-    h = (prank == 0 || prank == psize - 1 ? h - 1 : h - 2);
-    start = (prank == 0 ? 0 : 1);
+  for (std::size_t i = 0; i < local_sizes.size(); ++i) {
+    if (dims[i] > 1) {
+      local_sizes[i] -= (coords[i] == 0 || coords[i] == dims[i] - 1 ? 1 : 2);
+      starts[i] += (coords[i] == 0 ? 0 : 1);
+    }
   }
 
-  // Gathering the size of every processors, this could be done as in the
-  // constructor of the Simulation instead
-  std::vector<int> size_per_proc(psize);
-  MPI_Allgather(&h, 1, MPI_INT, size_per_proc.data(), 1, MPI_INT,
-                m_communicator);
+  std::array<MPI_Comm, 2> comms;
 
   // determining the local offset
-  int offset_h = 0;
-  for (int i = 0; i < prank; ++i) {
-    offset_h += size_per_proc[i];
+  std::array<int, 2> offset = {0, 0};
+
+  std::vector<std::vector<int>> line_col_size(2);
+
+  for (size_t i = 0; i < line_col_size.size(); ++i) {
+    std::array<int, 2> remain_dims = {false, false};
+    remain_dims[i] = true;
+    MPI_Cart_sub(m_communicator, remain_dims.data(), &(comms[i]));
+
+    line_col_size[i].resize(dims[i]);
+    MPI_Allgather(&local_sizes[i], 1, MPI_INT, line_col_size[i].data(), 1,
+                  MPI_INT, comms[i]);
+
+    global_sizes[i] = 0;
+    for (int j = 0; j < dims[i]; ++j) {
+      global_sizes[i] += line_col_size[i][j];
+    }
   }
 
-  int total_h = offset_h;
-  for (int i = prank; i < psize; ++i) {
-    total_h += size_per_proc[i];
+  //for (int i = dims[0] - 1; i > coords[0]; --i) {
+  for (int i = 0; i < coords[0]; ++i) {
+    offset[0] += line_col_size[0][i];
   }
 
-  int row_size = 3 * w;
+  for (int i = 0; i < coords[1]; ++i) {
+    offset[1] += line_col_size[1][i];
+  }
+#endif
+
+  global_w = 4 * ((24 * global_sizes[1] + 31) / 32);
+
   // if the file width (3*w) is not a multiple of 4 adds enough bytes to make it
   // a multiple of 4
-  int padding = (4 - (row_size) % 4) % 4;
-  row_size += padding;
+  int padding = global_w - 3 * global_sizes[1];
 
-  int filesize = 54 + row_size * total_h;
+  w = 3 * local_sizes[1];
 
-  std::vector<char> img(row_size * h);
+  if (coords[1] == dims[1] - 1) {
+    w += padding;
+  } else {
+    padding = 0;
+  }
+
+  int filesize = 54 + global_sizes[0] * global_w;
+
+  std::vector<char> img(local_sizes[0] * w);
   std::fill(img.begin(), img.end(), 0);
 
-  for (int i = 0; i < h; i++) {
-    for (int j = 0; j < w; j++) {
-      float v = ((grid(i + start, j) - m_min) / (m_max - m_min));
+  for (int i = 0; i < local_sizes[0]; i++) {
+    for (int j = 0; j < local_sizes[1]; j++) {
+      float v =
+          ((grid(i + starts[0], j + starts[1]) - m_min) / (m_max - m_min));
 
       float r = v * 255; // Red channel
       float g = v * 255; // Green channel
-      float b = v * 255; // Red channel
+      float b = 0.f;     // prank * 255 / psize; // Blue channel
 
       r = std::min(r, 255.f);
       g = std::min(g, 255.f);
       b = std::min(b, 255.f);
 
-      img[row_size * i + 3 * j + 2] = r;
-      img[row_size * i + 3 * j + 1] = g;
-      img[row_size * i + 3 * j + 0] = b;
+      img[w * i + 3 * j + 0] = r;
+      img[w * i + 3 * j + 1] = g;
+      img[w * i + 3 * j + 2] = b;
     }
   }
 
+  auto write_bytes = [](auto && array, auto && value) {
+    array[0] = value;
+    array[1] = value >> 8;
+    array[2] = value >> 16;
+    array[3] = value >> 24;
+  };
+
   std::array<char, 14> bmpfileheader = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
   std::array<char, 40> bmpinfoheader = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
 
-  bmpfileheader[2] = filesize;
-  bmpfileheader[3] = filesize >> 8;
-  bmpfileheader[4] = filesize >> 16;
-  bmpfileheader[5] = filesize >> 24;
-
-  bmpinfoheader[4] = w;
-  bmpinfoheader[5] = w >> 8;
-  bmpinfoheader[6] = w >> 16;
-  bmpinfoheader[7] = w >> 24;
-  bmpinfoheader[8] = total_h;
-  bmpinfoheader[9] = total_h >> 8;
-  bmpinfoheader[10] = total_h >> 16;
-  bmpinfoheader[11] = total_h >> 24;
-  bmpinfoheader[20] = (filesize - 54);
-  bmpinfoheader[21] = (filesize - 54) >> 8;
-  bmpinfoheader[22] = (filesize - 54) >> 16;
-  bmpinfoheader[23] = (filesize - 54) >> 24;
+  write_bytes(&bmpfileheader[2], filesize);
+
+  write_bytes(&bmpinfoheader[4], global_sizes[1]);
+  write_bytes(&bmpinfoheader[8], global_sizes[0]);
+  write_bytes(&bmpinfoheader[20], filesize - 54);
 
+#if defined(_MPI)
   MPI_File fh;
   MPI_Status status;
+  MPI_Datatype subarray_t;
+
+#if defined(_DEBUG_MPI)
+  for (int i = 0; i < psize; ++i) {
+    if (prank == i)
+      std::cerr << prank << " (" << coords[0] << ", " << coords[1] << ")"
+                << " - " << local_sizes[0] << "x" << local_sizes[1] << "(" << w
+                << ") + " << offset[0] << "x" << offset[1] << "("
+                << 3 * offset[1] << ") ++ [" << global_sizes[0] << "x"
+                << global_sizes[1] << "(" << global_w << ")] " << padding
+                << "\n";
+    MPI_Barrier(m_communicator);
+  }
+#endif
+
+  global_sizes[1] = global_w;
+  local_sizes[1] = w;
+  offset[1] *= 3;
+
+  MPI_Type_create_subarray(global_sizes.size(), global_sizes.data(),
+                           local_sizes.data(), offset.data(), MPI_ORDER_C,
+                           MPI_CHAR, &subarray_t);
+  MPI_Type_commit(&subarray_t);
 
   // opening a file in write and create mode
   MPI_File_open(MPI_COMM_WORLD, sfilename.str().c_str(),
-                MPI_MODE_WRONLY | MPI_MODE_CREATE,
-                MPI_INFO_NULL, &fh);
+                MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
 
   // defining the size of the file
   MPI_File_set_size(fh, filesize);
 
   // rank 0 writes the header
   if (prank == 0) {
     MPI_File_write_at(fh, 0, bmpfileheader.data(), 14, MPI_CHAR, &status);
     MPI_File_write_at(fh, 14, bmpinfoheader.data(), 40, MPI_CHAR, &status);
   }
 
-  int offset = 54 + row_size * offset_h;
-
-  // We also could write that data with a write_at, the view is just to show
-  // different possibilities
-  MPI_File_set_view(fh, offset, MPI_CHAR, MPI_CHAR, "native",
-                    MPI_INFO_NULL);
+  if (prank == 0) {
+    std::cout << filesize << std::endl;
+  }
+
+  MPI_File_set_view(fh, 54, MPI_CHAR, subarray_t, "native", MPI_INFO_NULL);
 
   MPI_File_write(fh, img.data(), img.size(), MPI_CHAR, &status);
 
   MPI_File_close(&fh);
+#endif
 }
diff --git a/dumpers.hh b/dumpers.hh
index 5d55e8e..dc9b013 100644
--- a/dumpers.hh
+++ b/dumpers.hh
@@ -1,47 +1,58 @@
+#if defined(_MPI)
 #include <mpi.h>
+#endif
 
 #ifndef DUMPERS_HH
 #define DUMPERS_HH
 
 /* -------------------------------------------------------------------------- */
 class Grid;
 
 /* -------------------------------------------------------------------------- */
 class Dumper {
 public:
+#if defined(_MPI)
   explicit Dumper(MPI_Comm communicator)
       : m_min(-1.), m_max(1.), m_communicator(communicator) {}
-
+#else
+  Dumper() : m_min(-1.), m_max(1.) {}
+#endif
   virtual void dump(const Grid & m_grid, int step) = 0;
 
   void set_min(float min);
   void set_max(float max);
 
   float min() const;
   float max() const;
 
 protected:
   float m_min, m_max;
-
+#if defined(_MPI)
   MPI_Comm m_communicator;
+#endif
 };
 
 /* -------------------------------------------------------------------------- */
 class DumperASCII : public Dumper {
 public:
-  explicit DumperASCII(MPI_Comm communicator)
-      : Dumper(communicator) {}
+#if defined(_MPI)
  explicit DumperASCII(MPI_Comm communicator) : Dumper(communicator) {}
+#else
+  DumperASCII() : Dumper() {}
+#endif
 
   virtual void dump(const Grid & grid, int step);
 };
 
 /* -------------------------------------------------------------------------- */
 class DumperBinary : public Dumper {
 public:
-  explicit DumperBinary(MPI_Comm communicator)
-      : Dumper(communicator) {}
-
+#if defined(_MPI)
+  explicit DumperBinary(MPI_Comm communicator) : Dumper(communicator) {}
+#else
+  DumperBinary() : Dumper() {}
+#endif
 
   virtual void dump(const Grid & grid, int step);
 };
 
 #endif /* DUMPERS_HH */
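Note on DumperBinary::dump(): the BMP row stride is padded up to a multiple of 4 bytes, global_w = 4 * ((24 * w + 31) / 32), and each rank then writes its own sub-block of pixel rows/columns through an MPI subarray datatype installed as the file view right after the 54-byte header. A reduced sketch of that subarray file-view pattern (function and parameter names are made up, error handling omitted):

#include <mpi.h>
#include <vector>

// Writes one rank's (local_h x local_w_bytes) block of a global
// (global_h x global_w_bytes) byte image at offset (row_off, byte_off),
// skipping a 54-byte header, as DumperBinary does for the BMP payload.
void write_block(MPI_Comm comm, const char * filename, int global_h,
                 int global_w_bytes, int local_h, int local_w_bytes,
                 int row_off, int byte_off, const std::vector<char> & block) {
  int sizes[2] = {global_h, global_w_bytes};
  int subsizes[2] = {local_h, local_w_bytes};
  int starts[2] = {row_off, byte_off};

  MPI_Datatype subarray;
  MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_CHAR,
                           &subarray);
  MPI_Type_commit(&subarray);

  MPI_File fh;
  MPI_File_open(comm, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE,
                MPI_INFO_NULL, &fh);

  // The view maps this rank's file accesses onto its block, after the header.
  MPI_File_set_view(fh, 54, MPI_CHAR, subarray, "native", MPI_INFO_NULL);

  MPI_Status status;
  MPI_File_write(fh, block.data(), static_cast<int>(block.size()), MPI_CHAR,
                 &status);

  MPI_File_close(&fh);
  MPI_Type_free(&subarray);
}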
diff --git a/grid.cc b/grid.cc
index ef28218..1ac1ef1 100644
--- a/grid.cc
+++ b/grid.cc
@@ -1,60 +1,119 @@
 /* -------------------------------------------------------------------------- */
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 #include <algorithm>
+#if defined(_DEBUG_MPI)
+#include <iomanip>
+#endif
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 Grid::Grid(int m, int n) : m_m(m), m_n(n), m_storage(m * n) { clear(); }
 
+/* -------------------------------------------------------------------------- */
+Grid::~Grid() {
+#if defined(_MPI)
+  if (column_t != MPI_DATATYPE_NULL) {
+    MPI_Type_free(&column_t);
+    MPI_Type_free(&line_t);
+  }
+#endif
+}
+
 /* -------------------------------------------------------------------------- */
 void Grid::clear() { std::fill(m_storage.begin(), m_storage.end(), 0.); }
 
 /* -------------------------------------------------------------------------- */
 void Grid::resize(int m, int n) {
   m_m = m;
   m_n = n;
-  m_storage.resize(m*n);
+  m_storage.resize(m * n);
 }
 
 /* -------------------------------------------------------------------------- */
 int Grid::m() const { return m_m; }
 int Grid::n() const { return m_n; }
 
 /* -------------------------------------------------------------------------- */
-void Grid::initializeHaloCommunications(MPI_Comm & m_communicator) {
-  // retrieving the number of proc and the rank in the proc pool
-  MPI_Comm_rank(m_communicator, &m_prank);
-  MPI_Comm_size(m_communicator, &m_psize);
+#if defined(_MPI)
+void Grid::initializeHaloCommunications(MPI_Comm & m_communicator,
+                                        const std::array<int, 2> & ghosts) {
+  std::array<int, 2> coords;
+  int prank;
+
+  MPI_Type_vector(m_m - ghosts[0], 1, m_n, MPI_FLOAT, &column_t);
+  MPI_Type_vector(1, m_n - ghosts[1], 1, MPI_FLOAT, &line_t);
+
+  MPI_Type_commit(&column_t);
+  MPI_Type_commit(&line_t);
+
+  MPI_Comm_rank(m_communicator, &prank);
+  MPI_Cart_coords(m_communicator, prank, coords.size(), coords.data());
 
   // determining the rank of the neighbors
-  m_north_prank = (m_prank == 0 ? MPI_PROC_NULL : m_prank - 1);
-  m_south_prank = (m_prank == (m_psize - 1) ? MPI_PROC_NULL : m_prank + 1);
-
-  /// preparing synchronize north
-  MPI_Send_init(&(operator()(1, 0)), m_n, MPI_FLOAT, m_north_prank, 0,
-                m_communicator, &m_requests[2]);
-  MPI_Recv_init(&(operator()(0, 0)), m_n, MPI_FLOAT, m_north_prank, 0,
-                m_communicator, &m_requests[0]);
-
-  /// preparing synchronize south
-  MPI_Send_init(&(operator()(m_m - 2, 0)), m_n, MPI_FLOAT, m_south_prank, 0,
-                m_communicator, &m_requests[3]);
-  MPI_Recv_init(&(operator()(m_m - 1, 0)), m_n, MPI_FLOAT, m_south_prank, 0,
-                m_communicator, &m_requests[1]);
-}
+  int left, right, top, bottom;
+  MPI_Cart_shift(m_communicator, 0, 1, &left, &right);
+  MPI_Cart_shift(m_communicator, 1, 1, &top, &bottom);
 
-/* -------------------------------------------------------------------------- */
-void Grid::startSynchronization() {
-  MPI_Startall(4, m_requests.data());
-}
+#if defined(_DEBUG_MPI)
+  static bool first = true;
+  if (first) {
+    int psize;
+    MPI_Comm_size(m_communicator, &psize);
+    for (int i = 0; i < psize; ++i) {
+      if (prank == i) {
+        std::cerr << prank << std::setw(7) << top << "\n"
+                  << prank << std::setw(8) << "^\n"
+                  << prank << std::setw(3) << left << " < . > " << right << "\n"
+                  << prank << std::setw(8) << "v\n"
+                  << prank << std::setw(7) << bottom << "\n\n";
+      }
+      MPI_Barrier(m_communicator);
+    }
+    first = false;
+  }
+#endif
 
-/* -------------------------------------------------------------------------- */
-void Grid::waitReceives() {
-  MPI_Waitall(2, m_requests.data(), MPI_STATUS_IGNORE);
-}
+  auto addr = [&](auto && i, auto && j) { return &(this->operator()(i, j)); };
 
-/* -------------------------------------------------------------------------- */
-void Grid::waitSends() {
-  MPI_Waitall(2, m_requests.data() + 2, MPI_STATUS_IGNORE);
+  MPI_Request * rrequest = m_requests.data();
+  MPI_Request * srequest = m_requests.data() + 4;
+
+  static int tag = 1;
+  /// preparing receives
+  {
+    auto send = addr(m_m - 2, 1);
+    auto recv = addr(0, 1);
+    MPI_Send_init(send, 1, line_t, bottom, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, line_t, top, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
+  {
+    auto send = addr(1, m_n - 2);
+    auto recv = addr(1, 0);
+    MPI_Send_init(send, 1, column_t, right, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, column_t, left, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
+  {
+    auto send = addr(1, 1);
+    auto recv = addr(m_m - 1, 1);
+    MPI_Send_init(send, 1, line_t, top, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, line_t, bottom, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
+  {
+    auto send = addr(1, 1);
+    auto recv = addr(1, m_n - 1);
+    MPI_Send_init(send, 1, column_t, left, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, column_t, right, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
 }
+#endif
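Note on the halo setup in grid.cc: columns of a row-major grid are not contiguous, so they are described with MPI_Type_vector (one float per row, stride m_n) and exchanged through persistent requests started later with MPI_Startall. A self-contained sketch of one left/right exchange built the same way (the struct, names and tags are illustrative, not the patch's own):

#include <mpi.h>
#include <vector>

// Persistent left/right halo exchange for a row-major m x n float grid:
// column j=1 goes left, column j=n-2 goes right, ghost columns j=0 and
// j=n-1 are filled from the corresponding neighbours.
struct ColumnHalo {
  MPI_Datatype column_t{MPI_DATATYPE_NULL};
  MPI_Request requests[4];

  void setup(MPI_Comm cart, std::vector<float> & grid, int m, int n) {
    MPI_Type_vector(m, 1, n, MPI_FLOAT, &column_t); // m blocks of 1, stride n
    MPI_Type_commit(&column_t);

    int left, right;
    MPI_Cart_shift(cart, 1, 1, &left, &right); // neighbours along dimension 1

    MPI_Send_init(&grid[1], 1, column_t, left, 0, cart, &requests[0]);
    MPI_Send_init(&grid[n - 2], 1, column_t, right, 1, cart, &requests[1]);
    MPI_Recv_init(&grid[0], 1, column_t, left, 1, cart, &requests[2]);
    MPI_Recv_init(&grid[n - 1], 1, column_t, right, 0, cart, &requests[3]);
  }

  void exchange() {
    MPI_Startall(4, requests);
    MPI_Waitall(4, requests, MPI_STATUSES_IGNORE);
  }

  void teardown() {
    for (auto & r : requests)
      MPI_Request_free(&r);
    MPI_Type_free(&column_t);
  }
};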
diff --git a/grid.hh b/grid.hh
index b68a933..5602db2 100644
--- a/grid.hh
+++ b/grid.hh
@@ -1,50 +1,134 @@
 #ifndef GRID_HH
 #define GRID_HH
 
 /* -------------------------------------------------------------------------- */
-#include
 #include <array>
+#include <vector>
+#if defined(_MPI)
 #include <mpi.h>
+#endif
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+#if defined(_DEBUG)
+#include <iostream>
+#endif
/* -------------------------------------------------------------------------- */
+namespace {
+#if not defined(_DEBUG)
+std::size_t thread_id() __attribute__((unused));
+std::size_t nb_threads() __attribute__((unused));
+#endif
+
+std::size_t thread_id() {
+#if defined(_OPENMP)
+  return omp_get_thread_num();
+#else
+  return 0;
+#endif
+}
+
+std::size_t nb_threads() {
+#if defined(_OPENMP)
+  return omp_get_max_threads();
+#else
+  return 1;
+#endif
+}
+} // namespace
+
+namespace {
+struct DEBUG {
+#if defined(_DEBUG)
+  static void print(const std::string & func) {
+    auto nthreads = nb_threads();
+    auto nthread = thread_id();
+#pragma omp critical
+    std::cerr << nthread << "/" << nthreads << ": " << func << "\n";
+  }
+#else
+  static void print(const std::string & /*func*/) {}
+#endif
+};
+} // namespace
+
 /* -------------------------------------------------------------------------- */
 class Grid {
 public:
   Grid(int m = 0, int n = 0);
+  ~Grid();
 
   /// access the value [i][j] of the grid
   inline float & operator()(int i, int j) { return m_storage[i * m_n + j]; }
   inline const float & operator()(int i, int j) const {
     return m_storage[i * m_n + j];
   }
 
   void resize(int m, int n);
 
   /// set the grid to 0
   void clear();
 
   int m() const;
   int n() const;
 
-  void initializeHaloCommunications(MPI_Comm & m_communicator);
-  void startSynchronization();
-  void waitReceives();
-  void waitSends();
+#if defined(_MPI)
+  void initializeHaloCommunications(MPI_Comm & m_communicator,
+                                    const std::array<int, 2> & ghosts);
 
+  void startSynchronization() {
+    MPI_Request * requests = m_requests.data();
+#pragma omp task depend(in : requests [0:8])
+    {
+      DEBUG::print(__PRETTY_FUNCTION__);
+      MPI_Startall(8, requests);
+    }
+  }
 
-private:
-  int m_m, m_n;
-  std::vector<float> m_storage;
+  void waitReceives() {
+    auto requests = m_requests.data();
+#pragma omp task depend(out : requests [0:4])
+    {
+      DEBUG::print(__PRETTY_FUNCTION__);
+      MPI_Waitall(4, requests, MPI_STATUS_IGNORE);
+    }
+  }
 
-  /// local neighbors
-  int m_north_prank, m_south_prank;
+  void waitSends() {
+    auto requests = m_requests.data();
+#pragma omp task depend(out : requests [4:4])
+    {
+      DEBUG::print(__PRETTY_FUNCTION__);
+      MPI_Waitall(4, requests + 4, MPI_STATUS_IGNORE);
+    }
+  }
+#else
+  void startSynchronization() {
+#pragma omp task
+    DEBUG::print(__PRETTY_FUNCTION__);
+  }
 
-  /// proc rank
-  int m_prank;
+  void waitReceives() {
+#pragma omp task
+    DEBUG::print(__PRETTY_FUNCTION__);
+  }
 
-  /// nb process
-  int m_psize;
+  void waitSends() {
+#pragma omp task
+    DEBUG::print(__PRETTY_FUNCTION__);
+  }
+#endif
 
+private:
+  int m_m, m_n;
+  std::vector<float> m_storage;
+
+#if defined(_MPI)
   /// request for asynchronous communications
-  std::array<MPI_Request, 4> m_requests;
+  std::array<MPI_Request, 8> m_requests;
+
+  /// communication types
+  MPI_Datatype column_t{MPI_DATATYPE_NULL}, line_t{MPI_DATATYPE_NULL};
+#endif
 };
 
 #endif /* GRID_HH */
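Note on the task-based synchronization in grid.hh: startSynchronization(), waitReceives() and waitSends() each spawn an omp task whose depend clause on the request array serializes them against one another while independent compute tasks keep running; the patch chains them through in/out anti-dependences on requests[0:8], requests[0:4] and requests[4:4]. A minimal illustration of the underlying mechanism, written here in the plain producer/consumer form (the buffer and printed messages are invented for the example; compile with -fopenmp):

#include <cstdio>

int main() {
  int buffer[8] = {0};

  #pragma omp parallel
  #pragma omp single
  {
    int * b = buffer;

    // producer: fills both halves (plays the role of startSynchronization)
    #pragma omp task depend(out : b[0:4], b[4:4])
    for (int i = 0; i < 8; ++i)
      b[i] = i * i;

    // consumer of the first half (plays the role of waitReceives)
    #pragma omp task depend(in : b[0:4])
    std::printf("front half: %d %d %d %d\n", b[0], b[1], b[2], b[3]);

    // consumer of the second half (plays the role of waitSends)
    #pragma omp task depend(in : b[4:4])
    std::printf("back half: %d %d %d %d\n", b[4], b[5], b[6], b[7]);
  }
  return 0;
}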
diff --git a/poisson.cc b/poisson.cc
index f8e402d..4edaee0 100644
--- a/poisson.cc
+++ b/poisson.cc
@@ -1,63 +1,91 @@
 /* -------------------------------------------------------------------------- */
 #include "simulation.hh"
 /* -------------------------------------------------------------------------- */
 #include <chrono>
 #include <iostream>
 #include <sstream>
 #include <tuple>
-#include
 /* -------------------------------------------------------------------------- */
-#include
+#if defined(_MPI)
 #include <mpi.h>
+#endif
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
 /* -------------------------------------------------------------------------- */
 
-#define EPSILON 0.005
+#define EPSILON 0.1
 
 typedef std::chrono::high_resolution_clock clk;
 typedef std::chrono::duration<double> second;
 
 static void usage(const std::string & prog_name) {
   std::cerr << prog_name << " " << std::endl;
   exit(0);
 }
 
-
 int main(int argc, char * argv[]) {
-  MPI_Init(&argc, &argv);
-
-  int prank, psize;
+  int prank{0}, psize{1}, nthreads{1};
+#if defined(_MPI)
+  int provided;
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+  // assert(provided == MPI_THREAD_MULTIPLE);
 
   MPI_Comm_rank(MPI_COMM_WORLD, &prank);
   MPI_Comm_size(MPI_COMM_WORLD, &psize);
+#endif
+
+#if defined (_OPENMP)
+  nthreads = omp_get_max_threads();
+#endif
 
-  if (argc != 2) usage(argv[0]);
+  if (argc != 2)
+    usage(argv[0]);
 
   std::stringstream args(argv[1]);
   int N;
   args >> N;
 
-  if(args.fail()) usage(argv[0]);
+  if (args.fail())
+    usage(argv[0]);
 
-  Simulation simu(N, N, MPI_COMM_WORLD);
+  std::unique_ptr<Simulation> simu;
 
-  simu.set_initial_conditions();
+#if defined(_MPI)
+  simu = std::make_unique<Simulation>(N, N, MPI_COMM_WORLD);
+#else
+  simu = std::make_unique<Simulation>(N, N);
+#endif
 
-  simu.set_epsilon(EPSILON);
+  simu->set_initial_conditions();
+
+  simu->set_epsilon(EPSILON);
 
   float l2;
   int k;
 
   auto start = clk::now();
 
-  std::tie(l2, k) = simu.compute();
+  std::tie(l2, k) = simu->compute();
 
   auto end = clk::now();
 
   second time = end - start;
 
-  if(prank == 0)
-    std::cout << psize << " " << N << " "
-              << k << " " << std::scientific << l2 << " "
-              << time.count() << std::endl;
-
+  auto min = simu->min();
+  auto max = simu->max();
+
+  if (prank == 0)
+    std::cout << "MPI: " << psize << " (" << simu->dims(0) << ", "
+              << simu->dims(1) << ")\n"
+              << "OMP: " << nthreads << "\n"
+              << "Grid: " << N << "x" << N << "\n"
+              << "Bound: " << "[ " << min << " : " << max << " ]" << "\n"
+              << "Iter: " << k << "\n"
+              << "Norm: " << std::scientific << l2 << "\n"
+              << "Time: " << time.count() << "s" << "\n"
+              << "Flops: " << simu->complexity() * k / time.count() / 1e9
+              << " GFlops" << std::endl;
+
+#if defined(_MPI)
+  simu.reset(); // force destruction of the MPI-related members before MPI_Finalize
   MPI_Finalize();
+#endif
 
   return 0;
 }
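Note on the MPI initialization in poisson.cc: since MPI calls are issued from OpenMP tasks that may execute on any thread, the program asks for MPI_THREAD_MULTIPLE; the patch keeps the check of the granted level only as a comment. A sketch of what an explicit check could look like (the abort policy is an assumption, not the patch's behaviour):

#include <cstdio>
#include <mpi.h>

int main(int argc, char ** argv) {
  int provided = MPI_THREAD_SINGLE;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

  // Thread-support levels are ordered, so a plain comparison is enough.
  if (provided < MPI_THREAD_MULTIPLE) {
    std::fprintf(stderr, "MPI_THREAD_MULTIPLE not available (got %d)\n",
                 provided);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  // ... the simulation would run here ...

  MPI_Finalize();
  return 0;
}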
diff --git a/simulation.cc b/simulation.cc
index 0278d94..1dd2abb 100644
--- a/simulation.cc
+++ b/simulation.cc
@@ -1,125 +1,310 @@
 /* -------------------------------------------------------------------------- */
 #include "simulation.hh"
 /* -------------------------------------------------------------------------- */
+#include <cassert>
 #include <cmath>
 #include <iostream>
-/* -------------------------------------------------------------------------- */
+#include <memory>
+#if defined(_MPI)
+#include <mpi.h>
+#endif
 /* -------------------------------------------------------------------------- */
 
-Simulation::Simulation(int m, int n, MPI_Comm communicator)
-    : m_global_m(m), m_global_n(n), m_epsilon(1e-7), m_h_m(1. / m),
-      m_h_n(1. / n), m_dumper(new DumperBinary(communicator)),
-      m_communicator(communicator) {
+#if defined(_MPI)
+Simulation::Simulation(int m, int n, MPI_Comm communicator) {
+#else
+Simulation::Simulation(int m, int n) {
+#endif
 
-  // retrieving the number of proc and the rank in the proc pool
+// retrieving the number of proc and the rank in the proc pool
+#if defined(_MPI)
+  MPI_Comm_size(communicator, &m_psize);
+
+  m_dims = {0, 0};
+  std::array<int, 2> periods{0, 0};
+  MPI_Dims_create(m_psize, m_dims.size(), m_dims.data());
+
+  assert(m_dims[0] * m_dims[1] == m_psize &&
+         "MPI_Dims_create returned an inconsistent decomposition");
+
+  MPI_Cart_create(communicator, m_dims.size(), m_dims.data(), periods.data(), 0,
+                  &m_communicator);
+
   MPI_Comm_rank(m_communicator, &m_prank);
-  MPI_Comm_size(m_communicator, &m_psize);
 
-  // computation of the local size of the grid the remainder is spread equally
-  // on the first processors
-  m_local_m = m / m_psize + (m_prank < m % m_psize ? 1 : 0);
-  m_local_n = n;
+  MPI_Cart_coords(m_communicator, m_prank, m_coords.size(), m_coords.data());
 
-  // adding the ghosts lines if needed
-  if (m_psize > 1)
-    m_local_m += (m_prank == 0 || m_prank == m_psize - 1) ? 1 : 2;
+  m_dumper = std::make_unique<DumperBinary>(m_communicator);
+#else
+  m_dims = {1, 1};
+  m_coords = {0, 0};
+  m_dumper = std::make_unique<DumperBinary>();
+#endif
 
-  // computing the offsets of the local grid in the global one
-  m_offset_m =
-      (m / m_psize) * m_prank + (m_prank < m % m_psize ? m_prank : m % m_psize);
-  m_offset_n = 0;
+  m_global_size = {m, n};
 
-  // resizing the different grids
-  m_grids.resize(m_local_m, m_local_n);
-  m_f.resize(m_local_m, m_local_n);
+  std::array<int, 2> ghosts{0, 0};
+
+  for (std::size_t i = 0; i < m_local_size.size(); ++i) {
+    // computation of the local size of the grid; the remainder is spread
+    // equally on the first processors
+    m_local_size[i] = m_global_size[i] / m_dims[i] +
+                      (m_coords[i] < m_global_size[i] % m_dims[i] ? 1 : 0);
+
+    // adding the ghosts lines if needed
+    if (m_dims[i] > 1)
+      ghosts[i] = (m_coords[i] == 0 || m_coords[i] == m_dims[i] - 1) ? 1 : 2;
+
+    m_local_size[i] += ghosts[i];
 
-  m_grids.current().initializeHaloCommunications(communicator);
-  m_grids.old().initializeHaloCommunications(communicator);
+    // computing the offsets of the local grid in the global one
+    m_offset[i] = (m_global_size[i] / m_dims[i]) * m_coords[i] +
+                  (m_coords[i] < m_global_size[i] % m_dims[i]
+                       ? m_coords[i]
+                       : m_global_size[i] % m_dims[i]);
+
+    m_h[i] = 1. / m_global_size[i];
+  }
+
+#if defined(_DEBUG_MPI)
+  for (int i = 0; i < m_psize; ++i) {
+    if (m_prank == i)
+      std::cerr << m_prank << " (" << m_coords[0] << ", " << m_coords[1] << ")"
+                << " - " << m_local_size[0] << "x" << m_local_size[1] << " + "
+                << m_offset[0] << "x" << m_offset[1] << " ++ [" << ghosts[0]
+                << "x" << ghosts[1] << "]\n";
+    MPI_Barrier(m_communicator);
+  }
+#endif
+
+  // resizing the different grids
+  m_grids.resize(m_local_size[0], m_local_size[1]);
+  m_f.resize(m_local_size[0], m_local_size[1]);
+
+#if defined(_MPI)
+  m_grids.current().initializeHaloCommunications(m_communicator, ghosts);
+  m_grids.old().initializeHaloCommunications(m_communicator, ghosts);
+#endif
 }
 
 /* -------------------------------------------------------------------------- */
 void Simulation::set_initial_conditions() {
-  int i_start = (m_prank == 0 ? 0 : 1);
-  int i_end = (m_prank == m_psize - 1 ? m_local_m : m_local_m - 1);
+  std::array<int, 2> start = {0, 0};
+  std::array<int, 2> end = m_local_size;
+
+  for (int i = 0; i < 2; ++i) {
+    start[i] += (m_coords[i] == 0 ? 0 : 1);
+    end[i] -= (m_coords[i] == m_dims[i] - 1 ? 0 : 1);
+  }
+
+  float min = 0., max = 0.;
 
   float a = 5.5 * M_PI;
-  for (int i = i_start; i < i_end; i++) {
-    for (int j = 0; j < m_local_n; j++) {
-      m_f(i, j) = -2. * a * a *
-                  std::sin(a * (m_offset_m + i - i_start) * m_h_m) *
-                  std::sin(a * (m_offset_n + j) * m_h_n);
+  for (int i = start[0]; i < end[0]; i++) {
+    for (int j = start[1]; j < end[1]; j++) {
+      m_f(i, j) =
+          -2. * a * a * std::sin(a * (m_offset[0] + i - start[0]) * m_h[0]) *
+          std::sin(a * (m_offset[1] + j - start[1]) * m_h[1]) * m_h[0] * m_h[1];
+
+      min = std::min(min, m_f(i, j));
+      max = std::max(max, m_f(i, j));
     }
   }
+
+  m_dumper->set_min(min);
+  m_dumper->set_max(max);
+  m_dumper->dump(m_f, 1);
 }
 
 /* -------------------------------------------------------------------------- */
 std::tuple<float, int> Simulation::compute() {
   int s = 0;
-  float l2 = 0;
+
+  m_epsilon = m_epsilon * std::min(m_h[0], m_h[1]);
+
+  // std::cout << m_h_m << " " << m_h_n << " " << m_epsilon << std::endl;
+  float l2 = 0.;
+
+#pragma omp parallel default(shared)
+#pragma omp single
   do {
     l2 = compute_step();
+    DEBUG::print("do_step: " + std::to_string(l2));
 
     m_grids.swap();
 
-    m_dumper->dump(m_grids.old(), s);
+#if defined(_MPI)
+    MPI_Allreduce(MPI_IN_PLACE, &l2, 1, MPI_FLOAT, MPI_SUM, m_communicator);
+    DEBUG::print("MPI_Allreduce: " + std::to_string(l2));
+#endif
+
     ++s;
+
+    if (s == 100)
+      break;
   } while (l2 > m_epsilon);
 
+  m_dumper->set_min(min());
+  m_dumper->set_max(max());
+  m_dumper->dump(m_grids.old(), 0);
+
   return std::make_tuple(l2, s);
 }
 
 /* -------------------------------------------------------------------------- */
 void Simulation::set_epsilon(float epsilon) { m_epsilon = epsilon; }
 
 /* -------------------------------------------------------------------------- */
 float Simulation::epsilon() const { return m_epsilon; }
 
 /* -------------------------------------------------------------------------- */
 inline float Simulation::compute_row(int i) {
-  float l2 = 0;
+  float l2 = 0.;
 
   Grid & u = m_grids.current();
   Grid & uo = m_grids.old();
 
-  for (int j = 1; j < m_local_n - 1; j++) {
+  for (int j = 1; j < m_local_size[1] - 1; j++) {
     // computation of the new step
-    u(i, j) = 0.25 * (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) +
-                      uo(i, j + 1) - m_f(i, j) * m_h_m * m_h_n);
+    u(i, j) = (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) + uo(i, j + 1) -
+               m_f(i, j)) /
+              4.;
 
     // L2 norm
     l2 += (uo(i, j) - u(i, j)) * (uo(i, j) - u(i, j));
   }
 
+  DEBUG::print(std::string{__PRETTY_FUNCTION__} + " - i=" + std::to_string(i));
+
   return l2;
 }
 
 /* -------------------------------------------------------------------------- */
 float Simulation::compute_step() {
-  float l2 = 0.;
-
-  Grid & uo = m_grids.old();
+  auto nthreads = nb_threads();
+  std::vector<float> l2(nthreads, 0);
+
+  auto & uo = m_grids.old();
+  auto & u = m_grids.current();
+
+  auto _compute = [&](auto && i, auto && j, auto && nthread) {
+    u(i, j) = (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) + uo(i, j + 1) -
+               m_f(i, j)) /
+              4.;
 
-  // start the permanent communications
-  uo.startSynchronization();
+    l2[nthread] += (uo(i, j) - u(i, j)) * (uo(i, j) - u(i, j));
+  };
 
+#pragma omp taskgroup
+  {
+    // start the permanent communications
+    uo.startSynchronization();
+
+#pragma omp taskloop grainsize(50)
   // computing the inner rows that do not depend on the ghosts
-  for (int i = 2; i < m_local_m - 2; i++) {
-    l2 += compute_row(i);
-  }
+    for (int i = 2; i < m_local_size[0] - 2; i++) {
+      auto nthread = thread_id();
+      for (int j = 2; j < m_local_size[1] - 2; j++) {
+        _compute(i, j, nthread);
+      }
+    }
 
-  /// wait to receive the ghosts before using them for them computation
-  uo.waitReceives();
+    /// wait to receive the ghosts before using them for the computation
+    uo.waitReceives();
+  }
 
+#pragma omp taskgroup
+  {
   // computing the line that depends on the ghosts
-  l2 += compute_row(1);
-  l2 += compute_row(m_local_m - 2);
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[1] - 2; i++) {
+        _compute(1, i, nthread);
+      }
+    }
+
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[1] - 2; i++) {
+        _compute(m_local_size[0] - 2, i, nthread);
+      }
+    }
+
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[0] - 2; i++) {
+        _compute(i, 1, nthread);
+      }
+    }
+
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[0] - 2; i++) {
+        _compute(i, m_local_size[1] - 2, nthread);
+      }
+    }
+
+    /// wait to send everything before changing the buffers
+    uo.waitSends();
 
-  /// wait to send everything before changing the buffers
-  uo.waitSends();
+    // Summing the value of all the processors together
+#pragma omp task depend(in : l2)
+    {
+      m_l2_shared = 0.;
+      for (decltype(nthreads) i = 0; i < nthreads; ++i) {
+        m_l2_shared += l2[i];
+      }
+      DEBUG::print("Local reduce: " + std::to_string(m_l2_shared));
+    }
+  }
 
-  // Summing the value of all the processors together
-  MPI_Allreduce(MPI_IN_PLACE, &l2, 1, MPI_FLOAT, MPI_SUM, m_communicator);
+  return m_l2_shared;
+}
 
-  return l2;
+namespace {
+template <typename G, typename Func>
+auto accumulate_grid(const G & grid, float init, Func && func) {
+  auto m = grid.m();
+  auto n = grid.n();
+
+  for (decltype(m) i = 1; i < m - 1; i++) {
+    for (decltype(n) j = 1; j < n - 1; j++) {
+      init = func(init, grid(i, j));
+    }
+  }
+  return init;
+}
+} // namespace
+
+/* -------------------------------------------------------------------------- */
+float Simulation::min() const {
+  const auto & u0 = m_grids.old();
+
+  auto tmp = accumulate_grid(
+      u0, u0(1, 1), [](auto && a, auto && b) { return std::min(a, b); });
+
+#if defined(_MPI)
+  MPI_Allreduce(MPI_IN_PLACE, &tmp, 1, MPI_FLOAT, MPI_MIN, m_communicator);
+#endif
+
+  return tmp;
 }
+
+/* -------------------------------------------------------------------------- */
+float Simulation::max() const {
+  const auto & u0 = m_grids.old();
+
+  auto tmp = accumulate_grid(
+      u0, u0(1, 1), [](auto && a, auto && b) { return std::max(a, b); });
+
+#if defined(_MPI)
+  MPI_Allreduce(MPI_IN_PLACE, &tmp, 1, MPI_FLOAT, MPI_MAX, m_communicator);
+#endif
+
+  return tmp;
+}
+
+/* -------------------------------------------------------------------------- */
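Note on the decomposition arithmetic in the Simulation constructor: along each dimension, N cells are split over P ranks with the remainder handed to the first N % P ranks, and the offset follows the same rule before the ghost layers are added. A standalone helper showing exactly that formula (the struct and function names are illustrative, not the patch's):

#include <cassert>

struct Block {
  int size;   // number of owned cells along this dimension
  int offset; // index of the first owned cell in the global grid
};

inline Block split_1d(int N, int P, int coord) {
  assert(P > 0 && coord >= 0 && coord < P);
  Block b;
  b.size = N / P + (coord < N % P ? 1 : 0);
  b.offset = (N / P) * coord + (coord < N % P ? coord : N % P);
  return b;
}

// Example: N = 10, P = 4 gives sizes {3, 3, 2, 2} and offsets {0, 3, 6, 8};
// the constructor then adds 1 ghost layer on edge ranks and 2 in the middle.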
diff --git a/simulation.hh b/simulation.hh
index 1dfe321..e223819 100644
--- a/simulation.hh
+++ b/simulation.hh
@@ -1,76 +1,94 @@
 #ifndef SIMULATION_HH
 #define SIMULATION_HH
 
 /* -------------------------------------------------------------------------- */
 #include "double_buffer.hh"
 #include "dumpers.hh"
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
-#include
-#include
 #include <memory>
+#include <array>
+#include <tuple>
+#if defined(_MPI)
 #include <mpi.h>
+#endif
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 class Simulation {
 public:
+#if defined(_MPI)
   Simulation(int m, int n, MPI_Comm communicator);
-
+#else
+  Simulation(int m, int n);
+#endif
 
   /// set the initial conditions, Dirichlet and source term
   virtual void set_initial_conditions();
 
   /// perform the simulation
   std::tuple<float, int> compute();
 
   /// access the precision
   void set_epsilon(float epsilon);
   float epsilon() const;
 
+  std::size_t complexity() { return 8 * m_global_size[0] * m_global_size[1]; }
+
+  float min() const;
+  float max() const;
+
+  float dims(std::size_t i) const { return m_dims[i]; }
+
 protected:
   /// compute one step and return an error
   virtual float compute_step();
 
   /// compute one row
   float compute_row(int i);
 
 private:
   /// Global problem size
-  int m_global_m, m_global_n;
+  std::array<int, 2> m_global_size;
 
   /// Local problem size
-  int m_local_m, m_local_n;
+  std::array<int, 2> m_local_size;
 
   /// local offset
-  int m_offset_m, m_offset_n;
+  std::array<int, 2> m_offset;
 
-  /// local neighbors
-  int m_north_prank, m_south_prank;
+  /// Cartesian dimensions
+  std::array<int, 2> m_dims;
+
+  /// Cartesian coordinates
+  std::array<int, 2> m_coords;
 
   /// proc rank
-  int m_prank;
+  int m_prank{0};
 
   /// communicator size
-  int m_psize;
+  int m_psize{1};
 
   /// Precision to achieve
   float m_epsilon;
 
   /// grid spacing
-  float m_h_m;
-  float m_h_n;
+  std::array<float, 2> m_h;
+
+  float m_l2_shared;
 
   /// Grids storage
   DoubleBuffer m_grids;
 
   /// source term
   Grid m_f;
 
   /// Dumper to use for outputs
   std::unique_ptr<Dumper> m_dumper;
 
+#if defined(_MPI)
   /// communicator
   MPI_Comm m_communicator;
+#endif
 };
 
 #endif /* SIMULATION_HH */
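Note on the update performed by compute_row() and the _compute lambda: because set_initial_conditions() folds the m_h[0] * m_h[1] factor into m_f, one Jacobi relaxation step reduces to a four-point average minus the pre-scaled source, and the squared increment accumulated into l2 is what compute() compares against the (rescaled) epsilon after the MPI_Allreduce. A serial reference version of that kernel, assuming row-major storage (names are illustrative, not the patch's):

#include <vector>

// One Jacobi sweep over the interior of an m x n grid, with the h*h factor
// already folded into f_scaled, as in the patch. Returns the squared L2 norm
// of the change between the old and new iterates.
inline float jacobi_sweep(const std::vector<float> & uo, std::vector<float> & u,
                          const std::vector<float> & f_scaled, int m, int n) {
  float l2 = 0.f;
  for (int i = 1; i < m - 1; ++i) {
    for (int j = 1; j < n - 1; ++j) {
      u[i * n + j] = (uo[(i - 1) * n + j] + uo[(i + 1) * n + j] +
                      uo[i * n + j - 1] + uo[i * n + j + 1] -
                      f_scaled[i * n + j]) /
                     4.f;
      float d = uo[i * n + j] - u[i * n + j];
      l2 += d * d; // squared increment, summed over ranks before the test
    }
  }
  return l2;
}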