diff --git a/src/core/loop.hh b/src/core/loop.hh index 6414ff8..4e37c1b 100644 --- a/src/core/loop.hh +++ b/src/core/loop.hh @@ -1,221 +1,226 @@ /** * @file * * @author Lucas Frérot * * @section LICENSE * * Copyright (©) 2017 EPFL (Ecole Polytechnique Fédérale de * Lausanne) Laboratory (LSMS - Laboratoire de Simulation en Mécanique des * Solides) * * Tamaas is free software: you can redistribute it and/or modify it under the * terms of the GNU Lesser General Public License as published by the Free * Software Foundation, either version 3 of the License, or (at your option) any * later version. * * Tamaas is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with Tamaas. If not, see . * */ /* -------------------------------------------------------------------------- */ #ifndef __LOOP_HH__ #define __LOOP_HH__ /* -------------------------------------------------------------------------- */ #include "loops/apply.hh" #include "loops/loop_utils.hh" #include "tamaas.hh" #include #include #include #include #include #include #include __BEGIN_TAMAAS__ template struct is_policy : std::false_type {}; template <> struct is_policy : std::true_type {}; template <> struct is_policy : std::true_type {}; template <> struct is_policy : std::true_type {}; template <> struct is_policy : std::true_type {}; template <> struct is_policy : std::true_type {}; template <> struct is_policy : std::true_type {}; /** * @brief Singleton class for automated loops using lambdas * This class is sweet candy :) It provides abstraction of the paralelism * paradigm used in loops and allows simple and less erro-prone loop syntax, * with minimum boiler plate. I love it <3 */ class Loop { public: /// Backends enumeration enum backend { omp, ///< [OpenMP](http://www.openmp.org/specifications/) backend cuda, ///< [Cuda](http://docs.nvidia.com/cuda/index.html) backend }; /// Helper class to count iterations within lambda-loop template class arange { public: using it_type = thrust::counting_iterator; - arange(T size) : range_size(size) {} - it_type begin(UInt i = 0) const { return it_type(T(0)); } + arange(T start, T size) : start(start), range_size(size) {} + it_type begin(UInt i = 0) const { return it_type(T(start)); } it_type end(UInt i = 0) const { return it_type(range_size); } UInt getNbComponents() const { return 1; } private: - T range_size; + T start, range_size; }; template static arange range(T size) { - return arange(size); + return arange(0, size); + } + + template + static arange range(U start, T size) { + return arange(start, size); } /// Loop functor over any number of grids template static void loop(Functor&& func, Grids&&... containers); /// Strided loop over any number of grids with parallel policy template static void stridedLoop(const thrust::execution_policy& policy, Functor&& func, Grids&&... containers) { stridedLoopImpl(policy, std::forward(func), std::forward(containers)...); } /// Strided loop over any number of grids template static typename std::enable_if::value, void>::type stridedLoop(Functor&& func, Grids&&... containers) { stridedLoopImpl(thrust::device, std::forward(func), std::forward(containers)...); } private: /// Implementation of strided loop overloads template static void stridedLoopImpl(const thrust::execution_policy& policy, Functor&& func, Grids&&... containers); public: /// Reduce over any number of grids template static auto reduce(Functor&& func, Grids&&... containers) -> decltype(func(containers(0)...)); /// Strided reduce over any number of grids template static auto stridedReduce(Functor&& func, Grids&&... containers) -> decltype(func(containers(0)...)); /// Constructor Loop() = delete; }; /* -------------------------------------------------------------------------- */ /* Template implementation */ /* -------------------------------------------------------------------------- */ template void Loop::loop(Functor&& func, Grids&&... containers) { auto begin = thrust::make_zip_iterator(thrust::make_tuple(containers.begin()...)); auto end = thrust::make_zip_iterator(thrust::make_tuple(containers.end()...)); thrust::for_each(begin, end, detail::ApplyFunctor(func)); #ifdef USE_CUDA cudaDeviceSynchronize(); #endif } /* -------------------------------------------------------------------------- */ template void Loop::stridedLoopImpl( const thrust::execution_policy& /*policy*/, Functor&& func, Grids&&... containers) { auto begin = thrust::make_zip_iterator( thrust::make_tuple(containers.begin(containers.getNbComponents())...)); auto end = thrust::make_zip_iterator( thrust::make_tuple(containers.end(containers.getNbComponents())...)); thrust::for_each(begin, end, detail::ApplyFunctor(func)); #ifdef USE_CUDA cudaDeviceSynchronize(); #endif } /* -------------------------------------------------------------------------- */ template auto Loop::reduce(Functor&& func, Grids&&... containers) -> decltype(func(containers(0)...)) { auto begin = thrust::make_zip_iterator(thrust::make_tuple(containers.begin()...)); auto end = thrust::make_zip_iterator(thrust::make_tuple(containers.end()...)); using reduce_type = decltype(func(containers(0)...)); using apply_type = detail::ApplyFunctor; auto red_helper = detail::reduction_helper(apply_type(func)); auto result = thrust::reduce( begin, end, red_helper.template init(), red_helper); #ifdef USE_CUDA cudaDeviceSynchronize(); #endif return result; } /* -------------------------------------------------------------------------- */ template auto Loop::stridedReduce(Functor&& func, Grids&&... containers) -> decltype(func(containers(0)...)) { auto begin = thrust::make_zip_iterator( thrust::make_tuple(containers.begin(containers.getNbComponents())...)); auto end = thrust::make_zip_iterator( thrust::make_tuple(containers.end(containers.getNbComponents())...)); using reduce_type = decltype(func(containers(0)...)); using apply_type = detail::ApplyFunctor; auto red_helper = detail::reduction_helper(apply_type(func)); auto result = thrust::reduce( begin, end, red_helper.template init(), red_helper); #ifdef USE_CUDA cudaDeviceSynchronize(); #endif return result; } /* -------------------------------------------------------------------------- */ __END_TAMAAS__ #undef EXEC_CASE_MACRO #undef REDUCE_CASE_MACRO #endif // __LOOP_HH__ diff --git a/tests/test_loop.cpp b/tests/test_loop.cpp index afc593e..064d694 100644 --- a/tests/test_loop.cpp +++ b/tests/test_loop.cpp @@ -1,263 +1,263 @@ /** * * @author Lucas Frérot * * @section LICENSE * * Copyright (©) 2017 EPFL (Ecole Polytechnique Fédérale de * Lausanne) Laboratory (LSMS - Laboratoire de Simulation en Mécanique des * Solides) * * Tamaas is free software: you can redistribute it and/or modify it under the * terms of the GNU Lesser General Public License as published by the Free * Software Foundation, either version 3 of the License, or (at your option) any * later version. * * Tamaas is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with Tamaas. If not, see . * */ /* -------------------------------------------------------------------------- */ #include "test.hh" #include "grid.hh" #include "static_types.hh" #include "loop.hh" /* -------------------------------------------------------------------------- */ /* WARNING: here we cannot use lambdas for tests because GoogleTest declares */ /* test functions as private members of classes which is incompatible with */ /* cuda's extended lambdas. I know... it's f*cking stupid */ /* -------------------------------------------------------------------------- */ using namespace tamaas; template struct AddOneInplace { CUDA_LAMBDA void operator()(T& x) { x += 1; } }; // Testing loops on one grid TEST(TestLoops, OneArgument) { Grid grid({20}, 1); Grid solution({20}, 1); auto add_one = [] (Real & x) { return x + 1; }; std::iota(grid.begin(), grid.end(), 1); // Makeing solution std::transform(grid.begin(), grid.end(), solution.begin(), add_one); auto add_one_inplace = AddOneInplace(); Loop::loop(add_one_inplace, grid); ASSERT_TRUE(compare(grid, solution, AreFloatEqual())) << "One argument loop failed"; } struct PrimalTest { CUDA_LAMBDA void operator()(Int& primal, Int& val) { val = (primal > 0)? -1:1; } }; // Testing loops on two grids TEST(TestLoops, TwoArguments) { // Why no ints? Grid grid({20, 20}, 1); Grid primal({20, 20}, 1); Grid solution({20, 20}, 1); primal(0, 0) = 1; primal(0, 1) = 1; primal(1, 0) = 1; primal(1, 1) = 1; std::transform(primal.begin(), primal.end(), solution.begin(), [] (Int & primal) { return (primal > 0)? -1:1; }); auto primal_test = PrimalTest(); Loop::loop(primal_test, primal, grid); ASSERT_TRUE(compare(solution, grid)) << "Two argument loop failed"; } struct AssignUInt { CUDA_LAMBDA void operator()(UInt& x, UInt i) { x = i; } }; // Testing an enumeration TEST(TestLoops, Enumeration) { Grid grid({100}, 1); Grid solution({100}, 1); std::iota(solution.begin(), solution.end(), 0); auto assign_uint = AssignUInt(); - Loop::loop(assign_uint, grid, Loop::arange(100)); + Loop::loop(assign_uint, grid, Loop::range(100)); ASSERT_TRUE(compare(solution, grid)) << "Enumeration loop failed"; } /* -------------------------------------------------------------------------- */ struct Identity { CUDA_LAMBDA UInt operator()(UInt& x) const { return x; } }; // Testing one grid reductions TEST(TestReductions, OneArgument) { Grid grid({6}, 1); std::iota(grid.begin(), grid.end(), 1); const auto id = Identity(); // Sum reduction UInt sol = std::accumulate(grid.begin(), grid.end(), 0, std::plus()); UInt red = Loop::reduce(id, grid); ASSERT_TRUE(sol == red) << "Addition reduction failed on one argument"; // Product reduction sol = std::accumulate(grid.begin(), grid.end(), 1, std::multiplies()); red = Loop::reduce(id, grid); ASSERT_TRUE(sol == red) << "Multiplication reduction failed on one argument"; // Min reduction sol = *std::min_element(grid.begin(), grid.end()); red = Loop::reduce(id, grid); ASSERT_TRUE(sol == red) << "Min reduction failed on one argument"; // Max reduction sol = *std::max_element(grid.begin(), grid.end()); red = Loop::reduce(id, grid); ASSERT_TRUE(sol == red) << "Max reduction failed on one argument"; } struct PrimalReduce { CUDA_LAMBDA UInt operator()(UInt& p, UInt& val) { return (p > 0)? val:0; } }; TEST(TestReductions, TwoArguments) { Grid grid({20}, 1); Grid primal({20}, 1); grid = 1; primal(0) = 1; primal(1) = 1; auto primal_reduce = PrimalReduce(); // Reduce on values where primal > 0 UInt red = Loop::reduce(primal_reduce, primal, grid); ASSERT_TRUE(red == 2) << "Two args reduction failed"; } /* -------------------------------------------------------------------------- */ TEST(TestStridedLoops, OneArgument) { Grid grid({10, 10}, 2); std::iota(grid.begin(), grid.end(), 1); Grid solution({10, 10}, 2); solution = grid; std::for_each(solution.begin(), solution.end(), [] (UInt & x) { if (x % 2 == 1) x += 1; }); auto add_one_inplace = AddOneInplace(); Loop::stridedLoop(add_one_inplace, grid); ASSERT_TRUE(compare(solution, grid)) << "One argument strided loop failed"; } template using WrapVector = VectorProxy; struct AddOneVector { CUDA_LAMBDA void operator()(WrapVector&& x) { x(0) += 1; } }; TEST(TestStridedLoops, VectorStride) { Grid grid({10, 10}, 2); std::iota(grid.begin(), grid.end(), 1); Grid solution({10, 10}, 2); solution = grid; std::for_each(solution.begin(), solution.end(), [] (UInt & x) { if (x % 2 == 1) x += 1; }); auto add_one_inplace = AddOneVector(); Loop::stridedLoop(add_one_inplace, grid); ASSERT_TRUE(compare(solution, grid)) << "Static vector strided loop failed"; } template using WrapMatrix = MatrixProxy; struct SetOneMatrix { CUDA_LAMBDA void operator()(WrapMatrix&& x) { x(0, 0) = 1; x(1, 1) = 1; } }; TEST(TestStridedLoops, MatrixStride) { Grid grid({10, 10}, 4); Grid solution({10, 10}, 4); std::iota(solution.begin(), solution.end(), 0); std::for_each(solution.begin(), solution.end(), [] (UInt & x) { if (x % 4 == 0 || x % 4 == 3) x = 1; else x = 0; }); auto set_one = SetOneMatrix(); Loop::stridedLoop(set_one, grid); ASSERT_TRUE(compare(solution, grid)) << "Static matrix strided loop failed"; } struct VectorReduction { CUDA_LAMBDA Vector operator()(VectorProxy&& v) const { return v; } }; struct BroadcastSet123 { CUDA_LAMBDA inline void operator()(VectorProxy&& v) const { v(0) = 1; v(1) = 2; v(2) = 3; } }; TEST(TestStridedReduction, VectorReduce) { Grid grid({10, 10}, 3); Loop::stridedLoop(BroadcastSet123(), grid); auto res = Loop::stridedReduce(VectorReduction(), grid); ASSERT_EQ(res(0), 100); ASSERT_EQ(res(1), 200); ASSERT_EQ(res(2), 300); }