+# Skip NVCC compilation and use host compiler directly
+host_only=0  # switched to 1 by the --host-only option
+
+# Enable workaround for CUDA 6.5 for pragma ident
+replace_pragma_ident=0  # switched to 1 by the --replace-pragma-ident option
+
+# Mark first host compiler argument
+first_xcompiler_arg=1  # NOTE(review): where this flag is consumed or cleared is not visible here; confirm
+
+temp_dir=${TMPDIR:-/tmp}  # use $TMPDIR when it is set and non-empty, otherwise fall back to /tmp
+
+#echo "Arguments: $# $@"
+
+while [ $# -gt 0 ]
+do
+ case $1 in
+ #show the executed command
+ --show|--nvcc-wrapper-show)
+ dry_run=1
+ ;;
+ #run host compilation only
+ --host-only)
+ host_only=1
+ ;;
+ #replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and the non-standardized behaviour of pragma with macros
+ --replace-pragma-ident)
+ replace_pragma_ident=1
+ ;;
+ #handle source files to be compiled as cuda files
+ *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
+ cpp_files="$cpp_files $1"
+ ;;
+ #Handle shared args (valid for both nvcc and the host compiler)
+ , m_next_index(ViewAllocateWithoutInitializing("UnorderedMap next index"), capacity()+1) // +1 so that the *_at functions can always return a valid reference
+# error "It doesn't make sense to build this file unless the Kokkos::Serial device is enabled. If you see this message, it probably means that there is an error in Kokkos' CMake build infrastructure."
+# error "You must enable at least one of the following execution spaces in order to build this test: Kokkos::Threads, Kokkos::OpenMP, or Kokkos::Serial."
+ "Whether to enable the Kokkos::Serial device. This device executes \"parallel\" kernels sequentially on a single CPU thread. It is enabled by default. If you disable this device, please enable at least one other CPU device, such as Kokkos::OpenMP or Kokkos::Threads."
+ "Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan). This requires C++11. It also does not currently work with public releases of CUDA. As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled. If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined. For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
+ Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+
+ // Make sure total block size is permissible
+ if ( m_team_size * m_vector_length > 1024 ) {
+ Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size x vector length must be smaller than 1024."));
+ }
+ }
+
+ /** \brief Specify league size, request team size */
+ Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+
+ // Make sure total block size is permissible
+ if ( m_team_size * m_vector_length > 1024 ) {
+ Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size x vector length must be smaller than 1024."));
+ Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+
+ // Make sure total block size is permissible
+ if ( m_team_size * m_vector_length > 1024 ) {
+ Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size x vector length must be smaller than 1024."));
+ // The global parallel_reduce does not support vector_length other than 1 at the moment
+ if( arg_policy.vector_length() > 1)
+ Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA.");
+
+ if( m_team_size < 32)
+ Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA.");
+
+ // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) // DefaultExecutionSpace is taken from the first KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* macro that is defined, tested in the order Cuda, OpenMP, Threads, Serial
+ typedef Cuda DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+ typedef OpenMP DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+ typedef Threads DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+ typedef Serial DefaultExecutionSpace ;
+#else // none of the four default-device macros is defined: fail the build with an explicit message
+# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#endif // DefaultExecutionSpace selection
+
+#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+ typedef OpenMP DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+ typedef Threads DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+ typedef Serial DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_OPENMP )
+ typedef OpenMP DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_PTHREAD )
+ typedef Threads DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_SERIAL )
+ typedef Serial DefaultHostExecutionSpace ;
+#else
+# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+ Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
+ if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
+ //Find the number of threads (expecting --threads=XX)
+ if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
+ Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+ Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+ if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
+ Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+ Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+ //Find the number of device (expecting --device=XX)
+ if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
+ Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+ Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+ //Find the number of devices (expecting --ndevices=XX[,YY])
+ if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
+ Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+ int ndevices=-1;
+ int skip_device = 9999;
+
+ char* num1 = strchr(arg[iarg],'=')+1;
+ char* num2 = strpbrk(num1,",");
+ int num1_len = num2==NULL?strlen(num1):num2-num1;
+ Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+ Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+//FOR_GAUNTLET(x) expands to a for-loop header: x starts at min(hardware_concurrency()*8, number of entries in next_table), the loop runs while x is non-zero, and each step sets x = next_table[x-1]. Change the start expression to control how far the system is oversubscribed. NOTE(review): the earlier comment said only the range {1-(system size)} is tested by default, yet the start value multiplies hardware_concurrency() by 8; also hardware_concurrency() may return 0, in which case the loop body never runs - confirm both.
+#define FOR_GAUNTLET(x) for(unsigned x = (std::min)(std::thread::hardware_concurrency()*8,unsigned(sizeof(next_table)/sizeof(unsigned))); x; x = next_table[x-1])
+
+//set this to override the benchmark of barriers to use OMP barriers instead of n3998 std::barrier
+//#define USEOMP
+
+#if defined(__SYNCHRONIC_COMPATIBLE) // PREFIX is a string literal chosen by whether __SYNCHRONIC_COMPATIBLE is defined; presumably it is prepended to result labels - confirm at the use sites
+ #define PREFIX "futex-" // value when __SYNCHRONIC_COMPATIBLE is defined
+#else
+ #define PREFIX "backoff-" // value when __SYNCHRONIC_COMPATIBLE is not defined
+#endif // PREFIX selection
+
+//this test uses a custom Mersenne twister to eliminate implementation variation
+\mainpage Trilinos/Kokkos: Shared-memory programming interface and computational kernels
+
+\section Kokkos_Intro Introduction
+
+The %Kokkos package has two main components. The first, sometimes
+called "%Kokkos Array" or just "%Kokkos," implements a
+performance-portable shared-memory parallel programming model and data
+containers. The second, called "%Kokkos Classic," consists of
+computational kernels that support the %Tpetra package.
+
+\section Kokkos_Kokkos The %Kokkos programming model
+
+%Kokkos implements a performance-portable shared-memory parallel
+programming model and data containers. It lets you write an algorithm
+once, and just change a template parameter to get the optimal data
+layout for your hardware. %Kokkos has back-ends for the following
+parallel programming models:
+
+- Kokkos::Threads: POSIX Threads (Pthreads)
+- Kokkos::OpenMP: OpenMP
+- Kokkos::Cuda: NVIDIA's CUDA programming model for graphics
+ processing units (GPUs)
+- Kokkos::Serial: No thread parallelism
+
+%Kokkos also has optimizations for shared-memory parallel systems with
+nonuniform memory access (NUMA). Its containers can hold data of any
+primitive ("plain old") data type (and some aggregate types). %Kokkos
+Array may be used as a stand-alone programming model.
+
+%Kokkos' parallel operations include the following:
+
+- parallel_for: a thread-parallel "for loop"
+- parallel_reduce: a thread-parallel reduction
+- parallel_scan: a thread-parallel prefix scan operation
+
+as well as expert-level platform-independent interfaces to thread
+"teams," per-team "shared memory," synchronization, and atomic update
+operations.
+
+%Kokkos' data containers include the following:
+
+- Kokkos::View: A multidimensional array suitable for thread-parallel
+ operations. Its layout (e.g., row-major or column-major) is
+ optimized by default for the particular thread-parallel device.
+- Kokkos::Vector: A drop-in replacement for std::vector that eases
+ porting from standard sequential C++ data structures to %Kokkos'
+ parallel data structures.
+- Kokkos::UnorderedMap: A parallel lookup table comparable in
+ functionality to std::unordered_map.
+
+%Kokkos also uses the above basic containers to implement higher-level
+data structures, like sparse graphs and matrices.
+
+A good place to start learning about %Kokkos would be <a href="http://trilinos.sandia.gov/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf">these tutorial slides</a> from the 2013 Trilinos Users' Group meeting.
+
+\section Kokkos_Classic %Kokkos Classic
+
+"%Kokkos Classic" consists of computational kernels that support the
+%Tpetra package. These kernels include sparse matrix-vector multiply,
+sparse triangular solve, Gauss-Seidel, and dense vector operations.
+They are templated on the type of objects (\c Scalar) on which they
+operate. This component was not meant to be visible to users; it is
+an implementation detail of the %Tpetra distributed linear algebra
+package.
+
+%Kokkos Classic also implements a shared-memory parallel programming
+model. This inspired and preceded the %Kokkos programming model
+described in the previous section. Users should consider the %Kokkos
+Classic programming model deprecated, and prefer the new %Kokkos