/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_PARALLEL_HPP
#define KOKKOS_CUDA_PARALLEL_HPP
#include <iostream>
#include <stdio.h>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Kokkos_Vectorization.hpp>
#ifdef KOKKOSP_ENABLE_PROFILING
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< typename Type >
struct CudaJoinFunctor {
typedef Type value_type ;
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
volatile const value_type & input )
{ update += input ; }
};
class CudaTeamMember {
private:
typedef Kokkos::Cuda execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
void * m_team_reduce ;
scratch_memory_space m_team_shared ;
int m_league_rank ;
int m_league_size ;
public:
#if defined( __CUDA_ARCH__ )
__device__ inline
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared ; }
__device__ inline int league_rank() const { return m_league_rank ; }
__device__ inline int league_size() const { return m_league_size ; }
__device__ inline int team_rank() const { return threadIdx.y ; }
__device__ inline int team_size() const { return blockDim.y ; }
__device__ inline void team_barrier() const { __syncthreads(); }
template<class ValueType>
__device__ inline void team_broadcast(ValueType& value, const int& thread_id) const {
__shared__ ValueType sh_val;
if(threadIdx.x == 0 && threadIdx.y == thread_id) {
sh_val = value;
}
team_barrier();
value = sh_val;
}
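/* Example (hedged usage sketch, not part of this header): broadcasting a value
* computed by one team member to the whole team. The names 'member', 'pivot',
* and 'compute_pivot' below are hypothetical.
*
*   double pivot = 0.0 ;
*   if ( member.team_rank() == 0 ) { pivot = compute_pivot(); }
*   member.team_broadcast( pivot , 0 );  // every thread of the team now holds thread 0's pivot
*/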
#ifdef KOKKOS_HAVE_CXX11
template< class ValueType, class JoinOp >
__device__ inline
typename JoinOp::value_type team_reduce( const ValueType & value
, const JoinOp & op_in ) const
{
typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ;
const JoinOpFunctor op(op_in);
ValueType * const base_data = (ValueType *) m_team_reduce ;
#else
template< class JoinOp >
__device__ inline
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef JoinOp JoinOpFunctor ;
typename JoinOp::value_type * const base_data = (typename JoinOp::value_type *) m_team_reduce ;
#endif
__syncthreads(); // Don't write into shared data until all threads have entered this function
if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
base_data[ threadIdx.y ] = value ;
Impl::cuda_intra_block_reduce_scan<false,JoinOpFunctor,void>( op , base_data );
return base_data[ blockDim.y - 1 ];
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
__device__ inline Type team_scan( const Type & value , Type * const global_accum ) const
{
Type * const base_data = (Type *) m_team_reduce ;
__syncthreads(); // Don't write into shared data until all threads have entered this function
if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
base_data[ threadIdx.y + 1 ] = value ;
Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
if ( global_accum ) {
if ( blockDim.y == threadIdx.y + 1 ) {
base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
}
__syncthreads(); // Wait for atomic
base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
}
return base_data[ threadIdx.y ];
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
__device__ inline Type team_scan( const Type & value ) const
{ return this->template team_scan<Type>( value , 0 ); }
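/* Example (hedged usage sketch): a team-level exclusive scan can provide
* per-thread output offsets when each thread writes a variable number of items.
* The names 'member', 'my_count', and 'count_items' below are hypothetical.
*
*   const int my_count = count_items();
*   const int offset   = member.team_scan( my_count );  // exclusive prefix sum over the team
*   // The highest-ranked thread can recover the team total as offset + my_count,
*   // matching the reduction_total expression documented above.
*/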
//----------------------------------------
// Private for the driver
__device__ inline
CudaTeamMember( void * shared
, const int shared_begin
, const int shared_size
, const int arg_league_rank
, const int arg_league_size )
: m_team_reduce( shared )
, m_team_shared( ((char *)shared) + shared_begin , shared_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
#else
const execution_space::scratch_memory_space & team_shmem() const {return m_team_shared;}
int league_rank() const {return 0;}
int league_size() const {return 1;}
int team_rank() const {return 0;}
int team_size() const {return 1;}
void team_barrier() const {}
template<class ValueType>
void team_broadcast(ValueType& value, const int& thread_id) const {}
template< class JoinOp >
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const {return typename JoinOp::value_type();}
template< typename Type >
Type team_scan( const Type & value , Type * const global_accum ) const {return Type();}
template< typename Type >
Type team_scan( const Type & value ) const {return Type();}
//----------------------------------------
// Private for the driver
CudaTeamMember( void * shared
, const int shared_begin
, const int shared_end
, const int arg_league_rank
, const int arg_league_size );
#endif /* #if ! defined( __CUDA_ARCH__ ) */
};
} // namespace Impl
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda >
{
private:
enum { MAX_WARP = 8 };
const int m_league_size ;
const int m_team_size ;
const int m_vector_length ;
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
//! Execution space of this execution policy
typedef Kokkos::Cuda execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::Cuda , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & functor )
{
int n = MAX_WARP * Impl::CudaTraits::WarpSize ;
for ( ; n ; n >>= 1 ) {
const int shmem_size =
/* for global reduce */ Impl::cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( functor , n )
/* for team reduce */ + ( n + 2 ) * sizeof(double)
/* for team shared */ + Impl::FunctorTeamShmemSize< FunctorType >::value( functor , n );
if ( shmem_size < Impl::CudaTraits::SharedMemoryCapacity ) break ;
}
return n ;
}
template< class FunctorType >
static int team_size_recommended( const FunctorType & functor )
{ return team_size_max( functor ); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & functor , const int vector_length)
{
int max = team_size_max( functor )/vector_length;
if(max<1) max = 1;
return max;
}
inline static
int vector_length_max()
{ return Impl::CudaTraits::WarpSize; }
//----------------------------------------
inline int vector_length() const { return m_vector_length ; }
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_ , int team_size_request , int vector_length_request = 1 )
: m_league_size( league_size_ )
, m_team_size( team_size_request )
, m_vector_length ( vector_length_request )
{
// Allow only power-of-two vector_length
int check = 0;
for(int k = 1; k <= vector_length_max(); k*=2)
if(k == vector_length_request)
check = 1;
if(!check)
Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
// Make sure league size is permissible
if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
}
TeamPolicy( int league_size_ , int team_size_request , int vector_length_request = 1 )
: m_league_size( league_size_ )
, m_team_size( team_size_request )
, m_vector_length ( vector_length_request )
{
// Allow only power-of-two vector_length
int check = 0;
for(int k = 1; k <= vector_length_max(); k*=2)
if(k == vector_length_request)
check = 1;
if(!check)
Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
// Make sure league size is permissible
if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
}
typedef Kokkos::Impl::CudaTeamMember member_type ;
};
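/* Example (hedged usage sketch): constructing a TeamPolicy for the Cuda
* execution space and launching a team-parallel kernel. 'MyFunctor' is a
* hypothetical functor with operator()( const member_type & ) const. The
* requested vector length must be a power of two and at most vector_length_max().
*
*   typedef Kokkos::TeamPolicy< Kokkos::Cuda > policy_type ;
*   MyFunctor functor ;
*   const int league_size = 128 ;
*   const int team_size   = policy_type::team_size_recommended( functor );
*   Kokkos::parallel_for( policy_type( league_size , team_size , 1 ) , functor ); // vector length 1
*/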
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > Policy ;
const FunctorType m_functor ;
const Policy m_policy ;
ParallelFor();
ParallelFor & operator = ( const ParallelFor & );
template< class Tag >
inline static
__device__
void driver( const FunctorType & functor
, typename Impl::enable_if< Impl::is_same< Tag , void >::value
, typename Policy::member_type const & >::type iwork
)
{ functor( iwork ); }
template< class Tag >
inline static
__device__
void driver( const FunctorType & functor
, typename Impl::enable_if< ! Impl::is_same< Tag , void >::value
, typename Policy::member_type const & >::type iwork
)
{ functor( Tag() , iwork ); }
public:
typedef FunctorType functor_type ;
inline
__device__
void operator()(void) const
{
const typename Policy::member_type work_stride = blockDim.y * gridDim.x ;
const typename Policy::member_type work_end = m_policy.end();
for ( typename Policy::member_type
iwork = m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x ;
iwork < work_end ;
iwork += work_stride ) {
ParallelFor::template driver< typename Policy::work_tag >( m_functor, iwork );
}
}
ParallelFor( const FunctorType & functor ,
const Policy & policy )
: m_functor( functor )
, m_policy( policy )
{
const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
const dim3 grid( std::min( ( int( policy.end() - policy.begin() ) + block.y - 1 ) / block.y
, cuda_internal_maximum_grid_count() )
, 1 , 1);
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
}
};
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > Policy ;
public:
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
private:
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.z == 1
// shared memory utilization:
//
// [ team reduce space ]
// [ team shared space ]
//
const FunctorType m_functor ;
size_type m_shmem_begin ;
size_type m_shmem_size ;
size_type m_league_size ;
template< class TagType >
__device__ inline
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_functor( member ); }
template< class TagType >
__device__ inline
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_functor( TagType() , member ); }
public:
__device__ inline
void operator()(void) const
{
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
ParallelFor::template driver< typename Policy::work_tag >(
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
, m_shmem_begin
, m_shmem_size
, league_rank
, m_league_size ) );
}
}
ParallelFor( const FunctorType & functor
, const Policy & policy
)
: m_functor( functor )
, m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) )
, m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
, m_league_size( policy.league_size() )
{
// Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
const int shmem_size_total = m_shmem_begin + m_shmem_size ;
if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
}
const dim3 grid( int(policy.league_size()) , 1 , 1 );
const dim3 block( policy.vector_length() , policy.team_size() , 1 );
CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > >
{
private:
typedef Kokkos::RangePolicy<Arg0,Arg1,Arg2, Kokkos::Cuda > Policy ;
typedef typename Policy::WorkRange work_range ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type value_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.x == blockDim.z == 1
const FunctorType m_functor ;
const Policy m_policy ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
unsigned n = CudaTraits::WarpSize * 8 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( f , n ) ) { n >>= 1 ; }
return n ;
}
template< class Tag >
inline static
__device__
void driver( const FunctorType & functor
, typename Impl::enable_if< Impl::is_same< Tag , void >::value
, typename Policy::member_type const & >::type iwork
, reference_type value )
{ functor( iwork , value ); }
template< class Tag >
inline static
__device__
void driver( const FunctorType & functor
, typename Impl::enable_if< ! Impl::is_same< Tag , void >::value
, typename Policy::member_type const & >::type iwork
, reference_type value )
{ functor( Tag() , iwork , value ); }
#ifndef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION
__device__ inline
void operator()(void) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
{
reference_type value =
ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const work_range range( m_policy , blockIdx.x , gridDim.x );
for ( typename work_range::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
iwork < iwork_end ; iwork += blockDim.y ) {
ParallelReduce::template driver< work_tag >( m_functor , iwork , value );
}
}
// Reduce with final value at blockDim.y - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,FunctorType,work_tag>(
m_functor , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final thread's location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
#else
__device__ inline
void operator()(void) const
{
value_type value = 0;
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const Policy range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
iwork < iwork_end ; iwork += blockDim.y ) {
ParallelReduce::template driver< work_tag >( m_functor , iwork , value );
}
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
int max_active_thread = range.end()-range.begin() < blockDim.y ? range.end() - range.begin():blockDim.y;
max_active_thread = max_active_thread == 0?blockDim.y:max_active_thread;
if(Impl::cuda_inter_block_reduction<FunctorType,Impl::JoinAdd<value_type> >
(value,Impl::JoinAdd<value_type>(),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , (void*) &value );
*result = value;
}
}
}
#endif
template< class HostViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const HostViewType & result
)
: m_functor( functor )
, m_policy( policy )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{
const int block_size = local_block_size( functor );
const int block_count = std::min( int(block_size)
, ( int(policy.end() - policy.begin()) + block_size - 1 ) / block_size
);
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( functor ) );
const dim3 grid( block_count , 1 , 1 );
const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
#ifdef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION
const int shmem = 0;
#else
const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( m_functor , block.y );
#endif
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if ( result.ptr_on_device() ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( m_functor );
for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( m_functor );
DeepCopy<HostSpace,CudaSpace>( result.ptr_on_device() , m_scratch_space , size );
}
}
}
};
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > >
{
private:
typedef Kokkos::TeamPolicy<Arg0,Arg1,Kokkos::Cuda> Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
public:
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
private:
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.x == blockDim.z == 1
// shared memory utilization:
//
// [ global reduce space ]
// [ team reduce space ]
// [ team shared space ]
//
const FunctorType m_functor ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
size_type m_team_begin ;
size_type m_shmem_begin ;
size_type m_shmem_size ;
size_type m_league_size ;
template< class TagType >
__device__ inline
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_functor( member , update ); }
template< class TagType >
__device__ inline
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_functor( TagType() , member , update ); }
public:
__device__ inline
void operator()(void) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
reference_type value =
ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
ParallelReduce::template driver< work_tag >
( typename Policy::member_type( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, league_rank
, m_league_size )
, value );
}
// Reduce with final value at blockDim.y - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,FunctorType,work_tag>(
m_functor , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final thread's location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const HostViewType & result
)
: m_functor( functor )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
, m_team_begin( cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( functor , policy.team_size() ) )
, m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) )
, m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
, m_league_size( policy.league_size() )
{
// The global parallel_reduce does not support vector_length other than 1 at the moment
if(policy.vector_length() > 1)
Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA.");
// Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
const int not_power_of_two = 0 != ( policy.team_size() & ( policy.team_size() - 1 ) );
if ( not_power_of_two || CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}
const int block_count = std::min( policy.league_size() , policy.team_size() );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( functor ) );
const dim3 grid( block_count , 1 , 1 );
const dim3 block( 1 , policy.team_size() , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
if ( result.ptr_on_device() ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( m_functor );
for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( m_functor );
DeepCopy<HostSpace,CudaSpace>( result.ptr_on_device() , m_scratch_space , size );
}
}
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > >
{
private:
typedef Kokkos::RangePolicy<Arg0,Arg1,Arg2, Kokkos::Cuda > Policy ;
typedef typename Policy::WorkRange work_range ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , work_tag > ValueOps ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints:
// (a) blockDim.y is a power of two
// (b) blockDim.x == blockDim.z == 1
// (c) gridDim.x <= blockDim.y * blockDim.y
// (d) gridDim.y == gridDim.z == 1
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
// blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps)
// gridDim.x <= blockDim.y * blockDim.y
//
// 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
unsigned n = CudaTraits::WarpSize * 4 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( f , n ) ) { n >>= 1 ; }
return n ;
}
const FunctorType m_functor ;
const Policy m_policy ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type m_final ;
template< class Tag >
inline static
__device__
void driver( const FunctorType & functor
, typename Impl::enable_if< Impl::is_same< Tag , void >::value
, typename Policy::member_type const & >::type iwork
, reference_type value
, const bool final )
{ functor( iwork , value , final ); }
template< class Tag >
inline static
__device__
void driver( const FunctorType & functor
, typename Impl::enable_if< ! Impl::is_same< Tag , void >::value
, typename Policy::member_type const & >::type iwork
, reference_type value
, const bool final )
{ functor( Tag() , iwork , value , final ); }
//----------------------------------------
__device__ inline
void initial(void) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
size_type * const shared_value = kokkos_impl_cuda_shared_memory<size_type>() + word_count.value * threadIdx.y ;
ValueInit::init( m_functor , shared_value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const work_range range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
iwork < iwork_end ; iwork += blockDim.y ) {
ParallelScan::template driver< work_tag >
( m_functor , iwork , ValueOps::reference( shared_value ) , false );
}
// Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
// Blocks' scan values are written to 'blockIdx.x' location.
// Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < gridDim.x
cuda_single_inter_block_reduce_scan<true,FunctorType,work_tag>( m_functor , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags );
}
//----------------------------------------
__device__ inline
void final(void) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
size_type * const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type * const shared_prefix = shared_data + word_count.value * threadIdx.y ;
size_type * const shared_accum = shared_data + word_count.value * ( blockDim.y + 1 );
// Starting value for this thread block is the previous block's total.
if ( blockIdx.x ) {
size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
}
else if ( 0 == threadIdx.y ) {
ValueInit::init( m_functor , shared_accum );
}
const work_range range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
__syncthreads(); // Don't overwrite previous iteration values until they are used
ValueInit::init( m_functor , shared_prefix + word_count.value );
// Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
if ( iwork < range.end() ) {
ParallelScan::template driver< work_tag >
( m_functor , iwork , ValueOps::reference( shared_prefix + word_count.value ) , false );
}
// Scan block values into locations shared_data[1..blockDim.y]
cuda_intra_block_reduce_scan<true,FunctorType,work_tag>( m_functor , ValueTraits::pointer_type(shared_data+word_count.value) );
{
size_type * const block_total = shared_data + word_count.value * blockDim.y ;
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
}
// Call functor with exclusive scan value
if ( iwork < range.end() ) {
ParallelScan::template driver< work_tag >
( m_functor , iwork , ValueOps::reference( shared_prefix ) , true );
}
}
}
//----------------------------------------
__device__ inline
void operator()(void) const
{
if ( ! m_final ) {
initial();
}
else {
final();
}
}
ParallelScan( const FunctorType & functor ,
const Policy & policy )
: m_functor( functor )
, m_policy( policy )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_final( false )
{
enum { GridMaxComputeCapability_2x = 0x0ffff };
const int block_size = local_block_size( functor );
const int grid_max = ( block_size * block_size ) < GridMaxComputeCapability_2x ?
( block_size * block_size ) : GridMaxComputeCapability_2x ;
// At most 'max_grid' blocks:
const int nwork = policy.end() - policy.begin();
const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
// How much work per block:
const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
// How many blocks are really needed for this much work:
const dim3 grid( ( nwork + work_per_block - 1 ) / work_per_block , 1 , 1 );
const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
const int shmem = ValueTraits::value_size( functor ) * ( block_size + 2 );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * grid.x );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
m_final = false ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
}
void wait() const { Cuda::fence(); }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
typedef iType index_type;
const iType start;
const iType end;
const iType increment;
const CudaTeamMember& thread;
#ifdef __CUDA_ARCH__
__device__ inline
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
start( threadIdx.y ),
end( count ),
increment( blockDim.y ),
thread(thread_)
{}
__device__ inline
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_):
start( begin_+threadIdx.y ),
end( end_ ),
increment( blockDim.y ),
thread(thread_)
{}
#else
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
start( 0 ),
end( count ),
increment( 1 ),
thread(thread_)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_):
start( begin_ ),
end( end_ ),
increment( 1 ),
thread(thread_)
{}
#endif
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
typedef iType index_type;
const iType start;
const iType end;
const iType increment;
#ifdef __CUDA_ARCH__
__device__ inline
ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread, const iType& count):
start( threadIdx.x ),
end( count ),
increment( blockDim.x )
{}
#else
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
start( 0 ),
end( count ),
increment( 1 )
{}
#endif
};
} // namespace Impl
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& count) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& begin, const iType& end) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam(const Impl::CudaTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::CudaTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread(const Impl::CudaTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::CudaTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries, const Lambda& lambda) {
#ifdef __CUDA_ARCH__
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
#endif
}
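/* Example (hedged usage sketch): distributing the rows handled by one team
* across the threads of that team. 'member', 'A', and 'nrows' are hypothetical.
*
*   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , nrows ) ,
*     [&] ( const int row ) {
*       A( member.league_rank() , row ) = 0.0 ;
*     });
*/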
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
#ifdef __CUDA_ARCH__
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
Impl::cuda_intra_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
Impl::cuda_inter_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
#endif
}
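/* Example (hedged usage sketch): summing one contribution per team thread with
* the summation overload above. 'member', 'x', and 'n' are hypothetical.
*
*   double team_sum = 0.0 ;
*   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
*     [&] ( const int i , double & val ) { val += x(i) ; } ,
*     team_sum );
*/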
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
#ifdef __CUDA_ARCH__
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
Impl::cuda_intra_warp_reduction(result, join );
Impl::cuda_inter_warp_reduction(result, join );
init_result = result;
#endif
}
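/* Example (hedged usage sketch): a team-level max-reduction using the JoinType
* overload above. 'member', 'x', and 'n' are hypothetical; the initial value of
* 'team_max' should be the identity of the join (here, a very small number).
*
*   double team_max = -1.0e300 ;
*   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
*     [&] ( const int i , double & val ) { if ( x(i) > val ) val = x(i) ; } ,
*     [] ( double & dst , const double & src ) { if ( src > dst ) dst = src ; } ,
*     team_max );
*/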
} //namespace Kokkos
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef __CUDA_ARCH__
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
#endif
}
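/* Example (hedged usage sketch): vectorizing the innermost loop of a
* team/thread/vector nest. 'member', 'A', 'row', and 'm' are hypothetical.
*
*   Kokkos::parallel_for( Kokkos::ThreadVectorRange( member , m ) ,
*     [&] ( const int j ) { A( row , j ) = 0.0 ; });
*/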
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
#ifdef __CUDA_ARCH__
ValueType val = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,val);
}
result = val;
if (loop_boundaries.increment > 1)
result += shfl_down(result, 1,loop_boundaries.increment);
if (loop_boundaries.increment > 2)
result += shfl_down(result, 2,loop_boundaries.increment);
if (loop_boundaries.increment > 4)
result += shfl_down(result, 4,loop_boundaries.increment);
if (loop_boundaries.increment > 8)
result += shfl_down(result, 8,loop_boundaries.increment);
if (loop_boundaries.increment > 16)
result += shfl_down(result, 16,loop_boundaries.increment);
result = shfl(result,0,loop_boundaries.increment);
#endif
}
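/* Example (hedged usage sketch): a per-row dot-product contribution summed over
* the vector lanes of one thread. 'member', 'x', 'y', 'm', and 'row_sum' are
* hypothetical.
*
*   double row_sum = 0.0 ;
*   Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member , m ) ,
*     [&] ( const int j , double & val ) { val += x(j) * y(j) ; } ,
*     row_sum );
*/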
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
#ifdef __CUDA_ARCH__
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
if (loop_boundaries.increment > 1)
join( result, shfl_down(result, 1,loop_boundaries.increment));
if (loop_boundaries.increment > 2)
join( result, shfl_down(result, 2,loop_boundaries.increment));
if (loop_boundaries.increment > 4)
join( result, shfl_down(result, 4,loop_boundaries.increment));
if (loop_boundaries.increment > 8)
join( result, shfl_down(result, 8,loop_boundaries.increment));
if (loop_boundaries.increment > 16)
join( result, shfl_down(result, 16,loop_boundaries.increment));
init_result = shfl(result,0,loop_boundaries.increment);
#endif
}
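/* Example (hedged usage sketch): a product over the vector lanes using the
* JoinType overload above; the initial value 1.0 is the neutral element of the
* join. 'member', 'x', and 'm' are hypothetical.
*
*   double prod = 1.0 ;
*   Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member , m ) ,
*     [&] ( const int j , double & val ) { val *= x(j) ; } ,
*     [] ( double & dst , const double & src ) { dst *= src ; } ,
*     prod );
*/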
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. vector_length==1) the operator is only called once with final==true. scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const FunctorType & lambda) {
#ifdef __CUDA_ARCH__
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
const int VectorLength = blockDim.x;
iType loop_bound = ((loop_boundaries.end+VectorLength-1)/VectorLength) * VectorLength;
for(int _i = threadIdx.x; _i < loop_bound; _i += VectorLength) {
value_type val = value_type();
if(_i<loop_boundaries.end)
lambda(_i , val , false);
value_type tmp = val;
value_type result_i;
if(threadIdx.x%VectorLength == 0)
result_i = tmp;
if (VectorLength > 1) {
const value_type tmp2 = shfl_up(tmp, 1,VectorLength);
if(threadIdx.x > 0)
tmp+=tmp2;
}
if(threadIdx.x%VectorLength == 1)
result_i = tmp;
if (VectorLength > 3) {
const value_type tmp2 = shfl_up(tmp, 2,VectorLength);
if(threadIdx.x > 1)
tmp+=tmp2;
}
if ((threadIdx.x%VectorLength >= 2) &&
(threadIdx.x%VectorLength < 4))
result_i = tmp;
if (VectorLength > 7) {
const value_type tmp2 = shfl_up(tmp, 4,VectorLength);
if(threadIdx.x > 3)
tmp+=tmp2;
}
if ((threadIdx.x%VectorLength >= 4) &&
(threadIdx.x%VectorLength < 8))
result_i = tmp;
if (VectorLength > 15) {
const value_type tmp2 = shfl_up(tmp, 8,VectorLength);
if(threadIdx.x > 7)
tmp+=tmp2;
}
if ((threadIdx.x%VectorLength >= 8) &&
(threadIdx.x%VectorLength < 16))
result_i = tmp;
if (VectorLength > 31) {
const value_type tmp2 = shfl_up(tmp, 16,VectorLength);
if(threadIdx.x > 15)
tmp+=tmp2;
}
if (threadIdx.x%VectorLength >= 16)
result_i = tmp;
val = scan_val + result_i - val;
scan_val += shfl(tmp,VectorLength-1,VectorLength);
if(_i<loop_boundaries.end)
lambda(_i , val , true);
}
#endif
}
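/* Example (hedged usage sketch): an exclusive prefix sum over the vector lanes,
* used to compute per-element write offsets. The operator may be invoked with
* final == false and then final == true, so the contribution is added to
* 'update' in both passes. 'member', 'counts', 'offsets', and 'm' are hypothetical.
*
*   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , m ) ,
*     [&] ( const int j , int & update , const bool final ) {
*       if ( final ) { offsets(j) = update ; }  // exclusive prefix for element j
*       update += counts(j) ;
*     });
*/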
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda();
#endif
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
#endif
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda(val);
val = shfl(val,0,blockDim.x);
#endif
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
#endif
}
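/* Example (hedged usage sketch): executing a statement once per thread (on one
* vector lane) and broadcasting the result to the remaining lanes through the
* ValueType overload. 'member' and 'compute_seed' are hypothetical.
*
*   double seed ;
*   Kokkos::single( Kokkos::PerThread( member ) ,
*     [&] ( double & val ) { val = compute_seed(); } , seed );
*   // 'seed' now holds the same value on every vector lane of this thread.
*/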
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template< class FunctorType, class ExecPolicy, class ValueType , class Tag = typename ExecPolicy::work_tag>
struct CudaFunctorAdapter {
const FunctorType f;
typedef ValueType value_type;
CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
__device__ inline
void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, ValueType& val) const {
// TODO: static_assert (via decltype) that ValueType equals the third argument type of FunctorType::operator()
f(typename ExecPolicy::work_tag(), i,val);
}
};
template< class FunctorType, class ExecPolicy, class ValueType >
struct CudaFunctorAdapter<FunctorType,ExecPolicy,ValueType,void> {
const FunctorType f;
typedef ValueType value_type;
CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
__device__ inline
void operator() (const typename ExecPolicy::member_type& i, ValueType& val) const {
// TODO: static_assert (via decltype) that ValueType equals the second argument type of FunctorType::operator()
f(i,val);
}
};
template< class FunctorType, class Enable = void>
struct ReduceFunctorHasInit {
enum {value = false};
};
template< class FunctorType>
struct ReduceFunctorHasInit<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type > {
enum {value = true};
};
template< class FunctorType, class Enable = void>
struct ReduceFunctorHasJoin {
enum {value = false};
};
template< class FunctorType>
struct ReduceFunctorHasJoin<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type > {
enum {value = true};
};
template< class FunctorType, class Enable = void>
struct ReduceFunctorHasFinal {
enum {value = false};
};
template< class FunctorType>
struct ReduceFunctorHasFinal<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type > {
enum {value = true};
};
template< class FunctorType, bool Enable =
( FunctorDeclaresValueType<FunctorType,void>::value) ||
( ReduceFunctorHasInit<FunctorType>::value ) ||
( ReduceFunctorHasJoin<FunctorType>::value ) ||
( ReduceFunctorHasFinal<FunctorType>::value )
>
struct IsNonTrivialReduceFunctor {
enum {value = false};
};
template< class FunctorType>
struct IsNonTrivialReduceFunctor<FunctorType, true> {
enum {value = true};
};
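/* Note (hedged illustration): these traits use SFINAE on sizeof(&FunctorType::init),
* sizeof(&FunctorType::join), and sizeof(&FunctorType::final) to detect whether a
* reduction functor supplies its own value semantics. A functor like the
* hypothetical sketch below is classified as non-trivial and used directly,
* whereas a plain lambda-style functor is wrapped in CudaFunctorAdapter.
*
*   struct MySumReducer {
*     typedef double value_type ;
*     KOKKOS_INLINE_FUNCTION void init( double & v ) const { v = 0.0 ; }
*     KOKKOS_INLINE_FUNCTION void join( volatile double & dst , const volatile double & src ) const { dst += src ; }
*     KOKKOS_INLINE_FUNCTION void operator()( const int i , double & v ) const { v += 1.0 ; }
*   };
*/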
template<class FunctorType, class ResultType, class Tag, bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value >
struct FunctorReferenceType {
typedef ResultType& reference_type;
};
template<class FunctorType, class ResultType, class Tag>
struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type;
};
} // namespace Impl
// general policy and view output
template< class ExecPolicy , class FunctorTypeIn , class ViewType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorTypeIn & functor_in
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value &&
Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
)>::type * = 0 )
{
enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType;
FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in));
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , policy , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and pod or array of pod output
template< class ExecPolicy , class FunctorTypeIn , class ResultType>
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorTypeIn & functor_in
, ResultType& result_ref
, const std::string& str = ""
, typename Impl::enable_if<
( ! Impl::is_view<ResultType>::value &&
! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value &&
! Impl::is_integral< ExecPolicy >::value &&
Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 )
{
typedef typename Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, 1
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( FunctorType(functor_in) , policy , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and pod or array of pod output
template< class ExecPolicy , class FunctorType>
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type result_ref
, const std::string& str = ""
, typename Impl::enable_if<
( Impl::IsNonTrivialReduceFunctor<FunctorType>::value &&
! Impl::is_integral< ExecPolicy >::value &&
Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, ValueTraits::value_count( functor )
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , policy , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and view output
template< class FunctorTypeIn , class ViewType >
inline
void parallel_reduce( const size_t work_count
, const FunctorTypeIn & functor_in
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Impl::is_view<ViewType>::value &&
Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space,
Kokkos::Cuda>::value
)>::type * = 0 )
{
enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > ExecPolicy ;
typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType;
FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in));
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , ExecPolicy(0,work_count) , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and pod or array of pod output
template< class FunctorTypeIn , class ResultType>
inline
void parallel_reduce( const size_t work_count
, const FunctorTypeIn & functor_in
, ResultType& result
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_view<ResultType>::value &&
! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value &&
Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space,
Kokkos::Cuda>::value >::type * = 0 )
{
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > ExecPolicy ;
typedef Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, 1
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , ExecPolicy >( FunctorType(functor_in) , ExecPolicy(0,work_count) , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
template< class FunctorType>
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::reference_type result
, const std::string& str = ""
, typename Impl::enable_if< Impl::IsNonTrivialReduceFunctor<FunctorType>::value &&
Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value >::type * = 0 )
{
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > ExecPolicy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, ValueTraits::value_count( functor )
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , ExecPolicy >( functor , ExecPolicy(0,work_count) , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
#ifdef KOKKOS_HAVE_CUDA
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType * result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType & result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
#endif
} // namespace Kokkos
#endif /* defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
