Kokkos_Cuda_Parallel.hpp

/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_PARALLEL_HPP
#define KOKKOS_CUDA_PARALLEL_HPP
#include <iostream>
#include <stdio.h>
#if defined( __CUDACC__ )
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class WorkSpec >
class ParallelFor< FunctorType , WorkSpec /* size_t */ , Cuda > {
private:
const FunctorType m_functor ;
const Cuda::size_type m_work ;
ParallelFor();
ParallelFor & operator = ( const ParallelFor & );
public:
inline
__device__
void operator()(void) const
{
const Cuda::size_type work_stride = blockDim.x * gridDim.x ;
for ( Cuda::size_type
iwork = threadIdx.x + blockDim.x * blockIdx.x ;
iwork < m_work ;
iwork += work_stride ) {
m_functor( iwork );
}
}
ParallelFor( const FunctorType & functor ,
const size_t work )
: m_functor( functor )
, m_work( work )
{
const dim3 block( CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1, 1);
const dim3 grid( std::min( ( m_work + block.x - 1 ) / block.x , cuda_internal_maximum_grid_count() ) , 1 , 1 );
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
}
};
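// Illustrative sketch (not part of this header; the functor and View names below are
// hypothetical): a range functor that this specialization executes with the grid-strided
// loop above. The Kokkos::parallel_for front end (declared in Kokkos_Parallel.hpp) is
// expected to instantiate ParallelFor< Functor , size_t , Cuda >( functor , nwork ) for this path.
//
//   struct SaxpyFunctor {
//     typedef Kokkos::Cuda device_type ;
//     Kokkos::View< float * , Kokkos::Cuda > x , y ;
//     float a ;
//     KOKKOS_INLINE_FUNCTION
//     void operator()( const Kokkos::Cuda::size_type i ) const
//     { y(i) = a * x(i) + y(i); }
//   };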
template< class FunctorType >
class ParallelFor< FunctorType , ParallelWorkRequest , Cuda > {
private:
const FunctorType m_functor ;
const ParallelWorkRequest m_work ;
const int m_shmem ;
ParallelFor();
ParallelFor & operator = ( const ParallelFor & );
public:
inline
__device__
void operator()(void) const
{
CudaExec exec( 0 , m_shmem );
m_functor( Cuda( exec ) );
}
ParallelFor( const FunctorType & functor ,
const ParallelWorkRequest & work )
: m_functor( functor )
, m_work( std::min( work.league_size , size_t(cuda_internal_maximum_grid_count()) ) ,
std::min( work.team_size , size_t(CudaTraits::WarpSize * cuda_internal_maximum_warp_count()) ) )
, m_shmem( FunctorShmemSize< FunctorType >::value( functor ) )
{
const dim3 grid( m_work.league_size , 1 , 1 );
const dim3 block( m_work.team_size , 1, 1 );
CudaParallelLaunch< ParallelFor >( *this , grid , block , m_shmem );
}
};
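// Illustrative sketch (assumption, names hypothetical): a league/team functor for the
// ParallelWorkRequest path above. The functor is handed a Cuda device handle built from
// CudaExec, so it can query its league/team ranks and use team collectives; the
// league_rank()/team_rank() accessors are assumed here, not defined in this header.
//
//   struct TeamFunctor {
//     KOKKOS_INLINE_FUNCTION
//     void operator()( Kokkos::Cuda dev ) const
//     {
//       const int iblock  = dev.league_rank(); // one CUDA block per league member
//       const int ithread = dev.team_rank();   // one CUDA thread per team member
//       /* ... work indexed by ( iblock , ithread ) ... */
//     }
//   };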
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType >
class ParallelFor< FunctorType , CudaWorkConfig , Cuda > {
public:
const FunctorType m_work_functor ;
inline
__device__
void operator()(void) const
{
Cuda::size_type iwork = threadIdx.x + blockDim.x * (
threadIdx.y + blockDim.y * (
threadIdx.z + blockDim.z * (
blockIdx.x + gridDim.x * (
blockIdx.y + gridDim.y * (
blockIdx.z )))));
m_work_functor( iwork );
}
ParallelFor( const FunctorType & functor ,
const CudaWorkConfig & work_config )
: m_work_functor( functor )
{
const dim3 grid( work_config.grid[0] ,
work_config.grid[1] ,
work_config.grid[2] );
const dim3 block( work_config.block[0] ,
work_config.block[1] ,
work_config.block[2] );
CudaParallelLaunch< ParallelFor >( *this , grid , block , work_config.shared );
}
};
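// Illustrative sketch (assumption): populating a CudaWorkConfig to request an explicit
// launch geometry. The members grid[], block[] and shared are the ones read above; the
// literal values are arbitrary.
//
//   Kokkos::CudaWorkConfig cfg ;
//   cfg.grid[0]  = 64 ;  cfg.grid[1]  = 1 ; cfg.grid[2]  = 1 ;
//   cfg.block[0] = 256 ; cfg.block[1] = 1 ; cfg.block[2] = 1 ;
//   cfg.shared   = 0 ;   // dynamic shared memory in bytes
//   // Each of the 64 * 256 threads receives a unique flattened 'iwork' as computed
//   // by operator() above.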
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class WorkSpec >
class ParallelReduce< FunctorType , WorkSpec , Cuda >
{
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
typedef typename Reduce::reference_type reference_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints:
// (a) blockSize is a power of two
// (b) blockDim.x == BlockSize == 1 << BlockSizeShift
// (c) blockDim.y == blockDim.z == 1
enum { WarpCount = 8 };
enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value };
enum { BlockSizeShift = power_of_two< BlockSize >::value };
enum { BlockSizeMask = BlockSize - 1 };
enum { GridMaxComputeCapability_2x = 0x0ffff };
enum { GridMax = BlockSize };
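// With CudaTraits::WarpSize == 32 and WarpCount == 8 these evaluate to BlockSize == 256,
// BlockSizeShift == 8, BlockSizeMask == 255 and GridMax == 256, so a single follow-up
// block of BlockSize threads can combine all per-block partial values.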
const FunctorType m_functor ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
pointer_type m_host_pointer ;
size_type m_work ;
size_type m_work_per_block ;
size_type m_local_block_count ;
size_type m_global_block_begin ;
size_type m_global_block_count ;
__device__ inline
void operator()(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
{
reference_type value = Reduce::reference( shared_data + threadIdx.x * word_count.value );
m_functor.init( value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const size_type iwork_beg = blockIdx.x * m_work_per_block ;
const size_type iwork_end = iwork_beg + m_work_per_block < m_work
? iwork_beg + m_work_per_block : m_work ;
for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) {
m_functor( iwork , value );
}
}
// Reduce with final value at BlockSize - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,BlockSize>(
m_functor , m_global_block_begin + blockIdx.x , m_global_block_count ,
shared_data , m_scratch_space , m_scratch_flags ) ) {
// This is the final block, with the final result at the final thread's location
size_type * const shared = shared_data + BlockSizeMask * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); }
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; }
}
}
ParallelReduce( const FunctorType & functor ,
const size_t nwork ,
const pointer_type result = 0 ,
const bool execute_immediately = true )
: m_functor( functor )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
, m_host_pointer( result )
, m_work( nwork )
, m_work_per_block( 0 )
, m_local_block_count( 0 )
, m_global_block_begin( 0 )
, m_global_block_count( 0 )
{
// At most 'max_grid' blocks:
const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize ));
// How much work per block:
m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
// How many blocks are actually needed for this much work:
m_local_block_count = ( nwork + m_work_per_block - 1 ) / m_work_per_block ;
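// Worked example (illustrative): for nwork == 10000, GridMax == 256 and BlockSize == 256,
// max_grid = min( 256 , (10000 + 255) / 256 ) = 40, so m_work_per_block = 250 and
// m_local_block_count = 40; each block strides through its 250-item slice BlockSize at a time.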
m_global_block_count = m_local_block_count ;
m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) );
if ( execute_immediately ) { execute(); }
}
inline
void execute() const
{
const dim3 grid( m_local_block_count , 1 , 1 );
const dim3 block( BlockSize , 1 , 1 );
const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( m_functor );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
}
void wait() const
{
Cuda::fence();
if ( m_host_pointer ) {
if ( m_unified_space ) {
const int count = Reduce::value_count( m_functor );
for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = Reduce::value_size( m_functor );
DeepCopy<HostSpace,CudaSpace>( m_host_pointer , m_scratch_space , size );
}
}
}
};
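// Illustrative sketch (assumption, names hypothetical): the functor interface the reduction
// above relies on -- init() seeds each thread's value, operator()( iwork , update )
// accumulates, join() combines two partial values, and Reduce::final() (optional in the
// functor) is applied by thread 0 of the final block.
//
//   struct SumFunctor {
//     typedef Kokkos::Cuda device_type ;
//     typedef double       value_type ;
//     Kokkos::View< const double * , Kokkos::Cuda > x ;
//     KOKKOS_INLINE_FUNCTION
//     void operator()( const Kokkos::Cuda::size_type i , value_type & update ) const
//     { update += x(i); }
//     KOKKOS_INLINE_FUNCTION
//     void init( value_type & update ) const { update = 0 ; }
//     KOKKOS_INLINE_FUNCTION
//     void join( volatile value_type & update , const volatile value_type & input ) const
//     { update += input ; }
//   };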
template< class FunctorType >
class ParallelReduce< FunctorType , ParallelWorkRequest , Cuda >
{
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
typedef typename Reduce::reference_type reference_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints:
// (a) blockSize is a power of two
// (b) blockDim.x == BlockSize == 1 << BlockSizeShift
// (c) blockDim.y == blockDim.z == 1
enum { WarpCount = 8 };
enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value };
enum { BlockSizeShift = power_of_two< BlockSize >::value };
enum { BlockSizeMask = BlockSize - 1 };
enum { GridMaxComputeCapability_2x = 0x0ffff };
enum { GridMax = BlockSize };
const FunctorType m_functor ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
pointer_type m_host_pointer ;
size_type m_shmem_begin ;
size_type m_shmem_end ;
size_type m_local_block_count ;
size_type m_global_block_begin ;
size_type m_global_block_count ;
__device__ inline
void operator()(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
{
reference_type value = Reduce::reference( shared_data + threadIdx.x * word_count.value );
m_functor.init( value );
CudaExec exec( m_shmem_begin , m_shmem_end );
m_functor( Cuda( exec ) , value );
}
// Reduce with final value at BlockSize - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,BlockSize>(
m_functor , m_global_block_begin + blockIdx.x , m_global_block_count ,
shared_data , m_scratch_space , m_scratch_flags ) ) {
// This is the final block, with the final result at the final thread's location
size_type * const shared = shared_data + BlockSizeMask * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); }
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; }
}
}
ParallelReduce( const FunctorType & functor ,
const ParallelWorkRequest & work ,
const pointer_type result = 0 ,
const bool execute_immediately = true )
: m_functor( functor )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
, m_host_pointer( result )
, m_shmem_begin( cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( functor ) )
, m_shmem_end( cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( functor )
+ FunctorShmemSize< FunctorType >::value( functor ) )
, m_local_block_count( 0 )
, m_global_block_begin( 0 )
, m_global_block_count( 0 )
{
m_local_block_count = std::min( int(GridMax) , int(work.league_size) );
m_global_block_count = std::min( int(GridMax) , int(work.league_size) );
m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) );
if ( execute_immediately ) { execute(); }
}
inline
void execute() const
{
const dim3 grid( m_local_block_count , 1 , 1 );
const dim3 block( BlockSize , 1 , 1 );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, m_shmem_end ); // copy to device and execute
}
void wait() const
{
Cuda::fence();
if ( m_host_pointer ) {
if ( m_unified_space ) {
const int count = Reduce::value_count( m_functor );
for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = Reduce::value_size( m_functor );
DeepCopy<HostSpace,CudaSpace>( m_host_pointer , m_scratch_space , size );
}
}
}
};
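// Note on the ParallelWorkRequest variant above: each league member (CUDA block) produces one
// partial value through m_functor( Cuda( exec ) , value ); the functor's own team shared memory
// occupies the window [ m_shmem_begin , m_shmem_end ), placed after the scratch that the
// inter-block reduce/scan uses.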
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class Functor >
class MultiFunctorParallelReduceMember ;
template<>
class MultiFunctorParallelReduceMember<void>
{
private:
MultiFunctorParallelReduceMember( const MultiFunctorParallelReduceMember & );
MultiFunctorParallelReduceMember & operator = ( const MultiFunctorParallelReduceMember & );
protected:
MultiFunctorParallelReduceMember() {}
public:
virtual unsigned block_count() const = 0 ;
virtual ~MultiFunctorParallelReduceMember() {}
virtual void execute( void * const host_pointer ,
const unsigned global_block_begin ,
const unsigned global_block_count ) = 0 ;
virtual void wait() const = 0 ;
};
template< class Functor >
class MultiFunctorParallelReduceMember : public MultiFunctorParallelReduceMember<void> {
public:
ParallelReduce< Functor , size_t , Cuda > m_functor ;
MultiFunctorParallelReduceMember( const Functor & f , size_t nwork )
: MultiFunctorParallelReduceMember<void>()
, m_functor( f , nwork , 0 , false )
{}
virtual unsigned block_count() const { return m_functor.m_local_block_count ; }
virtual void execute( void * const host_pointer ,
const unsigned global_block_begin ,
const unsigned global_block_count )
{
m_functor.m_host_pointer = typename ReduceAdapter< Functor >::pointer_type(host_pointer);
m_functor.m_global_block_begin = global_block_begin ;
m_functor.m_global_block_count = global_block_count ;
m_functor.execute();
}
virtual void wait() const { m_functor.wait(); }
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
template<>
class MultiFunctorParallelReduce< Cuda >
{
private:
typedef std::vector< Impl::MultiFunctorParallelReduceMember<void> * > MemberVector ;
MemberVector m_functors ;
public:
MultiFunctorParallelReduce()
: m_functors()
{}
~MultiFunctorParallelReduce()
{
while ( ! m_functors.empty() ) {
delete m_functors.back();
m_functors.pop_back();
}
}
template< class FunctorType >
void push_back( const size_t work_count , const FunctorType & f )
{
m_functors.push_back( new Impl::MultiFunctorParallelReduceMember<FunctorType>( f , work_count ) );
}
void execute( void * host_pointer )
{
typename MemberVector::iterator m ;
Cuda::size_type block_count = 0 ;
for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) {
block_count += (*m)->block_count();
}
Cuda::size_type block_offset = 0 ;
for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) {
(*m)->execute( host_pointer , block_offset , block_count );
block_offset += (*m)->block_count();
}
}
void wait() const
{
if ( ! m_functors.empty() ) { (m_functors.back())->wait(); }
}
};
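// Illustrative usage sketch (assumption, names hypothetical): fusing several reductions that
// share a value_type into a single two-pass reduction over a global block numbering.
//
//   Kokkos::MultiFunctorParallelReduce< Kokkos::Cuda > mf ;
//   mf.push_back( n0 , SumFunctor( x0 ) );
//   mf.push_back( n1 , SumFunctor( x1 ) );
//   double result = 0 ;
//   mf.execute( & result ); // launch every member with its global block offset
//   mf.wait();              // fence, then copy the combined value into 'result'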
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class WorkSpec >
class ParallelScan< FunctorType , WorkSpec , Cuda >
{
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
typedef typename Reduce::reference_type reference_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints:
// (a) blockSize is a power of two
// (b) blockDim.x == BlockSize == 1 << BlockSizeShift
// (c) blockDim.y == blockDim.z == 1
// (d) gridDim.x <= blockDim.x * blockDim.x
// (e) gridDim.y == gridDim.z == 1
// blockDim.x must be a power of two: 128 (4 warps), 256 (8 warps), or 512 (16 warps)
// gridDim.x <= blockDim.x * blockDim.x
//
// 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
enum { WarpCount = 4 };
enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value };
enum { BlockSizeShift = power_of_two< BlockSize >::value };
enum { BlockSizeMask = BlockSize - 1 };
enum { GridMaxComputeCapability_2x = 0x0ffff };
enum { GridMax = ( BlockSize * BlockSize ) < GridMaxComputeCapability_2x
? ( BlockSize * BlockSize ) : GridMaxComputeCapability_2x };
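// With WarpCount == 4 these give BlockSize == 128 and BlockSize * BlockSize == 16384, which is
// below GridMaxComputeCapability_2x (65535), so GridMax == 16384: the block totals written by
// the first pass can themselves be scanned within the gridDim.x <= blockDim.x * blockDim.x constraint.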
const FunctorType m_functor ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
const size_type m_work ;
size_type m_work_per_block ;
size_type m_final ;
//----------------------------------------
__device__ inline
void initial(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
size_type * const shared_value = shared_data + word_count.value * threadIdx.x ;
m_functor.init( Reduce::reference( shared_value ) );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const size_type iwork_beg = blockIdx.x * m_work_per_block ;
const size_type iwork_end = iwork_beg + m_work_per_block < m_work
? iwork_beg + m_work_per_block : m_work ;
for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) {
m_functor( iwork , Reduce::reference( shared_value ) , false );
}
// Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
// Blocks' scan values are written to 'blockIdx.x' location.
// Block-groups' scan values are at: i = ( j * BlockSize - 1 ) for i < gridDim.x
cuda_single_inter_block_reduce_scan<true,BlockSize>( m_functor , blockIdx.x , gridDim.x , shared_data , m_scratch_space , m_scratch_flags );
}
//----------------------------------------
__device__ inline
void final(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
size_type * const shared_prefix = shared_data + word_count.value * threadIdx.x ;
size_type * const shared_accum = shared_data + word_count.value * ( BlockSize + 1 );
// Starting value for this thread block is the previous block's total.
if ( blockIdx.x ) {
size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
}
else if ( 0 == threadIdx.x ) {
m_functor.init( Reduce::reference( shared_accum ) );
}
unsigned iwork_beg = blockIdx.x * m_work_per_block ;
const unsigned iwork_end = iwork_beg + m_work_per_block ;
for ( ; iwork_beg < iwork_end ; iwork_beg += BlockSize ) {
const unsigned iwork = threadIdx.x + iwork_beg ;
__syncthreads(); // Don't overwrite previous iteration values until they are used
m_functor.init( Reduce::reference( shared_prefix + word_count.value ) );
// Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) {
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix + word_count.value ) , false ); }
// Scan block values into locations shared_data[1..BlockSize]
cuda_intra_block_reduce_scan<true>( m_functor , Reduce::pointer_type(shared_data+word_count.value) );
{
size_type * const block_total = shared_data + word_count.value * blockDim.x ;
for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
}
// Call functor with exclusive scan value
if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix ) , true ); }
}
}
//----------------------------------------
__device__ inline
void operator()(void) const
{
if ( ! m_final ) {
initial();
}
else {
final();
}
}
ParallelScan( const FunctorType & functor ,
const size_t nwork )
: m_functor( functor )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_work( nwork )
, m_work_per_block( 0 )
, m_final( false )
{
// At most 'max_grid' blocks:
const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize ));
// How much work per block:
m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
// How many blocks are actually needed for this much work:
const dim3 grid( ( nwork + m_work_per_block - 1 ) / m_work_per_block , 1 , 1 );
const dim3 block( BlockSize , 1 , 1 );
const int shmem = Reduce::value_size( functor ) * ( BlockSize + 2 );
m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * grid.x );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
m_final = false ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
}
void wait() const { Cuda::fence(); }
};
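// Illustrative sketch (assumption, names hypothetical): the three-argument scan functor the
// two launches above expect. In the first pass 'final' is false and the functor only
// accumulates; in the second pass 'final' is true and 'update' arrives holding the exclusive
// prefix for 'i', so storing after the accumulation yields an inclusive scan (store before
// accumulating for an exclusive scan).
//
//   struct PrefixSumFunctor {
//     typedef Kokkos::Cuda device_type ;
//     typedef long         value_type ;
//     Kokkos::View< long * , Kokkos::Cuda > data ;
//     KOKKOS_INLINE_FUNCTION
//     void operator()( const Kokkos::Cuda::size_type i , value_type & update , const bool final ) const
//     {
//       update += data(i);
//       if ( final ) { data(i) = update ; }
//     }
//     KOKKOS_INLINE_FUNCTION
//     void init( value_type & update ) const { update = 0 ; }
//     KOKKOS_INLINE_FUNCTION
//     void join( volatile value_type & update , const volatile value_type & input ) const
//     { update += input ; }
//   };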
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDA_ARCH__ )
namespace Kokkos {
namespace Impl {
template< typename Type >
struct CudaJoinFunctor {
typedef Type value_type ;
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
volatile const value_type & input )
{ update += input ; }
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
template< typename TypeLocal , typename TypeGlobal >
__device__ inline TypeGlobal Cuda::team_scan( const TypeLocal & value , TypeGlobal * const global_accum )
{
enum { BlockSizeMax = 512 };
__shared__ TypeGlobal base_data[ BlockSizeMax + 1 ];
__syncthreads(); // Don't write into shared data until all threads have entered this function
if ( 0 == threadIdx.x ) { base_data[0] = 0 ; }
base_data[ threadIdx.x + 1 ] = value ;
Impl::cuda_intra_block_reduce_scan<true>( Impl::CudaJoinFunctor<TypeGlobal>() , base_data + 1 );
if ( global_accum ) {
if ( blockDim.x == threadIdx.x + 1 ) {
base_data[ blockDim.x ] = atomic_fetch_add( global_accum , base_data[ blockDim.x ] );
}
__syncthreads(); // Wait for atomic
base_data[ threadIdx.x ] += base_data[ blockDim.x ] ;
}
return base_data[ threadIdx.x ];
}
template< typename Type >
__device__ inline Type Cuda::team_scan( const Type & value )
{ return team_scan( value , (Type*) 0 ); }
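// Illustrative usage sketch (assumption, names hypothetical): inside a team functor each
// thread obtains its exclusive prefix over the team's contributions; supplying a global
// accumulator turns that prefix into an offset that is unique across all blocks.
//
//   KOKKOS_INLINE_FUNCTION
//   void operator()( Kokkos::Cuda dev ) const
//   {
//     const int my_count = 1 ; // items this thread will emit
//     const int offset   = dev.team_scan( my_count , & m_total() ); // m_total: hypothetical 0-D view
//     /* ... write 'my_count' items starting at 'offset' ... */
//   }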
} // namespace Kokkos
#else /* ! defined( __CUDA_ARCH__ ) */
namespace Kokkos {
template< typename Type > inline Type Cuda::team_scan( const Type & ) { return 0 ; }
template< typename TypeLocal , typename TypeGlobal >
inline TypeGlobal Cuda::team_scan( const TypeLocal & , TypeGlobal * const ) { return 0 ; }
} // namespace Kokkos
#endif /* ! defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
