/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_PARALLEL_HPP
#define KOKKOS_CUDA_PARALLEL_HPP
#include <Kokkos_Macros.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <iostream>
#include <algorithm>
#include <cstdio>
#include <cstdint>
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Kokkos_Vectorization.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Cuda , Properties ... >: public PolicyTraits<Properties ... >
{
public:
//! Tag this class as a Kokkos execution policy
typedef TeamPolicyInternal execution_policy ;
typedef PolicyTraits<Properties ... > traits;
private:
enum { MAX_WARP = 8 };
int m_league_size ;
int m_team_size ;
int m_vector_length ;
int m_team_scratch_size[2] ;
int m_thread_scratch_size[2] ;
int m_chunk_size;
public:
//! Execution space of this execution policy
typedef Kokkos::Cuda execution_space ;
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
m_league_size = p.m_league_size;
m_team_size = p.m_team_size;
m_vector_length = p.m_vector_length;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & functor )
{
int n = MAX_WARP * Impl::CudaTraits::WarpSize ;
for ( ; n ; n >>= 1 ) {
const int shmem_size =
/* for global reduce */ Impl::cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,typename traits::work_tag>( functor , n )
/* for team reduce */ + ( n + 2 ) * sizeof(double)
/* for team shared */ + Impl::FunctorTeamShmemSize< FunctorType >::value( functor , n );
if ( shmem_size < Impl::CudaTraits::SharedMemoryCapacity ) break ;
}
return n ;
}
template< class FunctorType >
static int team_size_recommended( const FunctorType & functor )
{ return team_size_max( functor ); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & functor , const int vector_length)
{
int max = team_size_max( functor )/vector_length;
if(max<1) max = 1;
return max;
}
inline static
int vector_length_max()
{ return Impl::CudaTraits::WarpSize; }
//----------------------------------------
inline int vector_length() const { return m_vector_length ; }
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline int scratch_size(int level, int team_size_ = -1) const {
if(team_size_<0) team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level];
}
inline int team_scratch_size(int level) const {
return m_team_scratch_size[level];
}
inline int thread_scratch_size(int level) const {
return m_thread_scratch_size[level];
}
TeamPolicyInternal()
: m_league_size( 0 )
, m_team_size( 0 )
, m_vector_length( 0 )
, m_team_scratch_size {0,0}
, m_thread_scratch_size {0,0}
, m_chunk_size ( 32 )
{}
/** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space &
, int league_size_
, int team_size_request
, int vector_length_request = 1 )
: m_league_size( league_size_ )
, m_team_size( team_size_request )
, m_vector_length( vector_length_request )
, m_team_scratch_size {0,0}
, m_thread_scratch_size {0,0}
, m_chunk_size ( 32 )
{
// Allow only power-of-two vector_length
if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
}
// Make sure league size is permissible
if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
// Make sure total block size is permissable
if ( m_team_size * m_vector_length > 1024 ) {
Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size times vector length must be less than or equal to 1024."));
}
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space &
, int league_size_
, const Kokkos::AUTO_t & /* team_size_request */
, int vector_length_request = 1 )
: m_league_size( league_size_ )
, m_team_size( -1 )
, m_vector_length( vector_length_request )
, m_team_scratch_size {0,0}
, m_thread_scratch_size {0,0}
, m_chunk_size ( 32 )
{
// Allow only power-of-two vector_length
if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
}
// Make sure league size is permissible
if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
}
TeamPolicyInternal( int league_size_
, int team_size_request
, int vector_length_request = 1 )
: m_league_size( league_size_ )
, m_team_size( team_size_request )
, m_vector_length ( vector_length_request )
, m_team_scratch_size {0,0}
, m_thread_scratch_size {0,0}
, m_chunk_size ( 32 )
{
// Allow only power-of-two vector_length
if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
}
// Make sure league size is permissible
if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
// Make sure total block size is permissable
if ( m_team_size * m_vector_length > 1024 ) {
Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size times vector length must be less than or equal to 1024."));
}
}
TeamPolicyInternal( int league_size_
, const Kokkos::AUTO_t & /* team_size_request */
, int vector_length_request = 1 )
: m_league_size( league_size_ )
, m_team_size( -1 )
, m_vector_length ( vector_length_request )
, m_team_scratch_size {0,0}
, m_thread_scratch_size {0,0}
, m_chunk_size ( 32 )
{
// Allow only power-of-two vector_length
if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
}
// Make sure league size is permissible
if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
}
inline int chunk_size() const { return m_chunk_size ; }
/** \brief set chunk_size to a discrete value*/
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
TeamPolicyInternal p = *this;
p.m_chunk_size = chunk_size_;
return p;
}
/** \brief set per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
typedef Kokkos::Impl::CudaTeamMember member_type ;
};
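// Illustrative construction through the public Kokkos::TeamPolicy alias
// (a minimal sketch; league_size, team_size and the scratch byte counts are
// example values, not requirements):
//
//   Kokkos::TeamPolicy< Kokkos::Cuda > policy( league_size , team_size , /*vector_length=*/ 4 );
//   // or let Kokkos choose the team size:
//   Kokkos::TeamPolicy< Kokkos::Cuda > auto_policy( league_size , Kokkos::AUTO , 4 );
//   // request 1024 bytes of level-0 scratch per team plus 64 bytes per thread:
//   auto scratch_policy =
//     policy.set_scratch_size( 0 , Kokkos::PerTeam( 1024 ) , Kokkos::PerThread( 64 ) );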
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::RangePolicy< Traits ... > Policy;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
const FunctorType m_functor ;
const Policy m_policy ;
ParallelFor() = delete ;
ParallelFor & operator = ( const ParallelFor & ) = delete ;
template< class TagType >
inline __device__
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const Member i ) const
{ m_functor( i ); }
template< class TagType >
inline __device__
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const Member i ) const
{ m_functor( TagType() , i ); }
public:
typedef FunctorType functor_type ;
inline
__device__
void operator()(void) const
{
const Member work_stride = blockDim.y * gridDim.x ;
const Member work_end = m_policy.end();
for ( Member
iwork = m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x ;
iwork < work_end ;
iwork += work_stride ) {
this-> template exec_range< WorkTag >( iwork );
}
}
inline
void execute() const
{
const int nwork = m_policy.end() - m_policy.begin();
const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{ }
};
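// Illustrative use of the RangePolicy ParallelFor above (a minimal sketch;
// the Views x and y are assumptions):
//
//   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0 , N ) ,
//                         KOKKOS_LAMBDA( const int i ) { x(i) = 2.0 * y(i); } );
//
// Each CUDA thread starts at begin() + threadIdx.y + blockDim.y * blockIdx.x and
// advances by blockDim.y * gridDim.x, i.e. operator() above is a grid-stride loop.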
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::launch_bounds LaunchBounds;
const FunctorType m_functor ;
const Policy m_rp ;
public:
inline
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
inline
void execute() const
{
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
if ( RP::rank == 2 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, 1
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 3 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 4 )
{
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 5 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 6 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
, static_cast<index_type>(maxblocks) )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else
{
printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
Kokkos::abort("Aborting");
}
} //end execute
// inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_functor( arg_functor )
, m_rp( arg_policy )
{}
};
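// Worked example of the rank-4 tile-to-block mapping above (illustrative
// numbers): with tile dimensions {2,4,8,4} the launch uses
// block = ( 2*4 , 8 , 4 ) = ( 8 , 8 , 4 ), i.e. 256 threads per block, while
// grid.x covers tile_end[0]*tile_end[1] tiles and grid.y / grid.z cover the
// extents of dimensions 2 and 3 (each capped at maxblocks).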
template< class FunctorType , class ... Properties >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, Kokkos::Cuda
>
{
private:
typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
public:
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
private:
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.z == 1
// shared memory utilization:
//
// [ team reduce space ]
// [ team shared space ]
//
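// Worked example of the layout (illustrative team size): for m_team_size == 256
// the constructor below reserves m_shmem_begin = sizeof(double) * (256 + 2) = 2064
// bytes of team reduce space at the start of dynamic shared memory, followed by
// m_shmem_size bytes of level-0 team scratch (policy scratch_size(0) plus
// FunctorTeamShmemSize).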
const FunctorType m_functor ;
const size_type m_league_size ;
const size_type m_team_size ;
const size_type m_vector_size ;
const int m_shmem_begin ;
const int m_shmem_size ;
void* m_scratch_ptr[2] ;
const int m_scratch_size[2] ;
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const Member & member ) const
{ m_functor( member ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const Member & member ) const
{ m_functor( TagType() , member ); }
public:
__device__ inline
void operator()(void) const
{
// Iterate this block through the league
int threadid = 0;
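// When level-1 (global memory) team scratch is used, one thread per block
// claims a slot in g_device_cuda_lock_arrays.scratch via atomicCAS so the
// block gets an exclusive chunk of the pre-allocated scratch buffer; the slot
// index is rounded to a multiple of the block size and released again at the
// end of operator().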
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
}
__syncthreads();
threadid = base_thread_id;
}
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >(
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
, m_shmem_begin
, m_shmem_size
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size ) );
}
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
inline
void execute() const
{
const int shmem_size_total = m_shmem_begin + m_shmem_size ;
const dim3 grid( int(m_league_size) , 1 , 1 );
const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy
)
: m_functor( arg_functor )
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
, m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
, m_scratch_ptr{NULL,NULL}
, m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)}
{
// Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
const int shmem_size_total = m_shmem_begin + m_shmem_size ;
if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
}
if ( int(m_team_size) >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
}
}
};
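// Illustrative hierarchical use of the TeamPolicy ParallelFor above (a minimal
// sketch; the View A and the extent N are assumptions):
//
//   typedef Kokkos::TeamPolicy< Kokkos::Cuda >::member_type member_type ;
//   Kokkos::parallel_for( Kokkos::TeamPolicy< Kokkos::Cuda >( league_size , Kokkos::AUTO ) ,
//     KOKKOS_LAMBDA( const member_type & team ) {
//       const int i = team.league_rank();
//       Kokkos::parallel_for( Kokkos::TeamThreadRange( team , N ) ,
//                             [&]( const int j ) { A(i,j) = 0.0; } );
//     } );
//
// League ranks map to blockIdx.x; leagues larger than gridDim.x are handled by
// the block-strided loop in operator() above.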
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type value_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints: the block size (blockDim.y) is a power of two AND blockDim.x == blockDim.z == 1
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
// Use the shfl-based reduction only for statically sized value types of more than 128 bits (2*sizeof(double))
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Helper tag types used to select between the two run() overloads below
private:
typedef double DummyShflReductionType;
typedef int DummySHMEMReductionType;
public:
// exec_range dispatches the work item to the functor, with or without the work tag
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const Member & i , reference_type update ) const
{ m_functor( i , update ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const Member & i , reference_type update ) const
{ m_functor( TagType() , i , update ); }
__device__ inline
void operator() () const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
{
reference_type value =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
iwork < iwork_end ; iwork += blockDim.y ) {
this-> template exec_range< WorkTag >( iwork , value );
}
}
// Reduce with final value at blockDim.y - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
__device__ inline
void run(const DummyShflReductionType&) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
iwork < iwork_end ; iwork += blockDim.y ) {
this-> template exec_range< WorkTag >( iwork , value );
}
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
int max_active_thread = range.end()-range.begin() < blockDim.y ? range.end() - range.begin():blockDim.y;
max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
*result = value;
}
}
}
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
unsigned n = CudaTraits::WarpSize * 8 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
return n ;
}
inline
void execute()
{
const int nwork = m_policy.end() - m_policy.begin();
if ( nwork ) {
const int block_size = local_block_size( m_functor );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
// REQUIRED ( 1 , N , 1 )
const dim3 block( 1 , block_size , 1 );
// Required grid.x <= block.y
const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) / block.y ) ) , 1 , 1 );
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
else {
if (m_result_ptr) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
}
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{ }
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ReducerType & reducer)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{ }
};
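// Illustrative use of the RangePolicy ParallelReduce above (a minimal sketch;
// the View y is an assumption):
//
//   double sum = 0;
//   Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda >( 0 , N ) ,
//     KOKKOS_LAMBDA( const int i , double & update ) { update += y(i); } , sum );
//
// Partial results are staged in m_scratch_space / m_unified_space and copied
// back to the host result pointer after the Cuda::fence() in execute().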
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type value_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints: the block size (blockDim.y) is a power of two AND blockDim.x == blockDim.z == 1
const FunctorType m_functor ;
const Policy m_policy ; // used for workrange and nwork
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Use the shfl-based reduction only for statically sized value types of more than 128 bits (2*sizeof(double))
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Helper tag types used to select between the two run() overloads below
private:
typedef double DummyShflReductionType;
typedef int DummySHMEMReductionType;
public:
inline
__device__
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline
__device__
void operator() (void) const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
{
reference_type value =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
this-> exec_range( value );
}
// Reduce with final value at blockDim.y - 1 location.
// Problem: non power-of-two blockDim
if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
__device__ inline
void run(const DummyShflReductionType&) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const Member work_part =
( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
this-> exec_range( value );
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
*result = value;
}
}
}
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
unsigned n = CudaTraits::WarpSize * 8 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
return n ;
}
inline
void execute()
{
const int nwork = m_policy.m_num_tiles;
if ( nwork ) {
int block_size = m_policy.m_prod_tile_dims;
// CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
// Nearest power of two
int exponent_pow_two = std::ceil( std::log2(block_size) );
block_size = std::pow(2, exponent_pow_two);
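// e.g. a tile product of 24 gives exponent_pow_two = ceil(log2(24)) = 5 and
// therefore block_size = 32, before comparison with the suggested block size below.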
int suggested_blocksize = local_block_size( m_functor );
block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
// REQUIRED ( 1 , N , 1 )
const dim3 block( 1 , block_size , 1 );
// Required grid.x <= block.y
const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
else {
if (m_result_ptr) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
}
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ReducerType & reducer)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
};
//----------------------------------------------------------------------------
#if 1
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename ValueTraits::value_type value_type ;
public:
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
enum { UseShflReduction = (true && ValueTraits::StaticValueSize) };
private:
typedef double DummyShflReductionType;
typedef int DummySHMEMReductionType;
// Algorithmic constraints: blockDim.y is a power of two AND blockDim.z == 1
// shared memory utilization:
//
// [ global reduce space ]
// [ team reduce space ]
// [ team shared space ]
//
const FunctorType m_functor ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
size_type m_team_begin ;
size_type m_shmem_begin ;
size_type m_shmem_size ;
void* m_scratch_ptr[2] ;
int m_scratch_size[2] ;
const size_type m_league_size ;
const size_type m_team_size ;
const size_type m_vector_size ;
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const Member & member , reference_type update ) const
{ m_functor( member , update ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const Member & member , reference_type update ) const
{ m_functor( TagType() , member , update ); }
public:
__device__ inline
void operator() () const {
int threadid = 0;
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
}
__syncthreads();
threadid = base_thread_id;
}
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0), threadid );
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
__device__ inline
void run(const DummySHMEMReductionType&, const int& threadid) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
reference_type value =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Iterate this block through the league
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
, value );
}
// Reduce with final value at blockDim.y - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,FunctorType,WorkTag>(
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
__device__ inline
void run(const DummyShflReductionType&, const int& threadid) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Iterate this block through the league
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
, value );
}
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
*result = value;
}
}
}
inline
void execute()
{
const int nwork = m_league_size * m_team_size ;
if ( nwork ) {
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
:std::min( m_league_size , m_team_size );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
const dim3 block( m_vector_size , m_team_size , 1 );
const dim3 grid( block_count , 1 , 1 );
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
}
}
}
else {
if (m_result_ptr) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
}
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
, m_team_begin( 0 )
, m_shmem_begin( 0 )
, m_shmem_size( 0 )
, m_scratch_ptr{NULL,NULL}
, m_scratch_size{
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
)}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , arg_result.ptr_on_device() );
return ;
}
m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
m_shmem_begin = sizeof(double) * ( m_team_size + 2 );
m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
m_scratch_ptr[1] = cuda_resize_scratch_space(static_cast<std::int64_t>(m_scratch_size[1])*(static_cast<std::int64_t>(Cuda::concurrency()/(m_team_size*m_vector_size))));
m_scratch_size[0] = m_shmem_size;
m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size);
// The global parallel_reduce does not support vector_length other than 1 at the moment
if( (arg_policy.vector_length() > 1) && !UseShflReduction )
Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types.");
if( (m_team_size < 32) && !UseShflReduction )
Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types.");
// Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
if (! Kokkos::Impl::is_integral_power_of_two( m_team_size ) && !UseShflReduction ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}
if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
}
if ( unsigned(m_team_size) >
unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
}
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ReducerType & reducer)
: m_functor( arg_functor )
, m_reducer( reducer )
, m_result_ptr( reducer.view().ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
, m_team_begin( 0 )
, m_shmem_begin( 0 )
, m_shmem_size( 0 )
, m_scratch_ptr{NULL,NULL}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
return ;
}
m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
m_shmem_begin = sizeof(double) * ( m_team_size + 2 );
m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
m_scratch_size[0] = m_shmem_size;
m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size);
// The global parallel_reduce does not support vector_length other than 1 at the moment
if( (arg_policy.vector_length() > 1) && !UseShflReduction )
Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types.");
if( (m_team_size < 32) && !UseShflReduction )
Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types.");
// Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
if ( (! Kokkos::Impl::is_integral_power_of_two( m_team_size ) && !UseShflReduction ) ||
CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}
if ( int(m_team_size) >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
}
};
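// Illustrative use of the TeamPolicy ParallelReduce above (a minimal sketch;
// league_size is an example value). Note the constraints enforced in the
// constructors: for dynamically sized reduction value types the vector length
// must be 1 and the team size at least 32.
//
//   double sum = 0;
//   typedef Kokkos::TeamPolicy< Kokkos::Cuda >::member_type member_type ;
//   Kokkos::parallel_reduce( Kokkos::TeamPolicy< Kokkos::Cuda >( league_size , Kokkos::AUTO ) ,
//     KOKKOS_LAMBDA( const member_type & team , double & update )
//     { update += double( team.league_rank() ); } , sum );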
//----------------------------------------------------------------------------
#else
//----------------------------------------------------------------------------
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
enum : int { align_scratch_value = 0x0100 /* 256 */ };
enum : int { align_scratch_mask = align_scratch_value - 1 };
KOKKOS_INLINE_FUNCTION static constexpr
int align_scratch( const int n )
{
return ( n & align_scratch_mask )
? n + align_scratch_value - ( n & align_scratch_mask ) : n ;
}
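// e.g. align_scratch(1) == 256 and align_scratch(300) == 512 : scratch sizes
// are rounded up to the next multiple of 256 bytes.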
//----------------------------------------
// Reducer does not wrap a functor
template< class R = ReducerType , class F = void >
struct reducer_type : public R {
template< class S >
using rebind = reducer_type< typename R::rebind<S> , void > ;
KOKKOS_INLINE_FUNCTION
reducer_type( FunctorType const *
, ReducerType const * arg_reducer
, typename R::value_type * arg_value )
: R( *arg_reducer , arg_value ) {}
};
// Reducer does wrap a functor
template< class R >
struct reducer_type< R , FunctorType > : public R {
template< class S >
using rebind = reducer_type< typename R::rebind<S> , FunctorType > ;
KOKKOS_INLINE_FUNCTION
reducer_type( FunctorType const * arg_functor
, ReducerType const *
, typename R::value_type * arg_value )
: R( arg_functor , arg_value ) {}
};
//----------------------------------------
typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ;
typedef CudaTeamMember Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename reducer_type<>::pointer_type pointer_type ;
typedef typename reducer_type<>::reference_type reference_type ;
typedef typename reducer_type<>::value_type value_type ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::REDUCE
, Policy
, FunctorType
> Analysis ;
public:
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
private:
const FunctorType m_functor ;
const reducer_type<> m_reducer ;
size_type * m_scratch_space ;
size_type * m_unified_space ;
size_type m_team_begin ;
size_type m_shmem_begin ;
size_type m_shmem_size ;
void* m_scratch_ptr[2] ;
int m_scratch_size[2] ;
const size_type m_league_size ;
const size_type m_team_size ;
const size_type m_vector_size ;
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const Member & member , reference_type update ) const
{ m_functor( member , update ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const Member & member , reference_type update ) const
{ m_functor( TagType() , member , update ); }
public:
__device__ inline
void operator() () const
{
void * const shmem = kokkos_impl_cuda_shared_memory<char>();
const bool reduce_to_host =
std::is_same< typename reducer_type<>::memory_space
, Kokkos::HostSpace >::value &&
m_reducer.data();
value_type value ;
typename reducer_type<>::rebind< CudaSpace >
reduce( & m_functor , & m_reducer , & value );
reduce.init( reduce.data() );
// Iterate this block through the league
for ( int league_rank = blockIdx.x
; league_rank < m_league_size
; league_rank += gridDim.x ) {
// Initialization of team member data:
const Member member
( shmem
, m_shmem_team_begin
, m_shmem_team_size
, reinterpret_cast<char*>(m_scratch_space) + m_global_team_begin
, m_global_team_size
, league_rank
, m_league_size );
ParallelReduce::template
exec_team< WorkTag >( member , reduce.reference() );
}
if ( Member::global_reduce( reduce
, m_scratch_space
, reinterpret_cast<char*>(m_scratch_space)
+ aligned_flag_size
, shmem
, m_shmem_size ) ) {
// Single thread with data in value
reduce.final( reduce.data() );
if ( reduce_to_host ) {
m_reducer.copy( m_unified_space , reduce.data() );
}
}
}
inline
void execute()
{
const bool reduce_to_host =
std::is_same< typename reducer_type<>::memory_space
, Kokkos::HostSpace >::value &&
m_reducer.data();
const bool reduce_to_gpu =
std::is_same< typename reducer_type<>::memory_space
, Kokkos::CudaSpace >::value &&
m_reducer.data();
if ( m_league_size && m_team_size ) {
const int value_size = Analysis::value_size( m_functor );
m_scratch_space = cuda_internal_scratch_space( m_scratch_size );
m_unified_space = cuda_internal_scratch_unified( value_size );
const dim3 block( m_vector_size , m_team_size , m_team_per_block );
const dim3 grid( m_league_size , 1 , 1 );
const int shmem = m_shmem_team_begin + m_shmem_team_size ;
// copy to device and execute
CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem );
Cuda::fence();
if ( reduce_to_host ) {
m_reducer.copy( m_reducer.data() , pointer_type(m_unified_space) );
}
}
else if ( reduce_to_host ) {
m_reducer.init( m_reducer.data() );
}
else if ( reduce_to_gpu ) {
value_type tmp ;
m_reducer.init( & tmp );
cudaMemcpy( m_reducer.data() , & tmp , sizeof(value_type) , cudaMemcpyHostToDevice );
}
}
/**\brief Set up parameters and allocations for kernel launch.
*
* block = { vector_size , team_size , team_per_block }
* grid = { number_of_teams , 1 , 1 }
*
* shmem = shared memory for:
* [ team_reduce_buffer
* , team_scratch_buffer_level_0 ]
* reused by:
* [ global_reduce_buffer ]
*
* global_scratch for:
* [ global_reduce_flag_buffer
* , global_reduce_value_buffer
* , team_scratch_buffer_level_1 * max_concurrent_team ]
*/
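// Worked example of the layout above, with illustrative numbers only
// (assumes align_scratch pads to an 8-byte multiple; the exact padding is
// an implementation detail). For team_threads = 64 (two warps),
// value_size = 16, no level-1 scratch, and max_concurrent_block = 120:
//
//   m_shmem_team_begin       = align_scratch( 2 * sizeof(double) )     = 16
//   aligned_flag_size        = align_scratch( sizeof(int) )            = 8
//   global_reduce_value_size = 120 * ( 8 + align_scratch(16) )         = 2880
//   m_global_reduce_begin    = 8
//   m_global_team_begin      = 8 + 2880                                = 2888
//   m_global_size            = 2888 + global_team_scratch_size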
ParallelReduce( FunctorType && arg_functor
, Policy && arg_policy
, ReducerType const & arg_reducer
)
: m_functor( arg_functor )
// The input reducer may wrap the input functor, so a reducer
// bound to the copied functor must be generated here.
, m_reducer( & m_functor , & arg_reducer , arg_reducer.data() )
, m_scratch_space( 0 )
, m_unified_space( 0 )
, m_team_begin( 0 )
, m_shmem_begin( 0 )
, m_shmem_size( 0 )
, m_scratch_ptr{NULL,NULL}
, m_league_size( arg_policy.league_size() )
, m_team_per_block( 0 )
, m_team_size( arg_policy.team_size() )
, m_vector_size( arg_policy.vector_length() )
{
if ( 0 == m_league_size ) return ;
const int value_size = Analysis::value_size( m_functor );
//----------------------------------------
// Vector length must be <= WarpSize and a power of two
const bool ok_vector = m_vector_size <= CudaTraits::WarpSize &&
Kokkos::Impl::is_integral_power_of_two( m_vector_size );
//----------------------------------------
if ( 0 == m_team_size ) {
// Team size is AUTO, use a whole block per team.
// Calculate block size using the occupancy calculator.
// Occupancy calculator assumes whole block.
m_team_size =
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >
( arg_functor
, arg_policy.vector_length()
, arg_policy.team_scratch_size(0)
, arg_policy.thread_scratch_size(0) / arg_policy.vector_length() );
m_team_per_block = 1 ;
}
//----------------------------------------
// How many CUDA threads per team.
// If a team needs more than a warp, or teams cannot exactly fill a warp,
// then use only one team per block.
const int team_threads = m_team_size * m_vector_size ;
if ( ( CudaTraits::WarpSize < team_threads ) ||
( CudaTraits::WarpSize % team_threads ) ) {
m_team_per_block = 1 ;
}
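// Worked example: team_size = 8 with vector_size = 4 gives team_threads = 32,
// which exactly fills a warp, so multiple teams per block remain possible.
// team_size = 16 with vector_size = 4 gives 64 ( > WarpSize ), and
// team_size = 3 with vector_size = 4 gives 12 ( 32 % 12 != 0 ); either case
// forces a single team per block.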
//----------------------------------------
// How much team scratch shared memory determined from
// either the functor or the policy:
if ( CudaTraits::WarpSize < team_threads ) {
// Need inter-warp team reduction (collectives) shared memory
// Speculate an upper bound for the value size
m_shmem_team_begin =
align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
}
m_shmem_team_size = arg_policy.scratch_size(0,m_team_size);
if ( 0 == m_shmem_team_size ) {
m_shmem_team_size = Analysis::team_shmem_size( m_functor , m_team_size );
}
m_shmem_team_size = align_scratch( m_shmem_team_size );
// Can fit a team in a block:
const bool ok_shmem_team =
( m_shmem_team_begin + m_shmem_team_size )
< CudaTraits::SharedMemoryCapacity ;
//----------------------------------------
if ( 0 == m_team_per_block ) {
// Potentially more than one team per block.
// Determine the number of teams per block based upon
// how much team scratch can fit while exactly filling each warp.
const int team_per_warp = CudaTraits::WarpSize / team_threads ;
// Per-team shared memory footprint; guard against a zero footprint
// when no team shared memory is required:
const int shmem_team_scratch_size = m_shmem_team_begin + m_shmem_team_size ;
const int max_team_per_block =
  Kokkos::Impl::CudaTraits::SharedMemoryCapacity
  / ( shmem_team_scratch_size ? shmem_team_scratch_size : 1 );
for ( m_team_per_block = team_per_warp ;
m_team_per_block + team_per_warp < max_team_per_block ;
m_team_per_block += team_per_warp );
}
//----------------------------------------
// How much global reduce scratch shared memory.
int shmem_global_reduce_size = 8 * value_size ;
//----------------------------------------
// Global scratch memory requirements.
const int aligned_flag_size = align_scratch( sizeof(int) );
const int max_concurrent_block =
cuda_internal_maximum_concurrent_block_count();
// Reduce space has claim flag followed by value buffer
const int global_reduce_value_size =
max_concurrent_block *
( aligned_flag_size + align_scratch( value_size ) );
// Scratch space has claim flag followed by scratch buffer
const int global_team_scratch_size =
max_concurrent_block * m_team_per_block *
( aligned_flag_size +
align_scratch( arg_policy.scratch_size(1,m_team_size) / m_vector_size )
);
const int global_size = aligned_flag_size
+ global_reduce_value_size
+ global_team_scratch_size ;
m_global_reduce_begin = aligned_flag_size ;
m_global_team_begin = m_global_reduce_begin + global_reduce_value_size ;
m_global_size = m_global_team_begin + global_team_scratch_size ;
}
};
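// Illustrative usage that dispatches through this specialization (a sketch
// with hypothetical user code, not part of this header): a team-level
// reduction over a TeamPolicy< Cuda >. league_size, vector_length, and
// work_per_team are placeholder values.
//
//   double result = 0 ;
//   Kokkos::TeamPolicy< Kokkos::Cuda > policy( league_size , Kokkos::AUTO , vector_length );
//   Kokkos::parallel_reduce( policy ,
//     KOKKOS_LAMBDA( const Kokkos::TeamPolicy< Kokkos::Cuda >::member_type & team , double & update ) {
//       double team_sum = 0 ;
//       Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team , work_per_team ) ,
//         [&]( const int i , double & sum ) { sum += 1.0 * i ; } , team_sum );
//       Kokkos::single( Kokkos::PerTeam( team ) , [&]() { update += team_sum ; } );
//     } , result );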
#endif
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
private:
// Algorithmic constraints:
// (a) blockDim.y is a power of two
// (b) blockDim.x == blockDim.z == 1
// (c) gridDim.x <= blockDim.y * blockDim.y
// (d) gridDim.y == gridDim.z == 1
const FunctorType m_functor ;
const Policy m_policy ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type m_final ;
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const Member & i , reference_type update , const bool final_result ) const
{ m_functor( i , update , final_result ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const Member & i , reference_type update , const bool final_result ) const
{ m_functor( TagType() , i , update , final_result ); }
//----------------------------------------
__device__ inline
void initial(void) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
size_type * const shared_value = kokkos_impl_cuda_shared_memory<size_type>() + word_count.value * threadIdx.y ;
ValueInit::init( m_functor , shared_value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
iwork < iwork_end ; iwork += blockDim.y ) {
this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_value ) , false );
}
// Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
// Blocks' scan values are written to 'blockIdx.x' location.
// Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < gridDim.x
cuda_single_inter_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags );
}
//----------------------------------------
__device__ inline
void final(void) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
size_type * const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type * const shared_prefix = shared_data + word_count.value * threadIdx.y ;
size_type * const shared_accum = shared_data + word_count.value * ( blockDim.y + 1 );
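// Slot layout used below, written out for reference (a descriptive sketch of
// the code as written; one slot is word_count.value size_type words):
//   slot 0              : carry-in / exclusive prefix for thread 0
//   slot y              : exclusive prefix for thread y ( shared_prefix )
//   slot y + 1          : inclusive scan value for thread y
//   slot blockDim.y     : this block's total after the intra-block scan
//   slot blockDim.y + 1 : shared_accum, running total carried across iterations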
// Starting value for this thread block is the previous block's total.
if ( blockIdx.x ) {
size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
}
else if ( 0 == threadIdx.y ) {
ValueInit::init( m_functor , shared_accum );
}
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
__syncthreads(); // Don't overwrite previous iteration values until they are used
ValueInit::init( m_functor , shared_prefix + word_count.value );
// Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
if ( iwork < range.end() ) {
this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_prefix + word_count.value ) , false );
}
// Scan block values into locations shared_data[1..blockDim.y]
cuda_intra_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , typename ValueTraits::pointer_type(shared_data+word_count.value) );
{
size_type * const block_total = shared_data + word_count.value * blockDim.y ;
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
}
// Call functor with exclusive scan value
if ( iwork < range.end() ) {
this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_prefix ) , true );
}
}
}
public:
//----------------------------------------
__device__ inline
void operator()(void) const
{
if ( ! m_final ) {
initial();
}
else {
final();
}
}
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
// blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps)
// gridDim.x <= blockDim.y * blockDim.y
//
// 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
unsigned n = CudaTraits::WarpSize * 4 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
return n ;
}
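// Worked example (illustrative): if the functor's value is large enough that
// the inter-block reduce/scan buffer for n = 128 threads exceeds
// CudaTraits::SharedMemoryCapacity, n is halved to 64, then 32, and so on;
// n == 0 means the reduction scratch cannot fit in shared memory at any block size.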
inline
void execute()
{
const int nwork = m_policy.end() - m_policy.begin();
if ( nwork ) {
enum { GridMaxComputeCapability_2x = 0x0ffff };
const int block_size = local_block_size( m_functor );
const int grid_max =
( block_size * block_size ) < GridMaxComputeCapability_2x ?
( block_size * block_size ) : GridMaxComputeCapability_2x ;
// At most 'max_grid' blocks:
const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
// How much work per block:
const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
// How many blocks are really needed for this much work:
const int grid_x = ( nwork + work_per_block - 1 ) / work_per_block ;
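// Worked example with illustrative numbers: nwork = 10000 and block_size = 128 give
//   grid_max       = min( 128 * 128 , 0x0ffff )      = 16384
//   max_grid       = min( 16384 , (10000+127)/128 )  = 79
//   work_per_block = ( 10000 + 78 ) / 79             = 127
//   grid_x         = ( 10000 + 126 ) / 127           = 79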
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * grid_x );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
const dim3 grid( grid_x , 1 , 1 );
const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
m_final = false ;
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
}
}
ParallelScan( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_final( false )
{ }
};
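// Illustrative usage that dispatches through this specialization (a sketch
// with hypothetical user code, not part of this header): an exclusive prefix
// sum over a RangePolicy< Cuda >. counts, offsets, and n are placeholders.
//
//   Kokkos::View< int * , Kokkos::CudaSpace > counts( "counts" , n );
//   Kokkos::View< int * , Kokkos::CudaSpace > offsets( "offsets" , n );
//   Kokkos::parallel_scan( Kokkos::RangePolicy< Kokkos::Cuda >( 0 , n ) ,
//     KOKKOS_LAMBDA( const int i , int & update , const bool final_pass ) {
//       if ( final_pass ) { offsets( i ) = update ; }  // exclusive prefix value
//       update += counts( i ) ;
//     } );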
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType, class ExecPolicy, class ValueType , class Tag = typename ExecPolicy::work_tag>
struct CudaFunctorAdapter {
const FunctorType f;
typedef ValueType value_type;
CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
__device__ inline
void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, ValueType& val) const {
// TODO: static_assert (via decltype) that ValueType equals the third argument type of FunctorType::operator()
f(typename ExecPolicy::work_tag(), i,val);
}
};
template< class FunctorType, class ExecPolicy, class ValueType >
struct CudaFunctorAdapter<FunctorType,ExecPolicy,ValueType,void> {
const FunctorType f;
typedef ValueType value_type;
CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
__device__ inline
void operator() (const typename ExecPolicy::member_type& i, ValueType& val) const {
// TODO: static_assert (via decltype) that ValueType equals the second argument type of FunctorType::operator()
f(i,val);
}
__device__ inline
void operator() (typename ExecPolicy::member_type& i, ValueType& val) const {
// TODO: static_assert (via decltype) that ValueType equals the second argument type of FunctorType::operator()
f(i,val);
}
};
template<class FunctorType, class ResultType, class Tag, bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value >
struct FunctorReferenceType {
typedef ResultType& reference_type;
};
template<class FunctorType, class ResultType, class Tag>
struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type;
};
template< class FunctorTypeIn, class ExecPolicy, class ValueType>
struct ParallelReduceFunctorType<FunctorTypeIn,ExecPolicy,ValueType,Cuda> {
enum {FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ValueType> >::type functor_type;
static functor_type functor(const FunctorTypeIn& functor_in) {
return Impl::if_c<FunctorHasValueType,FunctorTypeIn,functor_type>::select(functor_in,functor_type(functor_in));
}
};
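// Illustrative example of the selection above (hypothetical user code): a
// plain lambda such as
//   [=] __device__ ( const int i , double & sum ) { sum += 1.0 ; }
// declares no nested value_type, so IsNonTrivialReduceFunctor is false and
// the lambda is wrapped in CudaFunctorAdapter< Lambda , Policy , double >,
// which forwards operator() and publishes value_type = double for the
// reduction machinery. A functor that already defines value_type (and its
// own init/join) is passed through unchanged.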
}
} // namespace Kokkos
#endif /* defined( __CUDACC__ ) */
#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
