KokkosExp_Cuda_IterateTile_Refactor.hpp

/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
#include <Kokkos_Macros.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <iostream>
#include <algorithm>
#include <cstdio>
#include <utility>
// #include<Cuda/Kokkos_CudaExec.hpp>
// Including the file above leads to the following type of error:
// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
// Use existing Kokkos functionality (e.g. the max-blocks query) once this is resolved
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Refactor {
// ------------------------------------------------------------------ //
// ParallelFor iteration pattern
template< int N , typename RP , typename Functor , typename Tag >
struct DeviceIterateTile;
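// Usage sketch (an assumption about the launch path, which is not part of this
// file): the MDRangePolicy launch kernel is expected to construct this helper on
// the device and call exec_range(), e.g.
//   DeviceIterateTile< 2, RP, Functor, void >( rp, functor ).exec_range();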
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<2,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
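// With Left (LL) iteration dim 0 is the fastest-running index and forms the
// innermost loop; with Right (LR) the nesting order below is reversed.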
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
}
}
}
}
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<2,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
if (RP::inner_direction == RP::Left) {
// Loop over tiles, striding by gridDim (at most the max number of blocks), until the full range is covered
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
}
}
}
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<3,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<3,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<4,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
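// 65535 is the CUDA hardware limit on gridDim.y and gridDim.z (and the legacy
// limit on gridDim.x), used here as a conservative cap until the commented-out
// Kokkos grid-count query below can be used.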
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
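// blockIdx.x encodes the tiles of dims 0 and 1 (numbl0*numbl1 <= max_blocks),
// and threadIdx.x encodes the thread-local ids of those dims; with the Left
// layout dim 0 is the fast-running index, hence the % / pair above.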
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
// LR
else {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<4,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if (RP::inner_direction == RP::Left) {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
else {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<5,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
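// For ranks 5 and 6, blockIdx.y/threadIdx.y are split across dims 2 and 3 in
// the same way blockIdx.x/threadIdx.x are split across dims 0 and 1 (rank 6
// additionally splits blockIdx.z/threadIdx.z across dims 4 and 5).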
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<5,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<6,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<6,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
} // Refactor
// ----------------------------------------------------------------------------------
namespace Reduce {
template < typename T >
using is_void = std::is_same< T, void >;
template < typename T >
struct is_array_type : std::false_type
{
using value_type = T;
};
template < typename T >
struct is_array_type< T* > : std::true_type
{
using value_type = T;
};
template < typename T >
struct is_array_type< T[] > : std::true_type
{
using value_type = T;
};
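// e.g. is_array_type<double>::value == false, while is_array_type<double*>::value
// and is_array_type<double[]>::value are true; the enable_if dispatch below uses
// this to select the scalar-reduction specializations.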
// ------------------------------------------------------------------ //
template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void >
struct DeviceIterateTile;
// ParallelReduce iteration pattern
// Scalar reductions
// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of tiles and reduction algorithm constraints
// extract n-dim tile offsets (i.e. tile's global starting multi-index) from the tileid = blockid using tile dimensions
// local indices within a tile extracted from (index_type)threadIdx.y using tile dims, constrained by blocksize
// combine tile and local id info for multi-dim global ids
// Pattern:
// Each block+thread is responsible for a tile+local_id combo (additional when striding by num_blocks)
// 1. create offset arrays
// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max num blocks)
// 3. temps set for tile_idx and thrd_idx, which will be modified
// 4. if LL vs LR:
// determine tile starting point offsets (multidim)
// determine local index offsets (multidim)
// concatenate tile offset + local offset for global multi-dim index
// if offset within range bounds AND local offset within tile bounds, call functor
// ValueType = T
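// Worked example (hypothetical numbers, rank 2, LL): with m_tile_end = {4,3},
// m_tile = {8,8} and m_lower = {0,0}, tile_idx = 9 decomposes as
// tile_id0 = 9 % 4 = 1 and tile_id1 = 9 / 4 = 2, i.e. tile offsets {8,16};
// a thread with thrd_idx = 13 yields local offsets {13 % 8, 13 / 8} = {5,1},
// so the functor is called with the global index {13,17}.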
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
// Deduce this block's tile_id
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{
m_func( m_offset[0], m_offset[1], m_v );
}
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// ValueType = T[], T*
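// The specializations below mirror the scalar-value ones above; the only
// difference is that the reduction value is an array (ValueType of T[] or
// T*), so it is carried as a value_type* rather than by reference.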
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
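// Hedged sketch (illustrative only) of an array-valued reduction functor
// compatible with the m_func( i0, i1, m_v ) call above; the functor name and
// the value_count initialization are hypothetical.
//
//   struct SumPerChannel {
//     typedef double value_type[];  // array-reduction marker
//     const int value_count;        // runtime length of the reduction array
//     SumPerChannel( int n ) : value_count(n) {}
//     KOKKOS_INLINE_FUNCTION
//     void operator()( const int i0, const int i1, double lsum[] ) const {
//       for ( int c = 0; c < value_count; ++c ) lsum[c] += 1.0;
//     }
//   };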
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
} //end for loop over num_tiles - product of tiles in each direction
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
} // namespace Reduce
// ----------------------------------------------------------------------------------
} } } //end namespace Kokkos::Experimental::Impl
#endif // defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#endif // KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
