Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F62109852
KokkosExp_Cuda_IterateTile_Refactor.hpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, May 10, 23:29
Size
107 KB
Mime Type
text/x-c++
Expires
Sun, May 12, 23:29 (2 d)
Engine
blob
Format
Raw Data
Handle
17604879
Attached To
rLAMMPS lammps
KokkosExp_Cuda_IterateTile_Refactor.hpp
View Options
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
#include <Kokkos_Macros.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <iostream>
#include <algorithm>
#include <cstdio>
#include <utility>
// #include<Cuda/Kokkos_CudaExec.hpp>
// Including the file above leads to following type of errors:
// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
// use existing Kokkos functionality, e.g. max blocks, once resolved
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Refactor {
// ------------------------------------------------------------------ //
// ParallelFor iteration pattern
template< int N , typename RP , typename Functor , typename Tag >
struct DeviceIterateTile;
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<2,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
}
}
}
}
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<2,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
}
}
}
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<3,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for void tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<3,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<4,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
// LR
else {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for void tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<4,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if (RP::inner_direction == RP::Left) {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
else {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<5,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<5,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<6,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<6,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
} // Refactor
// ----------------------------------------------------------------------------------
namespace Reduce {
template < typename T >
using is_void = std::is_same< T, void >;
template < typename T >
struct is_array_type : std::false_type
{
using value_type = T;
};
template < typename T >
struct is_array_type< T* > : std::true_type
{
using value_type = T;
};
template < typename T >
struct is_array_type< T[] > : std::true_type
{
using value_type = T;
};
// ------------------------------------------------------------------ //
template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void >
struct DeviceIterateTile;
// ParallelReduce iteration pattern
// Scalar reductions
// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of tiles and reduction algorithm constraints
// extract n-dim tile offsets (i.e. tile's global starting mulit-index) from the tileid = blockid using tile dimensions
// local indices within a tile extracted from (index_type)threadIdx.x using tile dims, constrained by blocksize
// combine tile and local id info for multi-dim global ids
// Pattern:
// Each block+thread is responsible for a tile+local_id combo (additional when striding by num_blocks)
// 1. create offset arrays
// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max num blocks)
// 3. temps set for tile_idx and thrd_idx, which will be modified
// 4. if LL vs LR:
// determine tile starting point offsets (multidim)
// determine local index offsets (multidim)
// concatentate tile offset + local offset for global multi-dim index
// if offset withinin range bounds AND local offset within tile bounds, call functor
// ValueType = T
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
// Deduce this blocks tile_id
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{
m_func( m_offset[0], m_offset[1], m_v );
}
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for void tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for void tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// ValueType = T[], T*
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{
m_func( m_offset[0], m_offset[1], m_v );
}
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
} //end for loop over num_tiles - product of tiles in each direction
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for void tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for void tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile starting global id offset
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
} // Reduce
// ----------------------------------------------------------------------------------
} } } //end namespace Kokkos::Experimental::Impl
#endif
#endif
Event Timeline
Log In to Comment