KokkosExp_Cuda_IterateTile_Refactor.hpp

/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
#include <Kokkos_Macros.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <iostream>
#include <algorithm>
#include <cstdio>
#include <utility>
// #include<Cuda/Kokkos_CudaExec.hpp>
// Including the file above leads to the following type of error:
// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
// Use existing Kokkos functionality (e.g. the max-blocks query) once this is resolved
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Refactor {
// ------------------------------------------------------------------ //
// ParallelFor iteration pattern
template< int N , typename RP , typename Functor , typename Tag >
struct DeviceIterateTile;
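// Usage sketch (an assumption about the launch path, which is not part of this
// file): the MDRangePolicy launch kernel is expected to construct this helper on
// the device and call exec_range(), e.g.
//   DeviceIterateTile< 2, RP, Functor, void >( rp, functor ).exec_range();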
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<2,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
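// With Left (LL) iteration dim 0 is the fastest-running index and forms the
// innermost loop; with Right (LR) the nesting order below is reversed.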
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
}
}
}
}
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<2,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
if (RP::inner_direction == RP::Left) {
// Loop over tiles, striding by gridDim (at most the max number of blocks), until the full range is covered
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
}
}
}
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<3,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<3,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
inline __device__
void exec_range() const
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<4,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
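// 65535 is the CUDA hardware limit on gridDim.y and gridDim.z (and the legacy
// limit on gridDim.x), used here as a conservative cap until the commented-out
// Kokkos grid-count query below can be used.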
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
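// blockIdx.x encodes the tiles of dims 0 and 1 (numbl0*numbl1 <= max_blocks),
// and threadIdx.x encodes the thread-local ids of those dims; with the Left
// layout dim 0 is the fast-running index, hence the % / pair above.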
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
// LR
else {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<4,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if (RP::inner_direction == RP::Left) {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
else {
const index_type temp0 = m_rp.m_tile_end[0];
const index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<5,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
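// For ranks 5 and 6, blockIdx.y/threadIdx.y are split across dims 2 and 3 in
// the same way blockIdx.x/threadIdx.x are split across dims 0 and 1 (rank 6
// additionally splits blockIdx.z/threadIdx.z across dims 4 and 5).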
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<5,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor >
struct DeviceIterateTile<6,RP,Functor,void >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag >
struct DeviceIterateTile<6,RP,Functor,Tag>
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ )
: m_rp(rp_)
, m_func(f_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
// LL
if (RP::inner_direction == RP::Left) {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
( temp1 <= max_blocks ? temp1 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
// LR
else {
index_type temp0 = m_rp.m_tile_end[0];
index_type temp1 = m_rp.m_tile_end[1];
const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
temp0 = m_rp.m_tile_end[2];
temp1 = m_rp.m_tile_end[3];
const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
temp0 = m_rp.m_tile_end[4];
temp1 = m_rp.m_tile_end[5];
const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
( temp0 <= max_blocks ? temp0 : max_blocks ) );
const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
}
}
}
}
}
}
}
}
}
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
};
} // Refactor
// ----------------------------------------------------------------------------------
namespace Reduce {
template < typename T >
using is_void = std::is_same< T, void >;
template < typename T >
struct is_array_type : std::false_type
{
using value_type = T;
};
template < typename T >
struct is_array_type< T* > : std::true_type
{
using value_type = T;
};
template < typename T >
struct is_array_type< T[] > : std::true_type
{
using value_type = T;
};
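// e.g. is_array_type<double>::value == false, while is_array_type<double*>::value
// and is_array_type<double[]>::value are true; the enable_if dispatch below uses
// this to select the scalar-reduction specializations.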
// ------------------------------------------------------------------ //
template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void >
struct DeviceIterateTile;
// ParallelReduce iteration pattern
// Scalar reductions
// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of tiles and reduction algorithm constraints
// extract n-dim tile offsets (i.e. tile's global starting multi-index) from the tileid = blockid using tile dimensions
// local indices within a tile extracted from (index_type)threadIdx.y using tile dims, constrained by blocksize
// combine tile and local id info for multi-dim global ids
// Pattern:
// Each block+thread is responsible for a tile+local_id combo (additional when striding by num_blocks)
// 1. create offset arrays
// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max num blocks)
// 3. temps set for tile_idx and thrd_idx, which will be modified
// 4. if LL vs LR:
// determine tile starting point offsets (multidim)
// determine local index offsets (multidim)
// concatenate tile offset + local offset for global multi-dim index
// if offset within range bounds AND local offset within tile bounds, call functor
// ValueType = T
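// Worked example (hypothetical numbers, rank 2, LL): with m_tile_end = {4,3},
// m_tile = {8,8} and m_lower = {0,0}, tile_idx = 9 decomposes as
// tile_id0 = 9 % 4 = 1 and tile_id1 = 9 / 4 = 2, i.e. tile offsets {8,16};
// a thread with thrd_idx = 13 yields local offsets {13 % 8, 13 / 8} = {5,1},
// so the functor is called with the global index {13,17}.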
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
// Deduce this block's tile_id
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{
m_func( m_offset[0], m_offset[1], m_v );
}
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // thread's local id offset within the tile
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds &= false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
ValueType & m_v;
};
// ValueType = T[], T*
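// The specializations below mirror the scalar-value ones above; the only
// difference is that the reduction value is an array (ValueType of T[] or
// T*), so it is carried as a value_type* rather than by reference.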
//Rank 2
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
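// Hedged sketch (illustrative only) of an array-valued reduction functor
// compatible with the m_func( i0, i1, m_v ) call above; the functor name and
// the value_count initialization are hypothetical.
//
//   struct SumPerChannel {
//     typedef double value_type[];  // array-reduction marker
//     const int value_count;        // runtime length of the reduction array
//     SumPerChannel( int n ) : value_count(n) {}
//     KOKKOS_INLINE_FUNCTION
//     void operator()( const int i0, const int i1, double lsum[] ) const {
//       for ( int c = 0; c < value_count; ++c ) lsum[c] += 1.0;
//     }
//   };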
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
}
} //end for loop over num_tiles - product of tiles in each direction
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 3
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
inline __device__
void exec_range() const
{
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 4
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
inline __device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 5
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
//Rank 6
// Specializations for void tag type
template< typename RP , typename Functor , typename ValueType >
struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
// Specializations for tag type
template< typename RP , typename Functor , typename Tag, typename ValueType >
struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
{
using index_type = typename RP::index_type;
using value_type = typename is_array_type< ValueType >::value_type;
__device__
DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
: m_rp(rp_)
, m_func(f_)
, m_v(v_)
{}
static constexpr index_type max_blocks = 65535;
//static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
inline __device__
void exec_range() const
{
//enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
//const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
index_type m_offset[RP::rank]; // tile starting global id offset
index_type m_local_offset[RP::rank]; // tile-local id offset for this thread
for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
index_type thrd_idx = (index_type)threadIdx.y;
bool in_bounds = true;
// LL
if (RP::inner_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
// LR
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
// tile-local indices identified with (index_type)threadIdx.y
m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
thrd_idx /= m_rp.m_tile[i];
m_offset[i] += m_local_offset[i];
if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
in_bounds = false;
}
}
if ( in_bounds )
{ m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
}
}
}
} //end exec_range
private:
const RP & m_rp;
const Functor & m_func;
value_type* m_v;
};
} // namespace Reduce
// ----------------------------------------------------------------------------------
} } } //end namespace Kokkos::Experimental::Impl
#endif // defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#endif // KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
