Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F71856180
Kokkos_HostThreadTeam.hpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jul 13, 10:53
Size
35 KB
Mime Type
text/x-c++
Expires
Mon, Jul 15, 10:53 (1 d, 20 h)
Engine
blob
Format
Raw Data
Handle
19003477
Attached To
rLAMMPS lammps
Kokkos_HostThreadTeam.hpp
View Options
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP
#define KOKKOS_IMPL_HOSTTHREADTEAM_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Pair.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_Rendezvous.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace
Kokkos
{
namespace
Impl
{
template
<
class
HostExecSpace
>
class
HostThreadTeamMember
;
class HostThreadTeamData {
public:

  template< class > friend class HostThreadTeamMember ;

  // Assumed upper bounds on thread counts:
  //   pool size <= 1024 threads
  //   team size <= 64 threads
  enum : int { max_pool_members = 1024 };
  enum : int { max_team_members = 64 };
  enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) };
  enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) };

private:

  // Layout of the per-thread scratch buffer, expressed as int64_t offsets:
  //
  //   [ pool_members    ] = [ m_pool_members    .. m_pool_rendezvous )
  //   [ pool_rendezvous ] = [ m_pool_rendezvous .. m_team_rendezvous )
  //   [ team_rendezvous ] = [ m_team_rendezvous .. m_pool_reduce     )
  //   [ pool_reduce     ] = [ m_pool_reduce     .. m_team_reduce     )
  //   [ team_reduce     ] = [ m_team_reduce     .. m_team_shared     )
  //   [ team_shared     ] = [ m_team_shared     .. m_thread_local    )
  //   [ thread_local    ] = [ m_thread_local    .. m_scratch_size    )

  enum : int { m_pool_members    = 0 };
  enum : int { m_pool_rendezvous = m_pool_members    + max_pool_members };
  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
  enum : int { m_pool_reduce     = m_team_rendezvous + max_team_rendezvous };

  using pair_int_t = Kokkos::pair<int,int> ;

  pair_int_t  m_work_range ;            // [first,second) chunk indices owned by this team
  int64_t     m_work_end ;              // total work length
  int64_t   * m_scratch ;               // this thread's scratch buffer
  int64_t   * m_pool_scratch ;          // == pool[0]->m_scratch
  int64_t   * m_team_scratch ;          // == pool[ m_team_base ]->m_scratch
  int         m_pool_rank ;
  int         m_pool_size ;
  int         m_team_reduce ;           // offsets below partition m_scratch, see diagram
  int         m_team_shared ;
  int         m_thread_local ;
  int         m_scratch_size ;
  int         m_team_base ;
  int         m_team_rank ;
  int         m_team_size ;
  int         m_team_alloc ;
  int         m_league_rank ;
  int         m_league_size ;
  int         m_work_chunk ;
  int         m_steal_rank ;            // work stealing rank
  int mutable m_team_rendezvous_step ;

  // Member 'r' of this team, looked up through the pool-member pointer array
  // stored at the front of the pool's shared scratch.
  HostThreadTeamData * team_member( int r ) const noexcept
    {
      return ((HostThreadTeamData**)( m_pool_scratch + m_pool_members ))
               [ m_team_base + r ];
    }

  // Rendezvous pattern:
  //   if ( rendezvous(root) ) {
  //     ... only root thread here while all others wait ...
  //     rendezvous_release();
  //   }
  //   else {
  //     ... all other threads release here ...
  //   }
  //
  // Requires: buffer[ ( max_threads / 8 ) * 4 + 4 ]; 0 == max_threads % 8
  static int rendezvous( int64_t * const buffer
                       , int     &       rendezvous_step
                       , int       const size
                       , int       const rank ) noexcept ;

  static void rendezvous_release( int64_t * const buffer
                                , int       const rendezvous_step ) noexcept ;

public:

  // Team rendezvous with a chosen root: ranks are rotated so the designated
  // root enters the rendezvous as rank zero.  A team of one trivially "wins".
  inline
  int team_rendezvous( int const root ) const noexcept
    {
      return 1 == m_team_size ? 1 :
        HostThreadTeamData::rendezvous( m_team_scratch + m_team_rendezvous
                                      , m_team_rendezvous_step
                                      , m_team_size
                                      , ( m_team_rank + m_team_size - root ) % m_team_size );
    }

  // Team rendezvous rooted at team rank zero.
  inline
  int team_rendezvous() const noexcept
    {
      return 1 == m_team_size ? 1 :
        HostThreadTeamData::rendezvous( m_team_scratch + m_team_rendezvous
                                      , m_team_rendezvous_step
                                      , m_team_size
                                      , m_team_rank );
    }

  inline
  void team_rendezvous_release() const noexcept
    {
      if ( 1 < m_team_size ) {
        HostThreadTeamData::rendezvous_release( m_team_scratch + m_team_rendezvous
                                              , m_team_rendezvous_step );
      }
    }

  // Pool-wide rendezvous; returns non-zero on the "root" thread only.
  inline
  int pool_rendezvous() const noexcept
    {
      static constexpr int yield_wait =
#if defined( KOKKOS_COMPILER_IBM )
        // If running on IBM POWER architecture the global
        // level rendezvous should immediately yield when
        // waiting for other threads in the pool to arrive.
        1
#else
        0
#endif
        ;
      return 1 == m_pool_size ? 1 :
        Kokkos::Impl::rendezvous( m_pool_scratch + m_pool_rendezvous
                                , m_pool_size
                                , m_pool_rank
                                , yield_wait );
    }

  inline
  void pool_rendezvous_release() const noexcept
    {
      if ( 1 < m_pool_size ) {
        Kokkos::Impl::rendezvous_release( m_pool_scratch + m_pool_rendezvous );
      }
    }

  //----------------------------------------

  constexpr HostThreadTeamData() noexcept
    : m_work_range(-1,-1)
    , m_work_end(0)
    , m_scratch(0)
    , m_pool_scratch(0)
    , m_team_scratch(0)
    , m_pool_rank(0)
    , m_pool_size(1)
    , m_team_reduce(0)
    , m_team_shared(0)
    , m_thread_local(0)
    , m_scratch_size(0)
    , m_team_base(0)
    , m_team_rank(0)
    , m_team_size(1)
    , m_team_alloc(1)
    , m_league_rank(0)
    , m_league_size(1)
    , m_work_chunk(0)
    , m_steal_rank(0)
    , m_team_rendezvous_step(0)
    {}

  //----------------------------------------
  // Organize array of members into a pool.
  // The 0th member is the root of the pool.
  // Requires: members are not already in a pool.
  // Requires: called by one thread.
  // Pool members are ordered as "close" - sorted by NUMA and then CORE
  // Each thread is its own team with team_size == 1.
  static void organize_pool( HostThreadTeamData * members[]
                           , const int size );

  // Called by each thread within the pool
  void disband_pool();

  //----------------------------------------
  // Each thread within a pool organizes itself into a team.
  // Must be called by all threads of the pool.
  // Organizing threads into a team performs a barrier across the
  // entire pool to insure proper initialization of the team
  // rendezvous mechanism before a team rendezvous can be performed.
  //
  // Return true if a valid member of a team.
  // Return false if not a member and thread should be idled.
  int organize_team( const int team_size );

  // Each thread within a pool disbands itself from current team.
  // Each thread becomes its own team with team_size == 1.
  // Must be called by all threads of the pool.
  void disband_team();

  //----------------------------------------

  constexpr int pool_rank() const { return m_pool_rank ; }
  constexpr int pool_size() const { return m_pool_size ; }

  HostThreadTeamData * pool_member( int r ) const noexcept
    {
      return ((HostThreadTeamData**)( m_pool_scratch + m_pool_members ))[ r ];
    }

  //----------------------------------------

private:

  enum : int { mask_to_16 = 0x0f }; // align to 16 bytes
  enum : int { shift_to_8 = 3 };    // size to 8 bytes

public:

  // Round 'n' bytes up to a multiple of 16 then convert to a count of int64_t.
  static constexpr int align_to_int64( int n )
    {
      return ( ( n + mask_to_16 ) & ~mask_to_16 ) >> shift_to_8 ;
    }

  constexpr int pool_reduce_bytes() const
    {
      return m_scratch_size
           ? sizeof(int64_t) * ( m_team_reduce - m_pool_reduce )
           : 0 ;
    }

  constexpr int team_reduce_bytes() const
    { return sizeof(int64_t) * ( m_team_shared - m_team_reduce ); }

  constexpr int team_shared_bytes() const
    { return sizeof(int64_t) * ( m_thread_local - m_team_shared ); }

  constexpr int thread_local_bytes() const
    { return sizeof(int64_t) * ( m_scratch_size - m_thread_local ); }

  constexpr int scratch_bytes() const
    { return sizeof(int64_t) * m_scratch_size ; }

  // Memory chunks:

  int64_t * scratch_buffer() const noexcept
    { return m_scratch ; }

  int64_t * pool_reduce() const noexcept
    { return m_pool_scratch + m_pool_reduce ; }

  int64_t * pool_reduce_local() const noexcept
    { return m_scratch + m_pool_reduce ; }

  int64_t * team_reduce() const noexcept
    { return m_team_scratch + m_team_reduce ; }

  int64_t * team_reduce_local() const noexcept
    { return m_scratch + m_team_reduce ; }

  int64_t * team_shared() const noexcept
    { return m_team_scratch + m_team_shared ; }

  int64_t * local_scratch() const noexcept
    { return m_scratch + m_thread_local ; }

  // Given:
  //   pool_reduce_size  = number bytes for pool reduce
  //   team_reduce_size  = number bytes for team reduce
  //   team_shared_size  = number bytes for team shared memory
  //   thread_local_size = number bytes for thread local memory
  // Return:
  //   total number of bytes that must be allocated
  static size_t scratch_size( int pool_reduce_size
                            , int team_reduce_size
                            , int team_shared_size
                            , int thread_local_size )
    {
      pool_reduce_size  = align_to_int64( pool_reduce_size );
      team_reduce_size  = align_to_int64( team_reduce_size );
      team_shared_size  = align_to_int64( team_shared_size );
      thread_local_size = align_to_int64( thread_local_size );

      const size_t total_bytes =
        ( m_pool_reduce
        + pool_reduce_size
        + team_reduce_size
        + team_shared_size
        + thread_local_size ) * sizeof(int64_t);

      return total_bytes ;
    }

  // Given:
  //   alloc_ptr         = pointer to allocated memory
  //   alloc_size        = number bytes of allocated memory
  //   pool_reduce_size  = number bytes for pool reduce/scan operations
  //   team_reduce_size  = number bytes for team reduce/scan operations
  //   team_shared_size  = number bytes for team-shared memory
  //   thread_local_size = number bytes for thread-local memory
  // Partition the allocation into the offsets documented above.
  void scratch_assign( void * const alloc_ptr
                     , size_t const alloc_size
                     , int pool_reduce_size
                     , int team_reduce_size
                     , int team_shared_size
                     , int /* thread_local_size */ )
    {
      pool_reduce_size = align_to_int64( pool_reduce_size );
      team_reduce_size = align_to_int64( team_reduce_size );
      team_shared_size = align_to_int64( team_shared_size );
      // thread_local_size = align_to_int64( thread_local_size );

      m_scratch      = (int64_t *) alloc_ptr ;
      m_team_reduce  = m_pool_reduce + pool_reduce_size ;
      m_team_shared  = m_team_reduce + team_reduce_size ;
      m_thread_local = m_team_shared + team_shared_size ;
      m_scratch_size = align_to_int64( alloc_size );

#if 0
fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
       , int(m_pool_members)
       , int(m_pool_rendezvous)
       , int(m_pool_reduce)
       , int(m_team_reduce)
       , int(m_team_shared)
       , int(m_thread_local)
       , int(m_scratch_size)
       );
fflush(stdout);
#endif
    }

  //----------------------------------------
  // Get a work index within the range.
  // First try to steal from beginning of own teams's partition.
  // If that fails then try to steal from end of another teams' partition.
  int get_work_stealing() noexcept ;

  //----------------------------------------
  // Set the initial work partitioning of [ 0 .. length ) among the teams
  // with granularity of chunk.
  void set_work_partition( int64_t const length
                         , int     const chunk ) noexcept
    {
      // Minimum chunk size to insure that
      //   m_work_end < std::numeric_limits<int>::max() * m_work_chunk
      int const chunk_min = ( length + std::numeric_limits<int>::max() )
                          / std::numeric_limits<int>::max();

      m_work_end   = length ;
      m_work_chunk = std::max( chunk , chunk_min );

      // Number of work chunks and partitioning of that number:
      int const num  = ( m_work_end + m_work_chunk - 1 ) / m_work_chunk ;
      int const part = ( num + m_league_size - 1 ) / m_league_size ;

      m_work_range.first  = part * m_league_rank ;
      m_work_range.second = m_work_range.first + part ;

      // Steal from next team, round robin.
      // The next team is offset by m_team_alloc if it fits in the pool.
      m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size
                   ? m_team_base + m_team_alloc
                   : 0 ;
    }

  // This team's [begin,end) slice of the iteration space, clipped to the
  // total length.
  std::pair<int64_t,int64_t> get_work_partition() noexcept
    {
      return std::pair<int64_t,int64_t>
        ( m_work_range.first * m_work_chunk
        , m_work_range.second * m_work_chunk < m_work_end
            ? m_work_range.second * m_work_chunk
            : m_work_end
        );
    }

  // Steal one chunk; returns (-1,-1) when no work remains.
  std::pair<int64_t,int64_t> get_work_stealing_chunk() noexcept
    {
      std::pair<int64_t,int64_t> x(-1,-1);

      const int i = get_work_stealing();

      if ( 0 <= i ) {
        x.first  = m_work_chunk * i ;
        x.second = x.first + m_work_chunk < m_work_end
                 ? x.first + m_work_chunk
                 : m_work_end ;
      }

      return x ;
    }
};
//----------------------------------------------------------------------------
template< class HostExecSpace >
class HostThreadTeamMember {
public:

  using scratch_memory_space = typename HostExecSpace::scratch_memory_space ;

private:

  scratch_memory_space m_scratch ;
  HostThreadTeamData & m_data ;
  int const            m_league_rank ;
  int const            m_league_size ;

public:

  // Member of a league of size one.
  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data ) noexcept
    : m_scratch( arg_data.team_shared() , arg_data.team_shared_bytes() )
    , m_data( arg_data )
    , m_league_rank(0)
    , m_league_size(1)
    {}

  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data
                                , int const            arg_league_rank
                                , int const            arg_league_size
                                ) noexcept
    : m_scratch( arg_data.team_shared()
               , arg_data.team_shared_bytes()
               , arg_data.team_shared()
               , arg_data.team_shared_bytes() )
    , m_data( arg_data )
    , m_league_rank( arg_league_rank )
    , m_league_size( arg_league_size )
    {}

  ~HostThreadTeamMember()                                         = default ;
  HostThreadTeamMember()                                          = delete ;
  HostThreadTeamMember( HostThreadTeamMember && )                 = default ;
  HostThreadTeamMember( HostThreadTeamMember const & )            = default ;
  HostThreadTeamMember & operator = ( HostThreadTeamMember && )      = default ;
  HostThreadTeamMember & operator = ( HostThreadTeamMember const & ) = default ;

  //----------------------------------------

  KOKKOS_INLINE_FUNCTION
  int team_rank() const noexcept { return m_data.m_team_rank ; }

  KOKKOS_INLINE_FUNCTION
  int team_size() const noexcept { return m_data.m_team_size ; }

  KOKKOS_INLINE_FUNCTION
  int league_rank() const noexcept { return m_league_rank ; }

  KOKKOS_INLINE_FUNCTION
  int league_size() const noexcept { return m_league_size ; }

  //----------------------------------------

  KOKKOS_INLINE_FUNCTION
  const scratch_memory_space & team_shmem() const
    { return m_scratch.set_team_thread_mode(0,1,0); }

  KOKKOS_INLINE_FUNCTION
  const scratch_memory_space & team_scratch( int ) const
    { return m_scratch.set_team_thread_mode(0,1,0); }

  KOKKOS_INLINE_FUNCTION
  const scratch_memory_space & thread_scratch( int ) const
    { return m_scratch.set_team_thread_mode(0, m_data.m_team_size, m_data.m_team_rank); }

  //--------------------------------------------------------------------------
  // Team collectives
  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION
  void team_barrier() const noexcept
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    {
      if ( m_data.team_rendezvous() ) m_data.team_rendezvous_release();
    }
#else
    {}
#endif

  //--------------------------------------------------------------------------
  // Broadcast 'value' from 'source_team_rank' to every member of the team.

  template< typename T >
  KOKKOS_INLINE_FUNCTION
  void team_broadcast( T & value , const int source_team_rank ) const noexcept
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    {
      if ( 1 < m_data.m_team_size ) {
        T volatile * const shared_value = (T*) m_data.team_reduce();

        // Don't overwrite shared memory until all threads arrive
        if ( m_data.team_rendezvous( source_team_rank ) ) {
          // All threads have entered 'team_rendezvous',
          // only this thread returned from 'team_rendezvous'
          // with a return value of 'true'
          *shared_value = value ;

          m_data.team_rendezvous_release();
          // This thread released all other threads from 'team_rendezvous'
          // with a return value of 'false'
        }
        else {
          value = *shared_value ;
        }
      }
    }
#else
    { Kokkos::abort("HostThreadTeamMember team_broadcast\n"); }
#endif

  //--------------------------------------------------------------------------
  // Apply closure 'f' to 'value' on 'source_team_rank', then broadcast the
  // result to every member of the team.

  template< class Closure , typename T >
  KOKKOS_INLINE_FUNCTION
  void team_broadcast( Closure const & f
                     , T & value
                     , const int source_team_rank ) const noexcept
    {
      T volatile * const shared_value = (T*) m_data.team_reduce();

      // Don't overwrite shared memory until all threads arrive
      if ( m_data.team_rendezvous( source_team_rank ) ) {
        // All threads have entered 'team_rendezvous',
        // only this thread returned from 'team_rendezvous'
        // with a return value of 'true'
        f( value );

        if ( 1 < m_data.m_team_size ) { *shared_value = value ; }

        m_data.team_rendezvous_release();
        // This thread released all other threads from 'team_rendezvous'
        // with a return value of 'false'
      }
      else {
        value = *shared_value ;
      }
    }

  //--------------------------------------------------------------------------
  // team_reduce( Sum(result) );
  // team_reduce( Min(result) );
  // team_reduce( Max(result) );

  template< typename ReducerType >
  KOKKOS_INLINE_FUNCTION
  typename std::enable_if< is_reducer< ReducerType >::value >::type
  team_reduce( ReducerType const & reducer ) const noexcept
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    {
      if ( 1 < m_data.m_team_size ) {

        using value_type = typename ReducerType::value_type ;

        if ( 0 != m_data.m_team_rank ) {
          // Non-root copies to their local buffer:
          *((value_type*) m_data.team_reduce_local()) = reducer.reference();
        }

        // Root does not overwrite shared memory until all threads arrive
        // and copy to their local buffer.
        if ( m_data.team_rendezvous() ) {
          // All threads have entered 'team_rendezvous',
          // only this thread returned from 'team_rendezvous'
          // with a return value of 'true'
          //
          // This thread joins all contributed values:
          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
            value_type * const src =
              (value_type*) m_data.team_member(i)->team_reduce_local();
            reducer.join( reducer.reference() , *src );
          }

          // Copy result to root member's buffer:
          *((value_type*) m_data.team_reduce()) = reducer.reference();

          m_data.team_rendezvous_release();
          // This thread released all other threads from 'team_rendezvous'
          // with a return value of 'false'
        }
        else {
          // Copy from root member's buffer:
          reducer.reference() = *((value_type*) m_data.team_reduce());
        }
      }
    }
#else
    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); }
#endif

  //--------------------------------------------------------------------------
  // Exclusive prefix sum of 'value' across the team; each member's slot in
  // its local reduce buffer is [ exclusive_prefix , contributed_value ].
  // If 'global' is non-null the team total is atomically added to *global
  // and the prior *global value is folded into every member's prefix.

  template< typename T >
  KOKKOS_INLINE_FUNCTION
  T team_scan( T const & value , T * const global = 0 ) const noexcept
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    {
      if ( 0 != m_data.m_team_rank ) {
        // Non-root copies to their local buffer:
        ((T*) m_data.team_reduce_local())[1] = value ;
      }

      // Root does not overwrite shared memory until all threads arrive
      // and copy to their local buffer.
      if ( m_data.team_rendezvous() ) {
        // All threads have entered 'team_rendezvous',
        // only this thread returned from 'team_rendezvous'
        // with a return value of 'true'
        //
        // This thread scans contributed values:
        {
          T * prev = (T*) m_data.team_reduce_local();

          prev[0] = 0 ;
          prev[1] = value ;

          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
            T * const ptr = (T*) m_data.team_member(i)->team_reduce_local();
            ptr[0] = prev[0] + prev[1];
            prev   = ptr ;
          }
        }

        // If adding to global value then atomic_fetch_add to that value
        // and sum previous value to every entry of the scan.
        if ( global ) {
          T * prev = (T*) m_data.team_reduce_local();

          {
            T * ptr = (T*) m_data.team_member( m_data.m_team_size - 1 )
                            ->team_reduce_local();
            prev[0] = Kokkos::atomic_fetch_add( global , ptr[0] + ptr[1] );
          }

          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
            T * ptr = (T*) m_data.team_member(i)->team_reduce_local();
            ptr[0] += prev[0];
          }
        }

        m_data.team_rendezvous_release();
      }

      return ((T*) m_data.team_reduce_local())[0];
    }
#else
    { Kokkos::abort("HostThreadTeamMember team_scan\n"); return T(); }
#endif
};
}}
/* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace
Kokkos
{
// Range [0..count) distributed across the members of 'member's team.
template< class Space , typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
               , iType const & count )
{
  using bounds_type =
    Impl::TeamThreadRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> > ;

  return bounds_type( member , 0 , count );
}
// Range [begin..end) distributed across the members of 'member's team.
// The index type is the common type of the two bound types.
template< class Space , typename iType1 , typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct
  < typename std::common_type< iType1 , iType2 >::type
  , Impl::HostThreadTeamMember<Space> >
TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
               , iType1 const & begin
               , iType2 const & end )
{
  using bounds_type =
    Impl::TeamThreadRangeBoundariesStruct
      < typename std::common_type< iType1 , iType2 >::type
      , Impl::HostThreadTeamMember<Space> > ;

  return bounds_type( member , begin , end );
}
// Vector-level range [0..count) for a single team member.
template< class Space , typename iType >
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
ThreadVectorRange( Impl::HostThreadTeamMember<Space> const & member
                 , const iType & count )
{
  using bounds_type =
    Impl::ThreadVectorRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> > ;

  return bounds_type( member , count );
}
//----------------------------------------------------------------------------
/** \brief Inter-thread parallel_for.
*
* Executes lambda(iType i) for each i=[0..N)
*
* The range [0..N) is mapped to all threads of the the calling thread team.
*/
/** \brief  Inter-thread parallel_for.
 *
 *  Executes closure(iType i) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all threads of the calling thread team.
 */
template< typename iType , class Space , class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( Impl::TeamThreadRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      const & loop_boundaries
  , Closure const & closure
  )
{
  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    closure( idx );
  }
}
/** \brief  Intra-thread vector parallel_for.
 *
 *  Executes closure(iType i) for each i in the vector range.
 */
template< typename iType , class Space , class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( Impl::ThreadVectorRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      const & loop_boundaries
  , Closure const & closure
  )
{
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    closure( idx );
  }
}
//----------------------------------------------------------------------------
// Inter-thread parallel_reduce with an explicit Reducer: each member reduces
// its portion locally, then the team combines via team_reduce.
template< typename iType , class Space , class Closure , class Reducer >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< Reducer >::value >::type
parallel_reduce
  ( Impl::TeamThreadRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      const & loop_boundaries
  , Closure const & closure
  , Reducer const & reducer
  )
{
  reducer.init( reducer.reference() );

  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    closure( idx , reducer.reference() );
  }

  loop_boundaries.thread.team_reduce( reducer );
}
// Inter-thread parallel_reduce of a plain value: summation semantics via a
// Sum reducer wrapped around 'result'.
template< typename iType , class Space , typename Closure , typename ValueType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce
  ( Impl::TeamThreadRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      const & loop_boundaries
  , Closure const & closure
  , ValueType & result
  )
{
  Kokkos::Experimental::Sum<ValueType> reducer( result );

  reducer.init( result );

  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    closure( idx , reducer.reference() );
  }

  loop_boundaries.thread.team_reduce( reducer );
}
/*template< typename iType, class Space
, class Closure, class Joiner , typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
const & loop_boundaries
, Closure const & closure
, Joiner const & joiner
, ValueType & result
)
{
Impl::Reducer< ValueType , Joiner > reducer( joiner , & result );
reducer.init( reducer.data() );
for( iType i = loop_boundaries.start
; i < loop_boundaries.end
; i += loop_boundaries.increment ) {
closure( i , reducer.reference() );
}
loop_boundaries.thread.team_reduce( reducer );
}*/
//----------------------------------------------------------------------------
/** \brief Inter-thread vector parallel_reduce.
*
* Executes lambda(iType i, ValueType & val) for each i=[0..N)
*
* The range [0..N) is mapped to all threads of the
* calling thread team and a summation of val is
* performed and put into result.
*/
/** \brief  Intra-thread vector parallel_reduce.
 *
 *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
 *
 *  The range is serial within the calling thread; 'result' is
 *  value-initialized and accumulated by the lambda.
 */
template< typename iType , class Space , class Lambda , typename ValueType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce
  ( const Impl::ThreadVectorRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      & loop_boundaries
  , const Lambda & lambda
  , ValueType & result
  )
{
  result = ValueType();

  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    lambda( idx , result );
  }
}
// Intra-thread vector parallel_reduce with an explicit Reducer.
// Serial within the calling thread; no team combine is performed.
template< typename iType , class Space , class Lambda , typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce
  ( const Impl::ThreadVectorRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      & loop_boundaries
  , const Lambda & lambda
  , const ReducerType & reducer
  )
{
  reducer.init( reducer.reference() );

  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    lambda( idx , reducer.reference() );
  }
}
/** \brief Intra-thread vector parallel_reduce.
*
* Executes lambda(iType i, ValueType & val) for each i=[0..N)
*
* The range [0..N) is mapped to all vector lanes of the the
* calling thread and a reduction of val is performed using
* JoinType(ValueType& val, const ValueType& update)
* and put into init_result.
* The input value of init_result is used as initializer for
* temporary variables of ValueType. Therefore * the input
* value should be the neutral element with respect to the
* join operation (e.g. '0 for +-' or * '1 for *').
*/
/** \brief  Intra-thread vector parallel_reduce.
 *
 *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all vector lanes of the
 *  calling thread and a reduction of val is performed using
 *  JoinType(ValueType& val, const ValueType& update)
 *  and put into init_result.
 *  The input value of init_result is used as initializer for
 *  temporary variables of ValueType. Therefore the input
 *  value should be the neutral element with respect to the
 *  join operation (e.g. '0 for +-' or '1 for *').
 *
 *  NOTE(review): in this serial host implementation the 'join' argument is
 *  not invoked — the lambda accumulates directly into 'result'.
 */
template< typename iType , class Space
        , class Lambda , class JoinType , typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  ( const Impl::ThreadVectorRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      & loop_boundaries
  , const Lambda & lambda
  , const JoinType & join
  , ValueType & result
  )
{
  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    lambda( idx , result );
  }
}
//----------------------------------------------------------------------------
// Inter-thread parallel_scan: a two-pass inclusive/exclusive scan.
// Pass one accumulates each member's local contribution, team_scan converts
// it to this member's exclusive prefix, pass two emits final values.
template< typename iType , class Space , class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  ( Impl::TeamThreadRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      const & loop_boundaries
  , Closure const & closure
  )
{
  // Extract ValueType from the closure:
  using value_type =
    typename Kokkos::Impl::FunctorAnalysis
      < Kokkos::Impl::FunctorPatternInterface::SCAN
      , void
      , Closure >::value_type ;

  value_type accum = 0 ;

  // Intra-member scan (no final assignment):
  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    closure( idx , accum , false );
  }

  // 'accum' output is the exclusive prefix sum across team members:
  accum = loop_boundaries.thread.team_scan( accum );

  // Final pass emits the scanned values:
  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    closure( idx , accum , true );
  }
}
// Intra-thread vector parallel_scan: serial single-pass scan within the
// calling thread.
template< typename iType , class Space , class ClosureType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  ( Impl::ThreadVectorRangeBoundariesStruct< iType , Impl::HostThreadTeamMember<Space> >
      const & loop_boundaries
  , ClosureType const & closure
  )
{
  using value_type =
    typename Kokkos::Impl::FunctorAnalysis
      < Impl::FunctorPatternInterface::SCAN
      , void
      , ClosureType >::value_type ;

  value_type scan_val = value_type();

#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for ( iType idx = loop_boundaries.start
      ; idx < loop_boundaries.end
      ; idx += loop_boundaries.increment ) {
    closure( idx , scan_val , true );
  }
}
//----------------------------------------------------------------------------
// Tag a member for a once-per-team 'single' region.
template< class Space >
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> >
PerTeam( const Impl::HostThreadTeamMember<Space> & member )
{
  return Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> >( member );
}
// Tag a member for a once-per-thread 'single' region.
template< class Space >
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> >
PerThread( const Impl::HostThreadTeamMember<Space> & member )
{
  return Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> >( member );
}
// Execute 'functor' once per team, on team rank zero.
// Note: 'single' does not perform a barrier.
template< class Space , class FunctorType >
KOKKOS_INLINE_FUNCTION
void single
  ( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single
  , const FunctorType & functor
  )
{
  if ( 0 == single.team_member.team_rank() ) functor();
}
// Execute 'functor' once per team on team rank zero and broadcast the
// resulting 'val' to every team member.
template< class Space , class FunctorType , typename ValueType >
KOKKOS_INLINE_FUNCTION
void single
  ( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single
  , const FunctorType & functor
  , ValueType & val
  )
{
  single.team_member.team_broadcast( functor , val , 0 );
}
// Once-per-thread 'single': on the host every thread is its own vector lane,
// so the functor simply executes.
template< class Space , class FunctorType >
KOKKOS_INLINE_FUNCTION
void single
  ( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > &
  , const FunctorType & functor
  )
{
  functor();
}
// Once-per-thread 'single' with a value: the functor executes directly and
// writes its result into 'val'; no broadcast is needed on the host.
template< class Space , class FunctorType , typename ValueType >
KOKKOS_INLINE_FUNCTION
void single
  ( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > &
  , const FunctorType & functor
  , ValueType & val
  )
{
  functor( val );
}
}
/* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
/* #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP */
Event Timeline
Log In to Comment