Page MenuHomec4science

data_layouts.cpp
No OneTemporary

File Metadata

Created
Fri, Sep 13, 00:51

data_layouts.cpp

/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <cstdio>
// These two View types are both 2-D arrays of double. However, they
// have different layouts in memory. left_type has "layout left,"
// which means "column major," the same as in Fortran, the BLAS, or
// LAPACK. right_type has "layout right," which means "row major,"
// the same as in C, C++, or Java.
typedef Kokkos::View<double**, Kokkos::LayoutLeft> left_type;
typedef Kokkos::View<double**, Kokkos::LayoutRight> right_type;
// This is a one-dimensional View, so the layout matters less.
// However, it still has a layout! Since its layout is not specified
// explicitly in the type, its layout is a function of the memory
// space. For example, the default Cuda layout is LayoutLeft, and the
// default Host layout is LayoutRight.
typedef Kokkos::View<double*> view_type;
// parallel_for functor that fills the given View with some data. It
// expects to access the View by rows in parallel: each call i of
// operator() accesses a row.
template<class ViewType>
struct init_view {
ViewType a;
init_view (ViewType a_) : a (a_) {}
KOKKOS_INLINE_FUNCTION
void operator() (const typename ViewType::size_type i) const {
// On CPUs this loop could be vectorized so j should do stride 1
// access on a for optimal performance. I.e. a should be LayoutRight.
// On GPUs threads should do coalesced loads and stores. That means
// that i should be the stride one access for optimal performance.
for (typename ViewType::size_type j = 0; j < a.dimension_1 (); ++j) {
a(i,j) = 1.0*a.dimension_0()*i + 1.0*j;
}
}
};
// Compute a contraction of v1 and v2 into a:
//
// a(i) := sum_j (v1(i,j) * v2(j,i))
//
// Since the functor is templated on the ViewTypes itself it doesn't matter what
// there layouts are. That means you can use different layouts on different
// architectures.
template<class ViewType1, class ViewType2>
struct contraction {
view_type a;
typename ViewType1::const_type v1;
typename ViewType2::const_type v2;
contraction (view_type a_, ViewType1 v1_, ViewType2 v2_) :
a (a_), v1 (v1_), v2 (v2_)
{}
// As with the initialization functor the performance of this operator
// depends on the architecture and the chosen data layouts.
// On CPUs optimal would be to vectorize the inner loop, so j should be the
// stride 1 access. That means v1 should be LayoutRight and v2 LayoutLeft.
// In order to get coalesced access on GPUs where i corresponds closely to
// the thread Index, i must be the stride 1 dimension. That means v1 should be
// LayoutLeft and v2 LayoutRight.
KOKKOS_INLINE_FUNCTION
void operator() (const view_type::size_type i) const {
for (view_type::size_type j = 0; j < v1.dimension_1 (); ++j) {
a(i) = v1(i,j)*v2(j,i);
}
}
};
// Compute a dot product. This is used for result verification.
struct dot {
view_type a;
dot (view_type a_) : a (a_) {}
typedef double value_type; //Specify type for reduction target, lsum
KOKKOS_INLINE_FUNCTION
void operator() (const view_type::size_type i, double &lsum) const {
lsum += a(i)*a(i);
}
};
int main (int narg, char* arg[]) {
// When initializing Kokkos, you may pass in command-line arguments,
// just like with MPI_Init(). Kokkos reserves the right to remove
// arguments from the list that start with '--kokkos-'.
Kokkos::initialize (narg, arg);
int size = 10000;
view_type a("A",size);
// Define two views with LayoutLeft and LayoutRight.
left_type l("L",size,10000);
right_type r("R",size,10000);
// Initialize the data in the views.
Kokkos::parallel_for(size,init_view<left_type>(l));
Kokkos::parallel_for(size,init_view<right_type>(r));
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutLeft view for v1 and a LayoutRight view for v2. This should be
// fast on GPUs and slow on CPUs
Kokkos::Impl::Timer time1;
Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
Kokkos::fence();
double sec1 = time1.seconds();
double sum1 = 0;
Kokkos::parallel_reduce(size,dot(a),sum1);
Kokkos::fence();
// Measure time to execute the contraction kernel when giving it a
// LayoutRight view for v1 and a LayoutLeft view for v2. This should be
// fast on CPUs and slow on GPUs
Kokkos::Impl::Timer time2;
Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
Kokkos::fence();
double sec2 = time2.seconds();
double sum2 = 0;
Kokkos::parallel_reduce(size,dot(a),sum2);
// Kokkos' reductions are deterministic.
// The results should always be equal.
printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
Kokkos::finalize();
}

Event Timeline