Page MenuHomec4science

Explicit.hpp
No OneTemporary

File Metadata

Created
Sat, Nov 9, 06:26

Explicit.hpp

/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef EXPLICIT_DRIVER_HPP
#define EXPLICIT_DRIVER_HPP
#include <sys/time.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <impl/Kokkos_Timer.hpp>
#include <ExplicitFunctors.hpp>
//----------------------------------------------------------------------------
namespace Explicit {
struct PerformanceData {
double mesh_time ;
double init_time ;
double internal_force_time ;
double central_diff ;
double comm_time ;
size_t number_of_steps ;
PerformanceData()
: mesh_time(0)
, init_time(0)
, internal_force_time(0)
, central_diff(0)
, comm_time(0)
, number_of_steps(0)
{}
void best( const PerformanceData & rhs )
{
if ( rhs.mesh_time < mesh_time ) mesh_time = rhs.mesh_time ;
if ( rhs.init_time < init_time ) init_time = rhs.init_time ;
if ( rhs.internal_force_time < internal_force_time ) internal_force_time = rhs.internal_force_time ;
if ( rhs.central_diff < central_diff ) central_diff = rhs.central_diff ;
if ( rhs.comm_time < comm_time ) comm_time = rhs.comm_time ;
}
};
template< typename Scalar , class FixtureType >
PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
const int global_max_x ,
const int global_max_y ,
const int global_max_z ,
const int steps ,
const int print_sample )
{
typedef Scalar scalar_type ;
typedef FixtureType fixture_type ;
typedef typename fixture_type::execution_space execution_space ;
//typedef typename fixture_type::FEMeshType mesh_type ; // unused
enum { ElementNodeCount = fixture_type::element_node_count };
const int NumStates = 2;
const int total_num_steps = steps ;
const Scalar user_dt = 5.0e-6;
//const Scalar end_time = 0.0050;
// element block parameters
const Scalar lin_bulk_visc = 0.0;
const Scalar quad_bulk_visc = 0.0;
// const Scalar lin_bulk_visc = 0.06;
// const Scalar quad_bulk_visc = 1.2;
// const Scalar hg_stiffness = 0.0;
// const Scalar hg_viscosity = 0.0;
// const Scalar hg_stiffness = 0.03;
// const Scalar hg_viscosity = 0.001;
// material properties
const Scalar youngs_modulus=1.0e6;
const Scalar poissons_ratio=0.0;
const Scalar density = 8.0e-4;
const comm::Machine machine = mesh.parallel_data_map.machine ;
PerformanceData perf_data ;
Kokkos::Timer wall_clock ;
//------------------------------------
// Generate fields
typedef Fields< scalar_type , execution_space > fields_type ;
fields_type mesh_fields( mesh ,
lin_bulk_visc ,
quad_bulk_visc ,
youngs_modulus ,
poissons_ratio ,
density );
typename fields_type::node_coords_type::HostMirror
model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
typename fields_type::geom_state_array_type::HostMirror
displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
typename fields_type::geom_state_array_type::HostMirror
velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
//------------------------------------
// Initialization
initialize_element<Scalar,execution_space>::apply( mesh_fields );
initialize_node< Scalar,execution_space>::apply( mesh_fields );
const Scalar x_bc = global_max_x ;
// Initial condition on velocity to initiate a pulse along the X axis
{
const unsigned X = 0;
for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
if ( model_coords_h(inode,X) == 0) {
velocity_h(inode,X,0) = 1.0e3;
velocity_h(inode,X,1) = 1.0e3;
}
}
}
Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
//--------------------------------------------------------------------------
// We will call a sequence of functions. These functions have been
// grouped into several functors to balance the number of global memory
// accesses versus requiring too many registers or too much L1 cache.
// Global memory accees have read/write cost and memory subsystem contention cost.
//--------------------------------------------------------------------------
perf_data.init_time = comm::max( machine , wall_clock.seconds() );
// Parameters required for the internal force computations.
int current_state = 0;
int previous_state = 0;
int next_state = 0;
perf_data.number_of_steps = total_num_steps ;
#if defined( KOKKOS_HAVE_MPI )
typedef typename
fields_type::geom_state_array_type::value_type comm_value_type ;
const unsigned comm_value_count = 6 ;
Kokkos::AsyncExchange< comm_value_type , execution_space ,
Kokkos::ParallelDataMap >
comm_exchange( mesh.parallel_data_map , comm_value_count );
#endif
for (int step = 0; step < total_num_steps; ++step) {
wall_clock.reset();
//------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_MPI )
{
// Communicate "send" nodes' displacement and velocity next_state
// to the ghosted nodes.
// buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
pack_state< Scalar , execution_space >
::apply( comm_exchange.buffer() ,
mesh.parallel_data_map.count_interior ,
mesh.parallel_data_map.count_send ,
mesh_fields , next_state );
comm_exchange.setup();
comm_exchange.send_receive();
unpack_state< Scalar , execution_space >
::apply( mesh_fields , next_state ,
comm_exchange.buffer() ,
mesh.parallel_data_map.count_owned ,
mesh.parallel_data_map.count_receive );
execution_space::fence();
}
#endif
perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
//------------------------------------------------------------------------
// rotate the states
previous_state = current_state;
current_state = next_state;
++next_state;
next_state %= NumStates;
wall_clock.reset();
// First kernel 'grad_hgop' combines two functions:
// gradient, velocity gradient
grad< Scalar , execution_space >::apply( mesh_fields ,
current_state ,
previous_state );
// Combine tensor decomposition and rotation functions.
decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
current_state ,
previous_state );
internal_force< Scalar , execution_space >::apply( mesh_fields ,
user_dt ,
current_state );
execution_space::fence();
perf_data.internal_force_time +=
comm::max( machine , wall_clock.seconds() );
wall_clock.reset();
// Assembly of elements' contributions to nodal force into
// a nodal force vector. Update the accelerations, velocities,
// displacements.
// The same pattern can be used for matrix-free residual computations.
nodal_step< Scalar , execution_space >::apply( mesh_fields ,
x_bc ,
current_state,
next_state );
execution_space::fence();
perf_data.central_diff +=
comm::max( machine , wall_clock.seconds() );
if ( print_sample && 0 == step % 100 ) {
Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
Kokkos::deep_copy( velocity_h , mesh_fields.velocity );
if ( 1 == print_sample ) {
std::cout << "step " << step
<< " : displacement(*,0,0) =" ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
std::cout << " " << displacement_h(i,0,next_state);
}
}
std::cout << std::endl ;
const float tol = 1.0e-6 ;
const int yb = global_max_y ;
const int zb = global_max_z ;
std::cout << "step " << step
<< " : displacement(*," << yb << "," << zb << ") =" ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( fabs( model_coords_h(i,1) - yb ) < tol &&
fabs( model_coords_h(i,2) - zb ) < tol ) {
std::cout << " " << displacement_h(i,0,next_state);
}
}
std::cout << std::endl ;
}
else if ( 2 == print_sample ) {
const float tol = 1.0e-6 ;
const int xb = global_max_x / 2 ;
const int yb = global_max_y / 2 ;
const int zb = global_max_z / 2 ;
for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
if ( fabs( model_coords_h(i,0) - xb ) < tol &&
fabs( model_coords_h(i,1) - yb ) < tol &&
fabs( model_coords_h(i,2) - zb ) < tol ) {
std::cout << "step " << step
<< " : displacement("
<< xb << "," << yb << "," << zb << ") = {"
<< std::setprecision(6)
<< " " << displacement_h(i,0,next_state)
<< std::setprecision(2)
<< " " << displacement_h(i,1,next_state)
<< std::setprecision(2)
<< " " << displacement_h(i,2,next_state)
<< " }" << std::endl ;
}
}
}
}
}
return perf_data ;
}
template <typename Scalar, typename Device>
static void driver( const char * const label ,
comm::Machine machine ,
const int gang_count ,
const int elem_count_beg ,
const int elem_count_end ,
const int runs )
{
typedef Scalar scalar_type ;
typedef Device execution_space ;
typedef double coordinate_scalar_type ;
typedef FixtureElementHex8 fixture_element_type ;
typedef BoxMeshFixture< coordinate_scalar_type ,
execution_space ,
fixture_element_type > fixture_type ;
typedef typename fixture_type::FEMeshType mesh_type ;
const size_t proc_count = comm::size( machine );
const size_t proc_rank = comm::rank( machine );
const int space = 15 ;
const int steps = 1000 ;
const int print_sample = 0 ;
if ( comm::rank( machine ) == 0 ) {
std::cout << std::endl ;
std::cout << "\"MiniExplicitDynamics with Kokkos " << label
<< " time_steps(" << steps << ")"
<< "\"" << std::endl;
std::cout << std::left << std::setw(space) << "\"Element\" , ";
std::cout << std::left << std::setw(space) << "\"Node\" , ";
std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
std::cout << std::left << std::setw(space) << "\"Time/Node\"";
std::cout << std::endl;
std::cout << std::left << std::setw(space) << "\"count\" , ";
std::cout << std::left << std::setw(space) << "\"count\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\" , ";
std::cout << std::left << std::setw(space) << "\"microsec\"";
std::cout << std::endl;
}
for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
{
const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
const int iy = iz + 1 ;
const int ix = 2 * iy ;
const int nelem = ix * iy * iz ;
const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
mesh_type mesh =
fixture_type::create( proc_count , proc_rank , gang_count ,
ix , iy , iz );
mesh.parallel_data_map.machine = machine ;
PerformanceData perf , best ;
for(int j = 0; j < runs; j++){
perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
if( j == 0 ) {
best = perf ;
}
else {
best.best( perf );
}
}
if ( comm::rank( machine ) == 0 ) {
double time_per_element =
( best.internal_force_time ) / ( nelem * perf.number_of_steps );
double time_per_node =
( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );
std::cout << std::setw(space-3) << nelem << " , "
<< std::setw(space-3) << nnode << " , "
<< std::setw(space-3) << best.number_of_steps << " , "
<< std::setw(space-3) << best.init_time * 1000000 << " , "
<< std::setw(space-3)
<< ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3)
<< ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3)
<< ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
<< std::setw(space-3) << time_per_element * 1000000 << " , "
<< std::setw(space-3) << time_per_node * 1000000
<< std::endl ;
}
}
}
} // namespace Explicit
#endif /* #ifndef EXPLICIT_DRIVER_HPP */

Event Timeline