Explicit.hpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Dec 2, 14:56

Explicit.hpp
View Options

	/*
	//@HEADER
	// ************************************************************************
	//
	// Kokkos v. 2.0
	// Copyright (2014) Sandia Corporation
	//
	// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
	// the U.S. Government retains certain rights in this software.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// 1. Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	//
	// 2. Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	//
	// 3. Neither the name of the Corporation nor the names of the
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
	// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
	// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
	// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	//
	// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
	//
	// ************************************************************************
	//@HEADER
	*/

	#ifndef EXPLICIT_DRIVER_HPP
	#define EXPLICIT_DRIVER_HPP

	#include <sys/time.h>
	#include <iostream>
	#include <iomanip>
	#include <cstdlib>
	#include <cmath>

	#include <impl/Kokkos_Timer.hpp>

	#include <ExplicitFunctors.hpp>

	//----------------------------------------------------------------------------

	namespace Explicit {

	struct PerformanceData {
	double mesh_time ;
	double init_time ;
	double internal_force_time ;
	double central_diff ;
	double comm_time ;
	size_t number_of_steps ;

	PerformanceData()
	: mesh_time(0)
	, init_time(0)
	, internal_force_time(0)
	, central_diff(0)
	, comm_time(0)
	, number_of_steps(0)
	{}

	void best( const PerformanceData & rhs )
	{
	if ( rhs.mesh_time < mesh_time ) mesh_time = rhs.mesh_time ;
	if ( rhs.init_time < init_time ) init_time = rhs.init_time ;
	if ( rhs.internal_force_time < internal_force_time ) internal_force_time = rhs.internal_force_time ;
	if ( rhs.central_diff < central_diff ) central_diff = rhs.central_diff ;
	if ( rhs.comm_time < comm_time ) comm_time = rhs.comm_time ;
	}
	};

	template< typename Scalar , class FixtureType >
	PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
	const int global_max_x ,
	const int global_max_y ,
	const int global_max_z ,
	const int steps ,
	const int print_sample )
	{
	typedef Scalar scalar_type ;
	typedef FixtureType fixture_type ;
	typedef typename fixture_type::execution_space execution_space ;
	//typedef typename fixture_type::FEMeshType mesh_type ; // unused

	enum { ElementNodeCount = fixture_type::element_node_count };

	const int NumStates = 2;

	const int total_num_steps = steps ;

	const Scalar user_dt = 5.0e-6;
	//const Scalar end_time = 0.0050;

	// element block parameters
	const Scalar lin_bulk_visc = 0.0;
	const Scalar quad_bulk_visc = 0.0;

	// const Scalar lin_bulk_visc = 0.06;
	// const Scalar quad_bulk_visc = 1.2;
	// const Scalar hg_stiffness = 0.0;
	// const Scalar hg_viscosity = 0.0;
	// const Scalar hg_stiffness = 0.03;
	// const Scalar hg_viscosity = 0.001;

	// material properties
	const Scalar youngs_modulus=1.0e6;
	const Scalar poissons_ratio=0.0;
	const Scalar density = 8.0e-4;

	const comm::Machine machine = mesh.parallel_data_map.machine ;

	PerformanceData perf_data ;

	Kokkos::Timer wall_clock ;

	//------------------------------------
	// Generate fields

	typedef Fields< scalar_type , execution_space > fields_type ;

	fields_type mesh_fields( mesh ,
	lin_bulk_visc ,
	quad_bulk_visc ,
	youngs_modulus ,
	poissons_ratio ,
	density );

	typename fields_type::node_coords_type::HostMirror
	model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );

	typename fields_type::geom_state_array_type::HostMirror
	displacement_h = Kokkos::create_mirror( mesh_fields.displacement );

	typename fields_type::geom_state_array_type::HostMirror
	velocity_h = Kokkos::create_mirror( mesh_fields.velocity );

	Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );

	//------------------------------------
	// Initialization

	initialize_element<Scalar,execution_space>::apply( mesh_fields );
	initialize_node< Scalar,execution_space>::apply( mesh_fields );

	const Scalar x_bc = global_max_x ;

	// Initial condition on velocity to initiate a pulse along the X axis
	{
	const unsigned X = 0;
	for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
	if ( model_coords_h(inode,X) == 0) {
	velocity_h(inode,X,0) = 1.0e3;
	velocity_h(inode,X,1) = 1.0e3;
	}
	}
	}

	Kokkos::deep_copy( mesh_fields.velocity , velocity_h );

	//--------------------------------------------------------------------------
	// We will call a sequence of functions. These functions have been
	// grouped into several functors to balance the number of global memory
	// accesses versus requiring too many registers or too much L1 cache.
	// Global memory accees have read/write cost and memory subsystem contention cost.
	//--------------------------------------------------------------------------

	perf_data.init_time = comm::max( machine , wall_clock.seconds() );

	// Parameters required for the internal force computations.

	int current_state = 0;
	int previous_state = 0;
	int next_state = 0;

	perf_data.number_of_steps = total_num_steps ;

	#if defined( KOKKOS_HAVE_MPI )

	typedef typename
	fields_type::geom_state_array_type::value_type comm_value_type ;

	const unsigned comm_value_count = 6 ;

	Kokkos::AsyncExchange< comm_value_type , execution_space ,
	Kokkos::ParallelDataMap >
	comm_exchange( mesh.parallel_data_map , comm_value_count );

	#endif

	for (int step = 0; step < total_num_steps; ++step) {

	wall_clock.reset();

	//------------------------------------------------------------------------
	#if defined( KOKKOS_HAVE_MPI )
	{
	// Communicate "send" nodes' displacement and velocity next_state
	// to the ghosted nodes.
	// buffer packages: { { dx , dy , dz , vx , vy , vz }_node }

	pack_state< Scalar , execution_space >
	::apply( comm_exchange.buffer() ,
	mesh.parallel_data_map.count_interior ,
	mesh.parallel_data_map.count_send ,
	mesh_fields , next_state );

	comm_exchange.setup();

	comm_exchange.send_receive();

	unpack_state< Scalar , execution_space >
	::apply( mesh_fields , next_state ,
	comm_exchange.buffer() ,
	mesh.parallel_data_map.count_owned ,
	mesh.parallel_data_map.count_receive );

	execution_space::fence();
	}
	#endif

	perf_data.comm_time += comm::max( machine , wall_clock.seconds() );

	//------------------------------------------------------------------------
	// rotate the states

	previous_state = current_state;
	current_state = next_state;
	++next_state;
	next_state %= NumStates;

	wall_clock.reset();

	// First kernel 'grad_hgop' combines two functions:
	// gradient, velocity gradient
	grad< Scalar , execution_space >::apply( mesh_fields ,
	current_state ,
	previous_state );

	// Combine tensor decomposition and rotation functions.
	decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
	current_state ,
	previous_state );

	internal_force< Scalar , execution_space >::apply( mesh_fields ,
	user_dt ,
	current_state );

	execution_space::fence();

	perf_data.internal_force_time +=
	comm::max( machine , wall_clock.seconds() );

	wall_clock.reset();

	// Assembly of elements' contributions to nodal force into
	// a nodal force vector. Update the accelerations, velocities,
	// displacements.
	// The same pattern can be used for matrix-free residual computations.
	nodal_step< Scalar , execution_space >::apply( mesh_fields ,
	x_bc ,
	current_state,
	next_state );
	execution_space::fence();

	perf_data.central_diff +=
	comm::max( machine , wall_clock.seconds() );

	if ( print_sample && 0 == step % 100 ) {
	Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
	Kokkos::deep_copy( velocity_h , mesh_fields.velocity );

	if ( 1 == print_sample ) {

	std::cout << "step " << step
	<< " : displacement(*,0,0) =" ;
	for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
	if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
	std::cout << " " << displacement_h(i,0,next_state);
	}
	}
	std::cout << std::endl ;

	const float tol = 1.0e-6 ;
	const int yb = global_max_y ;
	const int zb = global_max_z ;
	std::cout << "step " << step
	<< " : displacement(*," << yb << "," << zb << ") =" ;
	for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
	if ( fabs( model_coords_h(i,1) - yb ) < tol &&
	fabs( model_coords_h(i,2) - zb ) < tol ) {
	std::cout << " " << displacement_h(i,0,next_state);
	}
	}
	std::cout << std::endl ;
	}
	else if ( 2 == print_sample ) {

	const float tol = 1.0e-6 ;
	const int xb = global_max_x / 2 ;
	const int yb = global_max_y / 2 ;
	const int zb = global_max_z / 2 ;

	for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
	if ( fabs( model_coords_h(i,0) - xb ) < tol &&
	fabs( model_coords_h(i,1) - yb ) < tol &&
	fabs( model_coords_h(i,2) - zb ) < tol ) {
	std::cout << "step " << step
	<< " : displacement("
	<< xb << "," << yb << "," << zb << ") = {"
	<< std::setprecision(6)
	<< " " << displacement_h(i,0,next_state)
	<< std::setprecision(2)
	<< " " << displacement_h(i,1,next_state)
	<< std::setprecision(2)
	<< " " << displacement_h(i,2,next_state)
	<< " }" << std::endl ;
	}
	}
	}
	}
	}

	return perf_data ;
	}


	template <typename Scalar, typename Device>
	static void driver( const char * const label ,
	comm::Machine machine ,
	const int gang_count ,
	const int elem_count_beg ,
	const int elem_count_end ,
	const int runs )
	{
	typedef Scalar scalar_type ;
	typedef Device execution_space ;
	typedef double coordinate_scalar_type ;
	typedef FixtureElementHex8 fixture_element_type ;

	typedef BoxMeshFixture< coordinate_scalar_type ,
	execution_space ,
	fixture_element_type > fixture_type ;

	typedef typename fixture_type::FEMeshType mesh_type ;

	const size_t proc_count = comm::size( machine );
	const size_t proc_rank = comm::rank( machine );

	const int space = 15 ;
	const int steps = 1000 ;
	const int print_sample = 0 ;

	if ( comm::rank( machine ) == 0 ) {

	std::cout << std::endl ;
	std::cout << "\"MiniExplicitDynamics with Kokkos " << label
	<< " time_steps(" << steps << ")"
	<< "\"" << std::endl;
	std::cout << std::left << std::setw(space) << "\"Element\" , ";
	std::cout << std::left << std::setw(space) << "\"Node\" , ";
	std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
	std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
	std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
	std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
	std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
	std::cout << std::left << std::setw(space) << "\"Time/Node\"";

	std::cout << std::endl;

	std::cout << std::left << std::setw(space) << "\"count\" , ";
	std::cout << std::left << std::setw(space) << "\"count\" , ";
	std::cout << std::left << std::setw(space) << "\"microsec\" , ";
	std::cout << std::left << std::setw(space) << "\"microsec\" , ";
	std::cout << std::left << std::setw(space) << "\"microsec\" , ";
	std::cout << std::left << std::setw(space) << "\"microsec\" , ";
	std::cout << std::left << std::setw(space) << "\"microsec\" , ";
	std::cout << std::left << std::setw(space) << "\"microsec\"";

	std::cout << std::endl;
	}

	for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
	{
	const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
	const int iy = iz + 1 ;
	const int ix = 2 * iy ;
	const int nelem = ix * iy * iz ;
	const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );

	mesh_type mesh =
	fixture_type::create( proc_count , proc_rank , gang_count ,
	ix , iy , iz );

	mesh.parallel_data_map.machine = machine ;

	PerformanceData perf , best ;

	for(int j = 0; j < runs; j++){

	perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);

	if( j == 0 ) {
	best = perf ;
	}
	else {
	best.best( perf );
	}
	}

	if ( comm::rank( machine ) == 0 ) {
	double time_per_element =
	( best.internal_force_time ) / ( nelem * perf.number_of_steps );
	double time_per_node =
	( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );

	std::cout << std::setw(space-3) << nelem << " , "
	<< std::setw(space-3) << nnode << " , "
	<< std::setw(space-3) << best.number_of_steps << " , "
	<< std::setw(space-3) << best.init_time * 1000000 << " , "
	<< std::setw(space-3)
	<< ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
	<< std::setw(space-3)
	<< ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
	<< std::setw(space-3)
	<< ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
	<< std::setw(space-3) << time_per_element * 1000000 << " , "
	<< std::setw(space-3) << time_per_node * 1000000
	<< std::endl ;
	}
	}
	}


	} // namespace Explicit

	#endif /* #ifndef EXPLICIT_DRIVER_HPP */

Explicit.hppNo OneTemporaryActions

File Metadata

Explicit.hppView Options

Event Timeline

Explicit.hpp
No OneTemporary
Actions

Explicit.hpp
View Options