#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <mpi.h>
#include "allvars.h"
#include "proto.h"
/*! \file domain.c
* \brief code for domain decomposition
*
* This file contains the code for the domain decomposition of the
* simulation volume. The domains are constructed from disjoint subsets
* of the leaves of a fiducial top-level tree that covers the full
* simulation volume. Domain boundaries hence run along tree-node
* divisions of a fiducial global BH tree. As a result of this method, the
 * tree forces are in principle strictly independent of the way the domains
* are cut. The domain decomposition can be carried out for an arbitrary
* number of CPUs. Individual domains are not cubical, but spatially
* coherent since the leaves are traversed in a Peano-Hilbert order and
* individual domains form segments along this order. This also ensures
* that each domain has a small surface to volume ratio, which minimizes
* communication.
*/
#define TOPNODEFACTOR 20.0
#define REDUC_FAC 0.98
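
/* TOPNODEFACTOR sets how finely the top-level tree is refined relative to
 * the number of tasks (see domain_topsplit() and domain_topsplit_local()
 * below): larger values give more, smaller top-level leaves and hence a
 * finer-grained domain split. REDUC_FAC keeps roughly 2% headroom below
 * the All.MaxPart/All.MaxPartSph storage limits when 'maxload' and
 * 'maxloadsph' are computed, so that no task is filled right up to
 * capacity during the particle exchange.
 */
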
/*! toGo[task*NTask + partner] gives the number of particles in task 'task'
* that have to go to task 'partner'
*/
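/* The toGo matrices are stored row-major; e.g. for NTask=3, the count of
 * particles moving from task 1 to task 2 sits at index 1*3+2=5. toGoSph
 * (and toGoSt, when stars carry separate data) hold the subset of these
 * counts referring to SPH (star) particles, so the plain toGo entries
 * always bound them from above.
 */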
static int *toGo, *toGoSph;
static int *local_toGo, *local_toGoSph;
static int *list_NumPart;
static int *list_N_gas;
static int *list_load;
static int *list_loadsph;
static double *list_work;

#ifdef STELLAR_PROP
static int *toGoSt;
static int *local_toGoSt;
static int *list_N_stars;
#endif

static long long maxload, maxloadsph;

static struct topnode_exchange
{
  peanokey Startkey;
  int Count;
}
 *toplist, *toplist_local;
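
/* The list_* arrays mirror one scalar per task: list_NumPart[i] and
 * list_N_gas[i] track the (evolving) particle and gas counts of task i
 * during the exchange, list_load[i]/list_loadsph[i] hold the particle
 * load assigned to task i by the split, and list_work[i] its estimated
 * gravity work. All of them are gathered or updated consistently on
 * every task, so the exchange can be scheduled without extra
 * communication.
 */
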
/*! This is the main routine for the domain decomposition. It acts as a
* driver routine that allocates various temporary buffers, maps the
* particles back onto the periodic box if needed, and then does the
* domain decomposition, and a final Peano-Hilbert order of all particles
* as a tuning measure.
*/
void domain_Decomposition(void)
{
  double t0, t1;
  int i;

#ifdef PMGRID
  if(All.PM_Ti_endstep == All.Ti_Current)
    {
      All.NumForcesSinceLastDomainDecomp = 1 + All.TotNumPart * All.TreeDomainUpdateFrequency;
      /* to make sure that we do a domain decomposition before the PM-force is evaluated.
         this is needed to make sure that the particles are wrapped into the box */
    }
#endif

  /* Check whether it is really time for a new domain decomposition */
  if(All.NumForcesSinceLastDomainDecomp > All.TotNumPart * All.TreeDomainUpdateFrequency)
    {
      t0 = second();

#ifdef PERIODIC
      do_box_wrapping();   /* map the particles back onto the box */
#endif

      All.NumForcesSinceLastDomainDecomp = 0;

#ifdef SFR
      rearrange_particle_sequence();
#endif

      TreeReconstructFlag = 1;   /* ensures that new tree will be constructed */

      if(ThisTask == 0)
        {
          printf("start domain decomposition...\n");
          fflush(stdout);
        }

      Key = malloc(sizeof(peanokey) * All.MaxPart);
      KeySorted = malloc(sizeof(peanokey) * All.MaxPart);

      toGo = malloc(sizeof(int) * NTask * NTask);
      toGoSph = malloc(sizeof(int) * NTask * NTask);
      local_toGo = malloc(sizeof(int) * NTask);
      local_toGoSph = malloc(sizeof(int) * NTask);
      list_NumPart = malloc(sizeof(int) * NTask);
      list_N_gas = malloc(sizeof(int) * NTask);
      list_load = malloc(sizeof(int) * NTask);
      list_loadsph = malloc(sizeof(int) * NTask);
      list_work = malloc(sizeof(double) * NTask);

      MPI_Allgather(&NumPart, 1, MPI_INT, list_NumPart, 1, MPI_INT, MPI_COMM_WORLD);
      MPI_Allgather(&N_gas, 1, MPI_INT, list_N_gas, 1, MPI_INT, MPI_COMM_WORLD);

#ifdef STELLAR_PROP
      toGoSt = malloc(sizeof(int) * NTask * NTask);
      local_toGoSt = malloc(sizeof(int) * NTask);
      list_N_stars = malloc(sizeof(int) * NTask);
      MPI_Allgather(&N_stars, 1, MPI_INT, list_N_stars, 1, MPI_INT, MPI_COMM_WORLD);
#endif

      maxload = All.MaxPart * REDUC_FAC;
      maxloadsph = All.MaxPartSph * REDUC_FAC;
#ifdef STELLAR_PROP
#ifdef CHECK_ID_CORRESPONDENCE
      if(ThisTask == 0)
        printf("Check id correspondence before decomposition...\n");

      rearrange_particle_sequence();

      for(i = N_gas; i < N_gas + N_stars; i++)
        {
          if(StP[P[i].StPIdx].PIdx != i)
            {
              printf("\nP/StP correspondence error\n");
              printf("(%d) (in domain before) N_stars=%d N_gas=%d i=%d id=%d P[i].StPIdx=%d StP[P[i].StPIdx].PIdx=%d\n\n",
                     ThisTask, N_stars, N_gas, i, P[i].ID, P[i].StPIdx, StP[P[i].StPIdx].PIdx);
              endrun(888001);
            }

          if(StP[P[i].StPIdx].ID != P[i].ID)
            {
              printf("\nP/StP correspondence error\n");
              printf("(%d) (in domain before) N_gas=%d N_stars=%d i=%d Type=%d P.Id=%d P[i].StPIdx=%d StP[P[i].StPIdx].ID=%d\n\n",
                     ThisTask, N_gas, N_stars, i, P[i].Type, P[i].ID, P[i].StPIdx, StP[P[i].StPIdx].ID);
              endrun(888002);
            }
        }

      if(ThisTask == 0)
        printf("Check id correspondence before decomposition done...\n");
#endif
#endif
      domain_decompose();

      free(list_work);
      free(list_loadsph);
      free(list_load);
      free(list_N_gas);
      free(list_NumPart);
      free(local_toGoSph);
      free(local_toGo);
      free(toGoSph);
      free(toGo);

#ifdef STELLAR_PROP
      free(list_N_stars);
      free(local_toGoSt);
      free(toGoSt);
#endif

#ifdef STELLAR_PROP
#ifdef CHECK_BLOCK_ORDER
      int typenow;

      typenow = 0;
      for(i = 0; i < NumPart; i++)
        {
          if((P[i].Type < typenow) && (typenow <= ST))
            {
              printf("\nBlock order error\n");
              printf("(%d) i=%d Type=%d typenow=%d\n\n", ThisTask, i, P[i].Type, typenow);
              endrun(888003);
            }
          typenow = P[i].Type;
        }
#endif
#endif

#ifdef STELLAR_PROP
#ifdef CHECK_ID_CORRESPONDENCE
      if(ThisTask == 0)
        printf("Check id correspondence after decomposition...\n");

      rearrange_particle_sequence();

      for(i = N_gas; i < N_gas + N_stars; i++)
        {
          if(StP[P[i].StPIdx].PIdx != i)
            {
              printf("\nP/StP correspondence error\n");
              printf("(%d) (in domain after) N_stars=%d N_gas=%d i=%d id=%d P[i].StPIdx=%d StP[P[i].StPIdx].PIdx=%d\n\n",
                     ThisTask, N_stars, N_gas, i, P[i].ID, P[i].StPIdx, StP[P[i].StPIdx].PIdx);
              endrun(888004);
            }

          if(StP[P[i].StPIdx].ID != P[i].ID)
            {
              printf("\nP/StP correspondence error\n");
              printf("(%d) (in domain after) N_gas=%d N_stars=%d i=%d Type=%d P.Id=%d P[i].StPIdx=%d StP[P[i].StPIdx].ID=%d\n\n",
                     ThisTask, N_gas, N_stars, i, P[i].Type, P[i].ID, P[i].StPIdx, StP[P[i].StPIdx].ID);
              endrun(888005);
            }
        }

      if(ThisTask == 0)
        printf("Check id correspondence after decomposition done...\n");
#endif
#endif

      if(ThisTask == 0)
        {
          printf("domain decomposition done.\n");
          fflush(stdout);
        }

      t1 = second();
      All.CPU_Domain += timediff(t0, t1);

#ifdef PEANOHILBERT
      t0 = second();
      peano_hilbert_order();
      t1 = second();
      All.CPU_Peano += timediff(t0, t1);
#endif

      free(KeySorted);
      free(Key);
    }
}
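
/* Note on the decomposition trigger used above: a new decomposition is
 * done once the accumulated force computations exceed
 * All.TotNumPart * All.TreeDomainUpdateFrequency. E.g. with
 * TreeDomainUpdateFrequency = 0.1 and 10^6 particles in total, the
 * domain is re-decomposed after every ~10^5 force computations, i.e.
 * roughly whenever 10% of all particles received a new force.
 */
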
/*! This function carries out the actual domain decomposition for all
* particle types. It will try to balance the work-load for each domain,
 * as estimated based on the P[i].GravCost values. The decomposition will
* respect the maximum allowed memory-imbalance given by the value of
* PartAllocFactor.
*/
void domain_decompose(void)
{
  int i, j, status;
  int ngrp, task, partner, sendcount, recvcount;
  long long sumtogo, sumload;
  int maxload, *temp;
  double sumwork, maxwork;

#ifdef DETAILED_CPU_DOMAIN
  double t0, t1;
#endif

  for(i = 0; i < 6; i++)
    NtypeLocal[i] = 0;

  for(i = 0; i < NumPart; i++)
    NtypeLocal[P[i].Type]++;

  /* because Ntype[] is of type `long long', we cannot do a simple
   * MPI_Allreduce() to sum the total particle numbers
   */
  temp = malloc(NTask * 6 * sizeof(int));
  MPI_Allgather(NtypeLocal, 6, MPI_INT, temp, 6, MPI_INT, MPI_COMM_WORLD);
  for(i = 0; i < 6; i++)
    {
      Ntype[i] = 0;
      for(j = 0; j < NTask; j++)
        Ntype[i] += temp[j * 6 + i];
    }
  free(temp);

#ifndef UNEQUALSOFTENINGS
  for(i = 0; i < 6; i++)
    if(Ntype[i] > 0)
      break;

  for(ngrp = i + 1; ngrp < 6; ngrp++)
    {
      if(Ntype[ngrp] > 0)
        if(All.SofteningTable[ngrp] != All.SofteningTable[i])
          {
            if(ThisTask == 0)
              {
                fprintf(stdout, "Code was not compiled with UNEQUALSOFTENINGS, but some of the\n");
                fprintf(stdout, "softening lengths are unequal nevertheless.\n");
                fprintf(stdout, "This is not allowed.\n");
              }
            endrun(0);
          }
    }
#endif
  /* determine global dimensions of domain grid */
#ifdef DETAILED_CPU_DOMAIN
  t0 = second();
#endif
  domain_findExtent();
#ifdef DETAILED_CPU_DOMAIN
  t1 = second();
  All.CPU_Domain_findExtend += timediff(t0, t1);
#endif

#ifdef DETAILED_CPU_DOMAIN
  t0 = second();
#endif
  domain_determineTopTree();
#ifdef DETAILED_CPU_DOMAIN
  t1 = second();
  All.CPU_Domain_determineTopTree += timediff(t0, t1);
#endif

  /* determine cost distribution in domain grid */
#ifdef DETAILED_CPU_DOMAIN
  t0 = second();
#endif
  domain_sumCost();
#ifdef DETAILED_CPU_DOMAIN
  t1 = second();
  All.CPU_Domain_sumCost += timediff(t0, t1);
#endif

  /* find the split of the domain grid recursively */
#ifdef DETAILED_CPU_DOMAIN
  t0 = second();
#endif
#ifdef SPLIT_DOMAIN_USING_TIME
  status = domain_findSplityr(0, NTask, 0, NTopleaves - 1);
#else
  status = domain_findSplit(0, NTask, 0, NTopleaves - 1);
#endif
#ifdef DETAILED_CPU_DOMAIN
  t1 = second();
  All.CPU_Domain_findSplit += timediff(t0, t1);
#endif

  if(status != 0)
    {
      if(ThisTask == 0)
        printf("\nNo domain decomposition that stays within memory bounds is possible (status=%d).\n",
               status);
      endrun(0);
    }

  /* now try to improve the work-load balance of the split */
#ifdef DETAILED_CPU_DOMAIN
  t0 = second();
#endif
#ifdef SPLIT_DOMAIN_USING_TIME
  domain_shiftSplityr();
#else
  domain_shiftSplit();
#endif
#ifdef DETAILED_CPU_DOMAIN
  t1 = second();
  All.CPU_Domain_shiftSplit += timediff(t0, t1);
#endif

  DomainMyStart = DomainStartList[ThisTask];
  DomainMyLast = DomainEndList[ThisTask];

  if(ThisTask == 0)
    {
      sumload = maxload = 0;
      sumwork = maxwork = 0;
      for(i = 0; i < NTask; i++)
        {
          sumload += list_load[i];
          sumwork += list_work[i];

          if(list_load[i] > maxload)
            maxload = list_load[i];
          if(list_work[i] > maxwork)
            maxwork = list_work[i];
        }

      printf("work-load balance=%g memory-balance=%g\n",
             maxwork / (sumwork / NTask), maxload / (((double) sumload) / NTask));
    }
  /* determine for each cpu how many particles have to be shifted to other cpus */
#ifdef DETAILED_CPU_DOMAIN
  t0 = second();
#endif
  domain_countToGo();
#ifdef DETAILED_CPU_DOMAIN
  t1 = second();
  All.CPU_Domain_countToGo += timediff(t0, t1);
#endif

#ifdef DETAILED_CPU_DOMAIN
  t0 = second();
#endif
  for(i = 0, sumtogo = 0; i < NTask * NTask; i++)
    sumtogo += toGo[i];

  while(sumtogo > 0)
    {
      if(ThisTask == 0)
        {
          printf("exchange of %d%09d particles\n",
                 (int) (sumtogo / 1000000000), (int) (sumtogo % 1000000000));
          fflush(stdout);
        }

      for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
        {
          for(task = 0; task < NTask; task++)
            {
              partner = task ^ ngrp;

              if(partner < NTask && task < partner)
                {
                  /* treat SPH separately */
                  if(All.TotN_gas > 0)
                    {
                      domain_findExchangeNumbers(task, partner, 0, &sendcount, &recvcount);

                      list_NumPart[task] += recvcount - sendcount;
                      list_NumPart[partner] -= recvcount - sendcount;
                      list_N_gas[task] += recvcount - sendcount;
                      list_N_gas[partner] -= recvcount - sendcount;

                      toGo[task * NTask + partner] -= sendcount;
                      toGo[partner * NTask + task] -= recvcount;
                      toGoSph[task * NTask + partner] -= sendcount;
                      toGoSph[partner * NTask + task] -= recvcount;

                      //if (sendcount>0)
                      //  printf("(%d) gas --> %d (%d)\n",ThisTask,sendcount,N_gas);
                      //if (recvcount>0)
                      //  printf("(%d) gas <-- %d (%d)\n",ThisTask,recvcount,N_gas);
                      //printf("(%d) N_gas=%d P[N_gas].ID=%d %d (%d %d)\n",ThisTask,N_gas,P[N_gas].ID,P[N_gas].Type,sendcount,recvcount);

                      if(task == ThisTask)   /* actually carry out the exchange */
                        domain_exchangeParticles(partner, 0, sendcount, recvcount);
                      if(partner == ThisTask)
                        domain_exchangeParticles(task, 0, recvcount, sendcount);

                      //if (sendcount>0)
                      //  printf("(%d) (after) gas --> %d (%d)\n",ThisTask,sendcount,N_gas);
                      //if (recvcount>0)
                      //  printf("(%d) (after) gas <-- %d (%d)\n",ThisTask,recvcount,N_gas);
                      //printf("(%d) (after) N_gas=%d P[N_gas].ID=%d %d (%d %d)\n",ThisTask,N_gas,P[N_gas].ID,P[N_gas].Type,sendcount,recvcount);
                    }

#ifdef STELLAR_PROP
                  /* treat STARS separately */
                  if(All.TotN_stars > 0)
                    {
                      domain_findExchangeNumbers(task, partner, 1, &sendcount, &recvcount);

                      list_NumPart[task] += recvcount - sendcount;
                      list_NumPart[partner] -= recvcount - sendcount;
                      list_N_stars[task] += recvcount - sendcount;
                      list_N_stars[partner] -= recvcount - sendcount;

                      toGo[task * NTask + partner] -= sendcount;
                      toGo[partner * NTask + task] -= recvcount;
                      toGoSt[task * NTask + partner] -= sendcount;
                      toGoSt[partner * NTask + task] -= recvcount;

                      //if (sendcount>0)
                      //  printf("(%d) stars --> %d (%d)\n",ThisTask,sendcount,N_stars);
                      //if (recvcount>0)
                      //  printf("(%d) stars <-- %d (%d)\n",ThisTask,recvcount,N_stars);

                      if(task == ThisTask)   /* actually carry out the exchange */
                        domain_exchangeParticles(partner, 1, sendcount, recvcount);
                      if(partner == ThisTask)
                        domain_exchangeParticles(task, 1, recvcount, sendcount);

                      //if (sendcount>0)
                      //  printf("(%d) stars --> %d (%d)\n",ThisTask,sendcount,N_stars);
                      //if (recvcount>0)
                      //  printf("(%d) stars <-- %d (%d)\n",ThisTask,recvcount,N_stars);
                    }
#endif

                  domain_findExchangeNumbers(task, partner, -1, &sendcount, &recvcount);

                  list_NumPart[task] += recvcount - sendcount;
                  list_NumPart[partner] -= recvcount - sendcount;

                  toGo[task * NTask + partner] -= sendcount;
                  toGo[partner * NTask + task] -= recvcount;

                  if(task == ThisTask)   /* actually carry out the exchange */
                    domain_exchangeParticles(partner, -1, sendcount, recvcount);
                  if(partner == ThisTask)
                    domain_exchangeParticles(task, -1, recvcount, sendcount);
                }
            }
        }

      for(i = 0, sumtogo = 0; i < NTask * NTask; i++)
        sumtogo += toGo[i];
    }

#ifdef SFR
#ifndef STELLAR_PROP
  /* count the new N_stars : in case STELLAR_PROP is on,
     N_stars is already updated during the domain decomposition */
  int nstars;

  for(i = N_gas, nstars = 0; i < NumPart; i++)
    {
      if(P[i].Type == ST)
        nstars++;
    }
  N_stars = nstars;
#endif
#endif

#ifdef DETAILED_CPU_DOMAIN
  t1 = second();
  All.CPU_Domain_exchange += timediff(t0, t1);
#endif
}
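
/* Note on the exchange schedule used above: partner = task ^ ngrp pairs
 * the tasks up as in a hypercube all-to-all. For NTask = 4 the rounds are
 *   ngrp=1: (0,1) (2,3)
 *   ngrp=2: (0,2) (1,3)
 *   ngrp=3: (0,3) (1,2)
 * so every pair of tasks communicates exactly once per sweep, and the
 * sweeps repeat while sumtogo is still positive (i.e. while buffer limits
 * forced some exchanges to be split over several rounds).
 */
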
/*! This function tries to find a split point in a range of cells in the
 *  domain grid. The range of cells starts at 'first', and ends at 'last'
 *  (inclusively). The number of cpus that holds the range is 'ncpu', with
 *  the first cpu given by 'cpustart'. If more than 2 cpus are to be split,
 *  the function calls itself recursively. The division tries to achieve a
 *  best particle-load balance under the constraint that 'maxload' and
 *  'maxloadsph' may not be exceeded, and that each cpu holds at least one
 *  cell from the domain grid. If such a decomposition cannot be achieved, a
 *  non-zero error code is returned.
 *
 *  After successful completion, DomainStartList[] and DomainEndList[]
 *  contain the first and last cell of the domain grid assigned to each
 *  task. Also, DomainTask[] contains for each cell the task it was
 *  assigned to.
 */
int domain_findSplit(int cpustart, int ncpu, int first, int last)
{
  int i, split, ok_left, ok_right;
  long long load, sphload, load_leftOfSplit, sphload_leftOfSplit;
  int ncpu_leftOfSplit;
  double maxAvgLoad_CurrentSplit, maxAvgLoad_NewSplit;

  ncpu_leftOfSplit = ncpu / 2;

  for(i = first, load = 0, sphload = 0; i <= last; i++)
    {
      load += DomainCount[i];
      sphload += DomainCountSph[i];
    }

  split = first + ncpu_leftOfSplit;

  for(i = first, load_leftOfSplit = sphload_leftOfSplit = 0; i < split; i++)
    {
      load_leftOfSplit += DomainCount[i];
      sphload_leftOfSplit += DomainCountSph[i];
    }

  /* find the best split point in terms of work-load balance */
  while(split < last - (ncpu - ncpu_leftOfSplit - 1) && split > 0)
    {
      maxAvgLoad_CurrentSplit =
        dmax(load_leftOfSplit / ncpu_leftOfSplit, (load - load_leftOfSplit) / (ncpu - ncpu_leftOfSplit));
      maxAvgLoad_NewSplit =
        dmax((load_leftOfSplit + DomainCount[split]) / ncpu_leftOfSplit,
             (load - load_leftOfSplit - DomainCount[split]) / (ncpu - ncpu_leftOfSplit));

      if(maxAvgLoad_NewSplit <= maxAvgLoad_CurrentSplit)
        {
          load_leftOfSplit += DomainCount[split];
          sphload_leftOfSplit += DomainCountSph[split];
          split++;
        }
      else
        break;
    }

  /* we will now have to check whether this solution is possible given the restrictions on the maximum load */
  for(i = first, load_leftOfSplit = 0, sphload_leftOfSplit = 0; i < split; i++)
    {
      load_leftOfSplit += DomainCount[i];
      sphload_leftOfSplit += DomainCountSph[i];
    }

  if(load_leftOfSplit > maxload * ncpu_leftOfSplit ||
     (load - load_leftOfSplit) > maxload * (ncpu - ncpu_leftOfSplit))
    {
      /* we did not find a viable split */
      printf("(%d) error -1 ", ThisTask);
      return -1;
    }

  if(sphload_leftOfSplit > maxloadsph * ncpu_leftOfSplit ||
     (sphload - sphload_leftOfSplit) > maxloadsph * (ncpu - ncpu_leftOfSplit))
    {
      /* we did not find a viable split */
      printf("(%d) error -2 ", ThisTask);
      return -2;
    }

  if(ncpu_leftOfSplit >= 2)
    ok_left = domain_findSplit(cpustart, ncpu_leftOfSplit, first, split - 1);
  else
    ok_left = 0;

  if((ncpu - ncpu_leftOfSplit) >= 2)
    ok_right = domain_findSplit(cpustart + ncpu_leftOfSplit, ncpu - ncpu_leftOfSplit, split, last);
  else
    ok_right = 0;

  if(ok_left == 0 && ok_right == 0)
    {
      /* found a viable split */
      if(ncpu_leftOfSplit == 1)
        {
          for(i = first; i < split; i++)
            DomainTask[i] = cpustart;

          list_load[cpustart] = load_leftOfSplit;
          list_loadsph[cpustart] = sphload_leftOfSplit;
          DomainStartList[cpustart] = first;
          DomainEndList[cpustart] = split - 1;
        }

      if((ncpu - ncpu_leftOfSplit) == 1)
        {
          for(i = split; i <= last; i++)
            DomainTask[i] = cpustart + ncpu_leftOfSplit;

          list_load[cpustart + ncpu_leftOfSplit] = load - load_leftOfSplit;
          list_loadsph[cpustart + ncpu_leftOfSplit] = sphload - sphload_leftOfSplit;
          DomainStartList[cpustart + ncpu_leftOfSplit] = split;
          DomainEndList[cpustart + ncpu_leftOfSplit] = last;
        }

      return 0;
    }

  /* we did not find a viable split */
  return -3;
}
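
/* A small worked example of the greedy split above: for ncpu = 2 and
 * DomainCount = {3, 3, 2, 2} over cells 0..3, the split starts at cell 1
 * (ncpu_leftOfSplit = 1). Moving it to cell 2 changes the maximum average
 * load from max(3, 7) = 7 to max(6, 4) = 6, so the split advances; a
 * further move to cell 3 would give max(8, 2) = 8 and is rejected. The
 * result is the partition {3,3 | 2,2}, subject to the subsequent
 * maxload/maxloadsph checks.
 */
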
#ifdef SPLIT_DOMAIN_USING_TIME
/*! This function tries to find a split point in a range of cells
 */
int domain_findSplityr(int cpustart, int ncpu, int first, int last)
{
  /************************************/
  /* find the number of part per proc */
  /************************************/

  double sumCPU_Gravity, meanCPU_Gravity;
  double imb;
  double ImbFactor;
  int NumPartDiff, sumNumPartDiff, DesiredNumPart;
  int *list_DesiredNumPart;

  MPI_Allreduce(&CPU_Gravity, &sumCPU_Gravity, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  meanCPU_Gravity = sumCPU_Gravity / NTask;

  if(sumCPU_Gravity > 0)
    {
      imb = CPU_Gravity / meanCPU_Gravity;
      NumPartDiff = (int) ((meanCPU_Gravity - CPU_Gravity) / meanCPU_Gravity * All.TotNumPart / NTask * 0.5);
      ImbFactor = 0;
    }
  else
    {
      imb = 0;
      ImbFactor = 0;
      NumPartDiff = 0;
    }

  //NumPartDiff = (int)NumPart*ImbFactor;

  MPI_Reduce(&NumPartDiff, &sumNumPartDiff, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
  if(ThisTask == 0)
    {
      NumPartDiff -= sumNumPartDiff;
    }

  /* check */
  MPI_Reduce(&NumPartDiff, &sumNumPartDiff, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
  if(ThisTask == 0)
    if(sumNumPartDiff != 0)
      {
        printf("we are in trouble here...\n");
        endrun(89897676);
      }

  DesiredNumPart = NumPart + NumPartDiff;

  /* create a list */
  list_DesiredNumPart = malloc(sizeof(int) * NTask);
  MPI_Allgather(&DesiredNumPart, 1, MPI_INT, list_DesiredNumPart, 1, MPI_INT, MPI_COMM_WORLD);

  printf("(%04d) Step=%04d Imbalance : %12g %12g %12g %12g %12d %12d %12d\n",
         ThisTask, All.NumCurrentTiStep, CPU_Gravity, sumCPU_Gravity / NTask, imb, ImbFactor,
         NumPart, NumPartDiff, DesiredNumPart);

  /************************************/
  /* find the splits                  */
  /************************************/

  int task;
  int domain;

  /* loop over all top leaves */
  for(task = 0, domain = 0; task < NTask - 1; task++)
    {
      DomainStartList[task] = domain;

      list_load[task] = 0;
      list_loadsph[task] = 0;

      while((list_load[task] + DomainCount[domain] < list_DesiredNumPart[task])
            || ((list_load[task] + DomainCount[domain] - list_DesiredNumPart[task])
                < (list_DesiredNumPart[task] - list_load[task])))
        {
          /* add the domain to the task */
          list_load[task] += DomainCount[domain];
          list_loadsph[task] += DomainCountSph[domain];

          DomainTask[domain] = task;
          DomainEndList[task] = domain;

          /* move to next domain */
          domain++;

          if(domain == NTopleaves)
            {
              printf("not enough domains...\n");
              endrun(77665566);
            }
        }
    }

  /* now, do the last task */
  task = NTask - 1;
  DomainStartList[task] = domain;
  list_load[task] = 0;
  list_loadsph[task] = 0;

  for(; domain < NTopleaves; domain++)
    {
      list_load[task] += DomainCount[domain];
      list_loadsph[task] += DomainCountSph[domain];
      DomainTask[domain] = task;
      DomainEndList[task] = domain;
    }

  free(list_DesiredNumPart);

  return 0;
}
#endif
/*! This function tries to improve the domain decomposition found by
* domain_findSplit() with respect to work-load balance. To this end, the
* boundaries in the existing domain-split solution (which was found by
* trying to balance the particle load) are shifted as long as this leads
* to better work-load while still remaining within the allowed
* memory-imbalance constraints.
*/
void domain_shiftSplit(void)
{
  int i, task, iter = 0, moved;
  double maxw, newmaxw;

  for(task = 0; task < NTask; task++)
    list_work[task] = 0;

  for(i = 0; i < NTopleaves; i++)
    list_work[DomainTask[i]] += DomainWork[i];

#ifndef PY_INTERFACE
  if(ThisTask == 0)
    {
      fprintf(FdLog, "1 %12g ", All.Time);
      for(task = 0; task < NTask; task++)
        fprintf(FdLog, "%12g ", list_work[task]);
      fprintf(FdLog, "\n");
      fflush(FdLog);
    }
#endif

  do
    {
      for(task = 0, moved = 0; task < NTask - 1; task++)
        {
          maxw = dmax(list_work[task], list_work[task + 1]);

          if(list_work[task] < list_work[task + 1])
            {
              newmaxw = dmax(list_work[task] + DomainWork[DomainStartList[task + 1]],
                             list_work[task + 1] - DomainWork[DomainStartList[task + 1]]);
              if(newmaxw <= maxw)
                {
                  if(list_load[task] + DomainCount[DomainStartList[task + 1]] <= maxload)
                    {
                      if(list_loadsph[task] + DomainCountSph[DomainStartList[task + 1]] > maxloadsph)
                        continue;

                      /* ok, we can move one domain cell from right to left */
                      list_work[task] += DomainWork[DomainStartList[task + 1]];
                      list_load[task] += DomainCount[DomainStartList[task + 1]];
                      list_loadsph[task] += DomainCountSph[DomainStartList[task + 1]];
                      list_work[task + 1] -= DomainWork[DomainStartList[task + 1]];
                      list_load[task + 1] -= DomainCount[DomainStartList[task + 1]];
                      list_loadsph[task + 1] -= DomainCountSph[DomainStartList[task + 1]];

                      DomainTask[DomainStartList[task + 1]] = task;
                      DomainStartList[task + 1] += 1;
                      DomainEndList[task] += 1;

                      moved++;
                    }
                }
            }
          else
            {
              newmaxw = dmax(list_work[task] - DomainWork[DomainEndList[task]],
                             list_work[task + 1] + DomainWork[DomainEndList[task]]);
              if(newmaxw <= maxw)
                {
                  if(list_load[task + 1] + DomainCount[DomainEndList[task]] <= maxload)
                    {
                      if(list_loadsph[task + 1] + DomainCountSph[DomainEndList[task]] > maxloadsph)
                        continue;

                      /* ok, we can move one domain cell from left to right */
                      list_work[task] -= DomainWork[DomainEndList[task]];
                      list_load[task] -= DomainCount[DomainEndList[task]];
                      list_loadsph[task] -= DomainCountSph[DomainEndList[task]];
                      list_work[task + 1] += DomainWork[DomainEndList[task]];
                      list_load[task + 1] += DomainCount[DomainEndList[task]];
                      list_loadsph[task + 1] += DomainCountSph[DomainEndList[task]];

                      DomainTask[DomainEndList[task]] = task + 1;
                      DomainEndList[task] -= 1;
                      DomainStartList[task + 1] -= 1;

                      moved++;
                    }
                }
            }
        }

      iter++;
    }
  while(moved > 0 && iter < 10 * NTopleaves);

#ifndef PY_INTERFACE
  if(ThisTask == 0)
    {
      fprintf(FdLog, "2 %12g ", All.Time);
      for(task = 0; task < NTask; task++)
        fprintf(FdLog, "%12g ", list_work[task]);
      fprintf(FdLog, "\n");
      fflush(FdLog);
    }
#endif
}
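
/* Note on the loop above: each pass tries to move a single boundary cell
 * between neighbouring tasks whenever this lowers the larger of the two
 * work estimates without violating maxload/maxloadsph. The iteration
 * stops as soon as a full pass moves nothing, with 10 * NTopleaves
 * passes as a hard safety bound against cycling.
 */
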
#ifdef SPLIT_DOMAIN_USING_TIME
/*! This function is the (currently empty) counterpart to
 *  domain_shiftSplit() for the SPLIT_DOMAIN_USING_TIME scheme: the split
 *  found by domain_findSplityr() is used as-is, without any boundary
 *  shifting.
 */
void domain_shiftSplityr(void)
{
}
#endif
/*! This function counts how many particles have to be exchanged between
* two CPUs according to the domain split. If the CPUs are already quite
* full and hold data from other CPUs as well, not all the particles may
* be exchanged at once. In this case the communication phase has to be
* repeated, until enough of the third-party particles have been moved
* away such that the decomposition can be completed.
*/
void domain_findExchangeNumbers(int task, int partner, int sphflag, int *send, int *recv)
{
  int numpartA, numpartsphA, ntobesentA, maxsendA, maxsendA_old;
  int numpartB, numpartsphB, ntobesentB, maxsendB, maxsendB_old;

#ifdef STELLAR_PROP
  int numpartstA;
  int numpartstB;
#endif

  numpartA = list_NumPart[task];
  numpartsphA = list_N_gas[task];
#ifdef STELLAR_PROP
  numpartstA = list_N_stars[task];
#endif

  numpartB = list_NumPart[partner];
  numpartsphB = list_N_gas[partner];
#ifdef STELLAR_PROP
  numpartstB = list_N_stars[partner];
#endif

  switch(sphflag)
    {
    case 0:
      ntobesentA = toGoSph[task * NTask + partner];
      ntobesentB = toGoSph[partner * NTask + task];
      break;
#ifdef STELLAR_PROP
    case 1:
      ntobesentA = toGoSt[task * NTask + partner];
      ntobesentB = toGoSt[partner * NTask + task];
      break;
#endif
    default:
#ifdef STELLAR_PROP
      ntobesentA = toGo[task * NTask + partner]
        - toGoSph[task * NTask + partner] - toGoSt[task * NTask + partner];
      ntobesentB = toGo[partner * NTask + task]
        - toGoSph[partner * NTask + task] - toGoSt[partner * NTask + task];
#else
      ntobesentA = toGo[task * NTask + partner] - toGoSph[task * NTask + partner];
      ntobesentB = toGo[partner * NTask + task] - toGoSph[partner * NTask + task];
#endif
      break;
    }

  maxsendA = imin(ntobesentA, All.BunchSizeDomain);
  maxsendB = imin(ntobesentB, All.BunchSizeDomain);

  do
    {
      maxsendA_old = maxsendA;
      maxsendB_old = maxsendB;

      maxsendA = imin(All.MaxPart - numpartB + maxsendB, maxsendA);
      maxsendB = imin(All.MaxPart - numpartA + maxsendA, maxsendB);
    }
  while((maxsendA != maxsendA_old) || (maxsendB != maxsendB_old));

  /* now make also sure that there is enough space for SPH particles */
  switch(sphflag)
    {
    case 0:
      do
        {
          maxsendA_old = maxsendA;
          maxsendB_old = maxsendB;

          maxsendA = imin(All.MaxPartSph - numpartsphB + maxsendB, maxsendA);
          maxsendB = imin(All.MaxPartSph - numpartsphA + maxsendA, maxsendB);
        }
      while((maxsendA != maxsendA_old) || (maxsendB != maxsendB_old));
      break;
#ifdef STELLAR_PROP
    case 1:
      do
        {
          maxsendA_old = maxsendA;
          maxsendB_old = maxsendB;

          maxsendA = imin(All.MaxPartStars - numpartstB + maxsendB, maxsendA);
          maxsendB = imin(All.MaxPartStars - numpartstA + maxsendA, maxsendB);
        }
      while((maxsendA != maxsendA_old) || (maxsendB != maxsendB_old));
      break;
#endif
    default:
      break;
    }

  *send = maxsendA;
  *recv = maxsendB;
}
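
/* Note on the capacity negotiation above: maxsendA/maxsendB start from
 * the outstanding counts (capped by All.BunchSizeDomain) and are then
 * shrunk in a fixed-point iteration, since what A may send depends on the
 * space freed on B once B has sent its own particles, and vice versa. The
 * loops terminate because both values are integers, non-increasing, and
 * bounded.
 */
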
/*! This function exchanges particles between two CPUs according to the
 *  domain split. In doing this, the memory boundaries which may restrict
 *  the exchange process are observed.
 */
void domain_exchangeParticles(int partner, int sphflag, int send_count, int recv_count)
{
  int i, no, n, count, rep;
  MPI_Status status;

#ifdef STELLAR_PROP
  int m;
#endif

  for(n = 0, count = 0; count < send_count && n < NumPart; n++)
    {
      switch(sphflag)
        {
        case 0:
          if(P[n].Type != 0)
            continue;
          break;
#ifdef STELLAR_PROP
        case 1:
          if(P[n].Type != 1)
            continue;
          break;
#endif
        default:
#ifdef STELLAR_PROP
          if((P[n].Type == 0) || (P[n].Type == 1))
#else
          if(P[n].Type == 0)
#endif
            continue;
          break;
        }

      no = 0;

      while(TopNodes[no].Daughter >= 0)
        no = TopNodes[no].Daughter + (Key[n] - TopNodes[no].StartKey) / (TopNodes[no].Size / 8);

      no = TopNodes[no].Leaf;

      if(DomainTask[no] == partner)
        {
          switch(sphflag)
            {
            case 0:
              /* special reorder routine for SPH particles (need to stay at beginning) */
#ifndef STELLAR_PROP
              DomainPartBuf[count] = P[n];   /* copy particle and collect in contiguous memory */
              DomainKeyBuf[count] = Key[n];
              DomainSphBuf[count] = SphP[n];

              P[n] = P[N_gas - 1];
              P[N_gas - 1] = P[NumPart - 1];

              Key[n] = Key[N_gas - 1];
              Key[N_gas - 1] = Key[NumPart - 1];

              SphP[n] = SphP[N_gas - 1];

              N_gas--;
              break;
#else
              DomainPartBuf[count] = P[n];   /* copy particle and collect in contiguous memory */
              DomainKeyBuf[count] = Key[n];
              DomainSphBuf[count] = SphP[n];

              P[n] = P[N_gas - 1];
              P[N_gas - 1] = P[N_gas + N_stars - 1];
              P[N_gas + N_stars - 1] = P[NumPart - 1];

              Key[n] = Key[N_gas - 1];
              Key[N_gas - 1] = Key[N_gas + N_stars - 1];
              Key[N_gas + N_stars - 1] = Key[NumPart - 1];

              SphP[n] = SphP[N_gas - 1];

              StP[P[N_gas - 1].StPIdx].PIdx = N_gas - 1;

              N_gas--;
              break;
#endif
#ifdef STELLAR_PROP
            case 1:
              DomainPartBuf[count] = P[n];   /* copy particle and collect in contiguous memory */
              DomainKeyBuf[count] = Key[n];
              DomainStBuf[count] = StP[P[n].StPIdx];
              m = P[n].StPIdx;

              //printf("(%d) sending P .... n=%d id=%d\n",ThisTask,n,P[n].ID);
              //printf("(%d) replacing P using .... n=%d id=%d\n",ThisTask,N_gas+N_stars - 1,P[N_gas+N_stars - 1].ID);
              //printf("(%d) sending Stp .... n=%d id=%d\n",ThisTask,m,StP[m].ID);
              //printf("(%d) replacing StP using .... n=%d id=%d\n",ThisTask,N_stars - 1,StP[N_stars - 1].ID);

              P[n] = P[N_gas + N_stars - 1];
              StP[P[n].StPIdx].PIdx = n;   /* create correct link */

              P[N_gas + N_stars - 1] = P[NumPart - 1];

              Key[n] = Key[N_gas + N_stars - 1];
              Key[N_gas + N_stars - 1] = Key[NumPart - 1];

              if(m != (N_stars - 1))
                {
                  StP[m] = StP[N_stars - 1];   /* replace by the last one -> avoid a hole in StP */
                  P[StP[m].PIdx].StPIdx = m;   /* create correct link */
                }

              N_stars--;
              break;
#endif
            default:
              DomainPartBuf[count] = P[n];   /* copy particle and collect in contiguous memory */
              DomainKeyBuf[count] = Key[n];
              P[n] = P[NumPart - 1];
              Key[n] = Key[NumPart - 1];
              break;
            }

          count++;
          NumPart--;
          n--;
        }
    }

  if(count != send_count)
    {
      printf("Houston, we got a problem... (sphflag=%d)\n", sphflag);
      printf("ThisTask=%d count=%d send_count=%d\n", ThisTask, count, send_count);
      endrun(88);
    }
  /* transmit */
  for(rep = 0; rep < 2; rep++)
    {
      if((rep == 0 && ThisTask < partner) || (rep == 1 && ThisTask > partner))
        {
          if(send_count > 0)
            {
              MPI_Ssend(&DomainPartBuf[0], send_count * sizeof(struct particle_data), MPI_BYTE,
                        partner, TAG_PDATA, MPI_COMM_WORLD);

              MPI_Ssend(&DomainKeyBuf[0], send_count * sizeof(peanokey), MPI_BYTE,
                        partner, TAG_KEY, MPI_COMM_WORLD);

              if(sphflag == 0)
                MPI_Ssend(&DomainSphBuf[0], send_count * sizeof(struct sph_particle_data), MPI_BYTE,
                          partner, TAG_SPHDATA, MPI_COMM_WORLD);

#ifdef STELLAR_PROP
              if(sphflag == ST)
                MPI_Ssend(&DomainStBuf[0], send_count * sizeof(struct st_particle_data), MPI_BYTE,
                          partner, TAG_STDATA, MPI_COMM_WORLD);
#endif
            }
        }

      if((rep == 1 && ThisTask < partner) || (rep == 0 && ThisTask > partner))
        {
          if(recv_count > 0)
            {
              switch(sphflag)
                {
                case 0:
#ifndef STELLAR_PROP
                  if((NumPart - N_gas) > recv_count)
                    {
                      for(i = 0; i < recv_count; i++)
                        {
                          P[NumPart + i] = P[N_gas + i];
                          Key[NumPart + i] = Key[N_gas + i];
                        }
                    }
                  else
                    {
                      for(i = NumPart - 1; i >= N_gas; i--)
                        {
                          P[i + recv_count] = P[i];
                          Key[i + recv_count] = Key[i];
                        }
                    }
#else
                  /* A : move elts of last block */
                  if((NumPart - N_gas - N_stars) > recv_count)
                    {
                      for(i = 0; i < recv_count; i++)
                        {
                          P[NumPart + i] = P[N_gas + N_stars + i];
                          Key[NumPart + i] = Key[N_gas + N_stars + i];
                        }
                    }
                  else
                    {
                      for(i = NumPart - 1; i >= N_gas + N_stars; i--)
                        {
                          P[i + recv_count] = P[i];
                          Key[i + recv_count] = Key[i];
                        }
                    }

                  /* B : move stars */
                  if(N_stars > 0)
                    {
                      if(N_stars > recv_count)
                        {
                          for(i = 0; i < recv_count; i++)
                            {
                              P[N_gas + N_stars + i] = P[N_gas + i];
                              Key[N_gas + N_stars + i] = Key[N_gas + i];
                              StP[P[N_gas + N_stars + i].StPIdx].PIdx = N_gas + N_stars + i;
                            }
                        }
                      else
                        {
                          for(i = N_gas + N_stars - 1; i >= N_gas; i--)
                            {
                              P[i + recv_count] = P[i];
                              Key[i + recv_count] = Key[i];
                              StP[P[i + recv_count].StPIdx].PIdx = i + recv_count;
                            }
                        }
                    }
#endif
                  MPI_Recv(&P[N_gas], recv_count * sizeof(struct particle_data), MPI_BYTE,
                           partner, TAG_PDATA, MPI_COMM_WORLD, &status);
                  MPI_Recv(&Key[N_gas], recv_count * sizeof(peanokey), MPI_BYTE,
                           partner, TAG_KEY, MPI_COMM_WORLD, &status);
                  MPI_Recv(&SphP[N_gas], recv_count * sizeof(struct sph_particle_data), MPI_BYTE,
                           partner, TAG_SPHDATA, MPI_COMM_WORLD, &status);

                  N_gas += recv_count;
                  break;
#ifdef STELLAR_PROP
                case 1:
                  if((NumPart - N_gas - N_stars) > recv_count)
                    {
                      for(i = 0; i < recv_count; i++)
                        {
                          P[NumPart + i] = P[N_gas + N_stars + i];
                          Key[NumPart + i] = Key[N_gas + N_stars + i];
                        }
                    }
                  else
                    {
                      for(i = NumPart - 1; i >= N_gas + N_stars; i--)
                        {
                          P[i + recv_count] = P[i];
                          Key[i + recv_count] = Key[i];
                        }
                    }

                  MPI_Recv(&P[N_gas + N_stars], recv_count * sizeof(struct particle_data), MPI_BYTE,
                           partner, TAG_PDATA, MPI_COMM_WORLD, &status);
                  MPI_Recv(&Key[N_gas + N_stars], recv_count * sizeof(peanokey), MPI_BYTE,
                           partner, TAG_KEY, MPI_COMM_WORLD, &status);
                  MPI_Recv(&StP[N_stars], recv_count * sizeof(struct st_particle_data), MPI_BYTE,
                           partner, TAG_STDATA, MPI_COMM_WORLD, &status);

                  /* set right links */
                  for(i = 0; i < recv_count; i++)
                    {
                      P[N_gas + N_stars + i].StPIdx = N_stars + i;
                      StP[N_stars + i].PIdx = N_gas + N_stars + i;
                    }

                  N_stars += recv_count;
                  break;
#endif
                default:
                  MPI_Recv(&P[NumPart], recv_count * sizeof(struct particle_data), MPI_BYTE,
                           partner, TAG_PDATA, MPI_COMM_WORLD, &status);
                  MPI_Recv(&Key[NumPart], recv_count * sizeof(peanokey), MPI_BYTE,
                           partner, TAG_KEY, MPI_COMM_WORLD, &status);
                  break;
                }

              NumPart += recv_count;
            }
        }
    }
}
/*! This function determines how many of the particles that are currently stored
* on the local CPU have to be moved off according to the domain
* decomposition.
*/
void domain_countToGo(void)
{
  int n, no;

  for(n = 0; n < NTask; n++)
    {
      local_toGo[n] = 0;
      local_toGoSph[n] = 0;
#ifdef STELLAR_PROP
      local_toGoSt[n] = 0;
#endif
    }

  for(n = 0; n < NumPart; n++)
    {
      no = 0;

      while(TopNodes[no].Daughter >= 0)
        no = TopNodes[no].Daughter + (Key[n] - TopNodes[no].StartKey) / (TopNodes[no].Size / 8);

      no = TopNodes[no].Leaf;

      if(DomainTask[no] != ThisTask)
        {
          local_toGo[DomainTask[no]] += 1;
          if(P[n].Type == 0)
            local_toGoSph[DomainTask[no]] += 1;
#ifdef STELLAR_PROP
          if(P[n].Type == 1)
            local_toGoSt[DomainTask[no]] += 1;
#endif
        }
    }

  MPI_Allgather(local_toGo, NTask, MPI_INT, toGo, NTask, MPI_INT, MPI_COMM_WORLD);
  MPI_Allgather(local_toGoSph, NTask, MPI_INT, toGoSph, NTask, MPI_INT, MPI_COMM_WORLD);
#ifdef STELLAR_PROP
  MPI_Allgather(local_toGoSt, NTask, MPI_INT, toGoSt, NTask, MPI_INT, MPI_COMM_WORLD);
#endif
}
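
/* Note on the top-tree walk used above and in domain_exchangeParticles()
 * (and again in domain_sumCost() below): every internal node covers
 * TopNodes[].Size consecutive Peano-Hilbert keys and its eight daughters
 * cover Size/8 keys each, so (Key - StartKey) / (Size/8) yields the
 * daughter index 0..7 that contains the key. E.g. a node with StartKey=0
 * and Size=64 sends key 21 to daughter 21/8 = 2. The walk thus locates
 * the top-level leaf, and with it the responsible task, in O(depth)
 * integer divisions.
 */
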
/*! This function walks the global top tree in order to establish the
* number of leaves it has. These leaves are distributed to different
* processors.
*/
void domain_walktoptree(int no)
{
  int i;

  if(TopNodes[no].Daughter == -1)
    {
      TopNodes[no].Leaf = NTopleaves;
      NTopleaves++;
    }
  else
    {
      for(i = 0; i < 8; i++)
        domain_walktoptree(TopNodes[no].Daughter + i);
    }
}
/*! This routine bins the particles onto the domain-grid, i.e. it sums up the
* total number of particles and the total amount of work in each of the
* domain-cells. This information forms the basis for the actual decision on
* the adopted domain decomposition.
*/
void domain_sumCost(void)
{
  int i, n, no;
  double *local_DomainWork;
  int *local_DomainCount;
  int *local_DomainCountSph;

  local_DomainWork = malloc(NTopnodes * sizeof(double));
  local_DomainCount = malloc(NTopnodes * sizeof(int));
  local_DomainCountSph = malloc(NTopnodes * sizeof(int));

  NTopleaves = 0;

  domain_walktoptree(0);

  for(i = 0; i < NTopleaves; i++)
    {
      local_DomainWork[i] = 0;
      local_DomainCount[i] = 0;
      local_DomainCountSph[i] = 0;
    }

  if(ThisTask == 0)
    printf("NTopleaves= %d\n", NTopleaves);

  for(n = 0; n < NumPart; n++)
    {
      no = 0;

      while(TopNodes[no].Daughter >= 0)
        no = TopNodes[no].Daughter + (Key[n] - TopNodes[no].StartKey) / (TopNodes[no].Size / 8);

      no = TopNodes[no].Leaf;

      if(P[n].Ti_endstep > P[n].Ti_begstep)
        local_DomainWork[no] += (1.0 + P[n].GravCost) / (P[n].Ti_endstep - P[n].Ti_begstep);
      else
        local_DomainWork[no] += (1.0 + P[n].GravCost);

      local_DomainCount[no] += 1;
      if(P[n].Type == 0)
        local_DomainCountSph[no] += 1;
    }

  MPI_Allreduce(local_DomainWork, DomainWork, NTopleaves, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(local_DomainCount, DomainCount, NTopleaves, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(local_DomainCountSph, DomainCountSph, NTopleaves, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  free(local_DomainCountSph);
  free(local_DomainCount);
  free(local_DomainWork);
}
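
/* Note on the work estimate above: a particle contributes
 * (1 + GravCost) / (Ti_endstep - Ti_begstep), i.e. its tree-walk cost
 * divided by its timestep, so short-timestep (frequently active)
 * particles weigh more heavily in the work balance than quiescent ones.
 */
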
/*! This routine finds the extent of the global domain grid.
*/
void domain_findExtent(void)
{
  int i, j;
  double len, xmin[3], xmax[3], xmin_glob[3], xmax_glob[3];

  /* determine local extension */
  for(j = 0; j < 3; j++)
    {
      xmin[j] = MAX_REAL_NUMBER;
      xmax[j] = -MAX_REAL_NUMBER;
    }

  for(i = 0; i < NumPart; i++)
    {
      for(j = 0; j < 3; j++)
        {
          if(xmin[j] > P[i].Pos[j])
            xmin[j] = P[i].Pos[j];

          if(xmax[j] < P[i].Pos[j])
            xmax[j] = P[i].Pos[j];
        }
    }

  MPI_Allreduce(xmin, xmin_glob, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(xmax, xmax_glob, 3, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

  len = 0;
  for(j = 0; j < 3; j++)
    if(xmax_glob[j] - xmin_glob[j] > len)
      len = xmax_glob[j] - xmin_glob[j];

  len *= 1.001;

#ifdef DOMAIN_AT_ORIGIN
  for(j = 0; j < 3; j++)
    {
      DomainCenter[j] = 0.0;
      DomainCorner[j] = -0.5 * len;
    }
#else
  for(j = 0; j < 3; j++)
    {
      DomainCenter[j] = 0.5 * (xmin_glob[j] + xmax_glob[j]);
      DomainCorner[j] = 0.5 * (xmin_glob[j] + xmax_glob[j]) - 0.5 * len;
    }
#endif

  DomainLen = len;
  DomainFac = 1.0 / len * (((peanokey) 1) << (BITS_PER_DIMENSION));

#ifdef OTHERINFO
  if(ThisTask == 0)
    {
      printf("xmin_glob = %g %g %g\n", xmin_glob[0], xmin_glob[1], xmin_glob[2]);
      printf("xmax_glob = %g %g %g\n", xmax_glob[0], xmax_glob[1], xmax_glob[2]);
      printf("DomainCenter = %g %g %g\n", DomainCenter[0], DomainCenter[1], DomainCenter[2]);
      printf("DomainCorner = %g %g %g\n", DomainCorner[0], DomainCorner[1], DomainCorner[2]);
      printf("DomainLen = %g\n", DomainLen);
    }
#endif
}
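
/* Note on DomainFac: positions are mapped into the key grid via
 * (Pos - DomainCorner) * DomainFac, which lands in
 * [0, 2^BITS_PER_DIMENSION) per axis since DomainLen covers the whole
 * extent (times 1.001 to keep the maximum strictly inside). These
 * integer coordinates feed peano_hilbert_key() in
 * domain_determineTopTree() below.
 */
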
/*! This function constructs the global top-level tree node that is used
* for the domain decomposition. This is done by considering the string of
 * Peano-Hilbert keys for all particles, which is recursively chopped into
 * pieces of eight segments until each segment holds at most a certain
* number of particles.
*/
void domain_determineTopTree(void)
{
  int i, ntop_local, ntop;
  int *ntopnodelist, *ntopoffset;

  for(i = 0; i < NumPart; i++)
    {
      KeySorted[i] = Key[i] = peano_hilbert_key((P[i].Pos[0] - DomainCorner[0]) * DomainFac,
                                                (P[i].Pos[1] - DomainCorner[1]) * DomainFac,
                                                (P[i].Pos[2] - DomainCorner[2]) * DomainFac,
                                                BITS_PER_DIMENSION);
    }

  qsort(KeySorted, NumPart, sizeof(peanokey), domain_compare_key);

  NTopnodes = 1;
  TopNodes[0].Daughter = -1;
  TopNodes[0].Size = PEANOCELLS;
  TopNodes[0].StartKey = 0;
  TopNodes[0].Count = NumPart;
  TopNodes[0].Pstart = 0;

  domain_topsplit_local(0, 0);

  toplist_local = malloc(NTopnodes * sizeof(struct topnode_exchange));

  for(i = 0, ntop_local = 0; i < NTopnodes; i++)
    {
      if(TopNodes[i].Daughter == -1)   /* only use leaves */
        {
          toplist_local[ntop_local].Startkey = TopNodes[i].StartKey;
          toplist_local[ntop_local].Count = TopNodes[i].Count;
          ntop_local++;
        }
    }

  ntopnodelist = malloc(sizeof(int) * NTask);
  ntopoffset = malloc(sizeof(int) * NTask);

  MPI_Allgather(&ntop_local, 1, MPI_INT, ntopnodelist, 1, MPI_INT, MPI_COMM_WORLD);

  for(i = 0, ntop = 0, ntopoffset[0] = 0; i < NTask; i++)
    {
      ntop += ntopnodelist[i];
      if(i > 0)
        ntopoffset[i] = ntopoffset[i - 1] + ntopnodelist[i - 1];
    }

  toplist = malloc(ntop * sizeof(struct topnode_exchange));

  for(i = 0; i < NTask; i++)
    {
      ntopnodelist[i] *= sizeof(struct topnode_exchange);
      ntopoffset[i] *= sizeof(struct topnode_exchange);
    }

  MPI_Allgatherv(toplist_local, ntop_local * sizeof(struct topnode_exchange), MPI_BYTE,
                 toplist, ntopnodelist, ntopoffset, MPI_BYTE, MPI_COMM_WORLD);

  qsort(toplist, ntop, sizeof(struct topnode_exchange), domain_compare_toplist);

  NTopnodes = 1;
  TopNodes[0].Daughter = -1;
  TopNodes[0].Size = PEANOCELLS;
  TopNodes[0].StartKey = 0;
  TopNodes[0].Count = All.TotNumPart;
  TopNodes[0].Pstart = 0;
  TopNodes[0].Blocks = ntop;

  domain_topsplit(0, 0);

  free(toplist);
  free(ntopoffset);
  free(ntopnodelist);
  free(toplist_local);
}
/*! This function is responsible for constructing the local top-level
* Peano-Hilbert segments. A segment is cut into 8 pieces recursively
* until the number of particles in the segment has fallen below
 * All.TotNumPart / (TOPNODEFACTOR * 8 * NTask).
*/
void domain_topsplit_local(int node, peanokey startkey)
{
  int i, p, sub, bin;

  if(TopNodes[node].Size >= 8)
    {
      TopNodes[node].Daughter = NTopnodes;

      for(i = 0; i < 8; i++)
        {
          if(NTopnodes < MAXTOPNODES)
            {
              sub = TopNodes[node].Daughter + i;
              TopNodes[sub].Size = TopNodes[node].Size / 8;
              TopNodes[sub].Count = 0;
              TopNodes[sub].Daughter = -1;
              TopNodes[sub].StartKey = startkey + i * TopNodes[sub].Size;
              TopNodes[sub].Pstart = TopNodes[node].Pstart;

              NTopnodes++;
            }
          else
            {
              printf("task=%d: We are out of Topnodes. Increasing the constant MAXTOPNODES might help.\n",
                     ThisTask);
              fflush(stdout);
              endrun(13213);
            }
        }

      for(p = TopNodes[node].Pstart; p < TopNodes[node].Pstart + TopNodes[node].Count; p++)
        {
          bin = (KeySorted[p] - startkey) / (TopNodes[node].Size / 8);

          if(bin < 0 || bin > 7)
            {
              printf("task=%d: something odd has happened here. bin=%d\n", ThisTask, bin);
              fflush(stdout);
              endrun(13123123);
            }

          sub = TopNodes[node].Daughter + bin;

          if(TopNodes[sub].Count == 0)
            TopNodes[sub].Pstart = p;

          TopNodes[sub].Count++;
        }

      for(i = 0; i < 8; i++)
        {
          sub = TopNodes[node].Daughter + i;

          //if(TopNodes[sub].Count > All.TotNumPart / (TOPNODEFACTOR * NTask * NTask))
          if(TopNodes[sub].Count > All.TotNumPart / (TOPNODEFACTOR * 8 * NTask))
            domain_topsplit_local(sub, TopNodes[sub].StartKey);
        }
    }
}
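
/* Note that the local refinement above stops at
 * All.TotNumPart / (TOPNODEFACTOR * 8 * NTask) particles per segment,
 * somewhat finer than the All.TotNumPart / (TOPNODEFACTOR * NTask)
 * criterion of the global domain_topsplit() below, so that the merged
 * list of local segments is in most cases already fine enough to satisfy
 * the global criterion.
 */
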
/*! This function is responsible for constructing the global top-level tree
* segments. Starting from a joint list of all local top-level segments,
 * in which multiple occurrences of the same spatial segment have been
* combined, a segment is subdivided into 8 pieces recursively until the
* number of particles in each segment has fallen below All.TotNumPart /
* (TOPNODEFACTOR * NTask).
*/
void domain_topsplit(int node, peanokey startkey)
{
  int i, p, sub, bin;

  if(TopNodes[node].Size >= 8)
    {
      TopNodes[node].Daughter = NTopnodes;

      for(i = 0; i < 8; i++)
        {
          if(NTopnodes < MAXTOPNODES)
            {
              sub = TopNodes[node].Daughter + i;
              TopNodes[sub].Size = TopNodes[node].Size / 8;
              TopNodes[sub].Count = 0;
              TopNodes[sub].Blocks = 0;
              TopNodes[sub].Daughter = -1;
              TopNodes[sub].StartKey = startkey + i * TopNodes[sub].Size;
              TopNodes[sub].Pstart = TopNodes[node].Pstart;

              NTopnodes++;
            }
          else
            {
              printf("Task=%d: We are out of Topnodes. Increasing the constant MAXTOPNODES might help.\n",
                     ThisTask);
              fflush(stdout);
              endrun(137213);
            }
        }

      for(p = TopNodes[node].Pstart; p < TopNodes[node].Pstart + TopNodes[node].Blocks; p++)
        {
          bin = (toplist[p].Startkey - startkey) / (TopNodes[node].Size / 8);
          sub = TopNodes[node].Daughter + bin;

          if(bin < 0 || bin > 7)
            endrun(77);

          if(TopNodes[sub].Blocks == 0)
            TopNodes[sub].Pstart = p;

          TopNodes[sub].Count += toplist[p].Count;
          TopNodes[sub].Blocks++;
        }

      for(i = 0; i < 8; i++)
        {
          sub = TopNodes[node].Daughter + i;

          if(TopNodes[sub].Count > All.TotNumPart / (TOPNODEFACTOR * NTask))
            domain_topsplit(sub, TopNodes[sub].StartKey);
        }
    }
}
/*! This is a comparison kernel used in a sort routine.
*/
int domain_compare_toplist(const void *a, const void *b)
{
  if(((struct topnode_exchange *) a)->Startkey < ((struct topnode_exchange *) b)->Startkey)
    return -1;

  if(((struct topnode_exchange *) a)->Startkey > ((struct topnode_exchange *) b)->Startkey)
    return +1;

  return 0;
}
/*! This is a comparison kernel used in a sort routine.
*/
int domain_compare_key(const void *a, const void *b)
{
  if(*(peanokey *) a < *(peanokey *) b)
    return -1;

  if(*(peanokey *) a > *(peanokey *) b)
    return +1;

  return 0;
}