diff --git a/scripts/execute.sh b/scripts/execute.sh
index 75b5197..bceb0f0 100755
--- a/scripts/execute.sh
+++ b/scripts/execute.sh
@@ -1,70 +1,58 @@
 #!/bin/bash
 # author: gilles fourestey (EPFL)
 # based on http://serverfault.com/questions/776687/how-can-i-run-spark-on-a-cluster-using-slurm
 # debug flag
 # set -x
 #echo $0 $1
 #env | grep SPARK
 # This section will be run when started by sbatch
 if [ "x$1" == 'x' ]; then
     this=$0
     # I experienced problems with some nodes not finding the script:
     # slurmstepd: execve(): /var/spool/slurm/job123/slurm_script:
     # No such file or directory
     # that's why this script is being copied to a shared location
     # to which all nodes have access:
-    #script=./${SLURM_JOBID}_$(basename "$0")
-    #cp "$this" "$script"
     script=$this
     #
-    # This might not be necessary on all clusters
-    # module load scala/2.10.4 java/jdk1.7.0_25 cuda/7.0.28
-    #
     export sparkLogs=./sparklogs/
     export sparkTmp=./sparktmp/
     mkdir -p "$sparkLogs" "$sparkTmp"
     #
     export SPARK_ROOT=/home/foureste/Projects/Spark/spark-2.0.1-bin-hadoop2.7/
     export SPARK_WORKER_DIR=$sparkLogs
     export SPARK_LOCAL_DIRS=$sparkLogs
     export SPARK_MASTER_PORT=7077
     export SPARK_MASTER_WEBUI_PORT=8080
     export SPARK_WORKER_CORES=$SLURM_CPUS_PER_TASK
+    #export SPARK_MEM=48G
     #export SPARK_DRIVER_MEMORY=60G
     #export SPARK_DAEMON_MEMORY=$(( $SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK / 2 ))m
     #export SPARK_DAEMON_MEMORY=8G
     #export SPARK_WORKER_MEMORY=48G
     #export SPARK_MEM=$SPARK_DAEMON_MEMORY
-    #export SPARK_MEM=48G
+    #srun --mem=60G "$script" 'srunning'
     #
-    #srun --mem=60G "$script" 'srunning'
     srun "$script" 'srunning'
 #
 # If run by srun, then decide by $SLURM_PROCID whether we are master or worker
 elif [ "$1" == 'srunning' ]; then
-#set -x
-    #source "$SPARK_ROOT/sbin/spark-config.sh"
-    #source "$SPARK_ROOT/bin/load-spark-env.sh"
     if [ $SLURM_PROCID -eq 0 ]; then
         export SPARK_MASTER_IP=$(hostname)
         MASTER_NODE=$(scontrol show hostname $SLURM_NODELIST | head -n 1)
         #
         # The saved IP address + port is necessary later for submitting jobs
-        #echo "spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT" > "$sparkLogs/${SLURM_JOBID}_spark_master"
         echo "spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT" > "./${SLURM_JOBID}_spark_master"
         #
         "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.master.Master \
             --ip $SPARK_MASTER_IP \
             --port $SPARK_MASTER_PORT \
             --webui-port $SPARK_MASTER_WEBUI_PORT
     else
-        # $(scontrol show hostname) is used to convert e.g. host20[39-40]
-        # to host2039; this step assumes that SLURM_PROCID=0 corresponds to
-        # the first node in SLURM_NODELIST !
         MASTER_NODE=spark://$(scontrol show hostname $SLURM_NODELIST | head -n 1):7077
         "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.worker.Worker $MASTER_NODE
     fi
 fi
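
A minimal usage sketch (not part of the patch), assuming the script is kept at scripts/execute.sh and the local SLURM defaults apply; the node, CPU, and memory figures, the job id, and the application class/jar below are placeholders, not values taken from the diff:

    # Start the standalone cluster: one task per node, each worker sized by
    # --cpus-per-task (picked up through SPARK_WORKER_CORES in execute.sh).
    sbatch --nodes=4 --ntasks-per-node=1 --cpus-per-task=8 --mem=60G scripts/execute.sh

    # The master task writes spark://<host>:7077 to ./<jobid>_spark_master in the
    # submission directory; that URL can be reused to submit an application.
    JOBID=123456                               # job id reported by sbatch (placeholder)
    SPARK_ROOT=/home/foureste/Projects/Spark/spark-2.0.1-bin-hadoop2.7/   # same install path the script hard-codes
    MASTER_URL=$(cat "./${JOBID}_spark_master")

    # org.example.MyApp and my-app.jar are hypothetical application class/jar names.
    "$SPARK_ROOT/bin/spark-submit" --master "$MASTER_URL" --class org.example.MyApp my-app.jar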