diff --git a/examples/pi.slurm b/examples/diffusion.slurm
similarity index 80%
copy from examples/pi.slurm
copy to examples/diffusion.slurm
index 7fa09d8..61b5969 100644
--- a/examples/pi.slurm
+++ b/examples/diffusion.slurm
@@ -1,43 +1,48 @@
 #!/bin/bash
-# author: gilles foureste (EPFL)
+# author: gilles fourestey (EPFL)
 #
 #SBATCH --nodes=2
 # ntasks per node MUST be one, because multiple slaves per node don't
 # work well with slurm + spark in this script (they would need increasing
 # ports, among other things)
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=24
 #SBATCH --mem=8192
 # Beware! $HOME will not be expanded, and invalid paths will result in Slurm
 # jobs hanging indefinitely with status CG (completing) when calling scancel!
-##SBATCH --time=96:00:00
+#SBATCH --time=00:30:00
 #SBATCH --partition=scitas
-#SBATCH --qos=scitas
 #set -x
 #
+
+module load spark/2.0.2
+
+### modify the path to your scripts directory
+export PATH=/home/rezzonic/scitas_sparkservice/scripts:$PATH
+### end modif
+
+
 echo "---- starting $0 on $HOSTNAME"
 echo
 #
 MASTER_NODE=""
 start-spark.sh
 echo "configuration done..."

 set -x
 #
 #echo $MASTER_NODE
 #echo $MASTER
 #
 #MASTER_IP=$(cat ./sparklogs_${SLURM_JOBID}/spark_master)
 MASTER_IP=$(cat ${SLURM_JOBID}_spark_master)
 #
 echo $MASTER_IP

 time
 time spark-submit \
 --executor-memory 5G \
 --master $MASTER_IP \
 ./diffusion.py
 #
 stop-spark.sh
-#scancel -u foureste
-#spark-submit --master $MASTER ./pi.py
diff --git a/examples/pi.slurm b/examples/pi.slurm
index 7fa09d8..f08df34 100644
--- a/examples/pi.slurm
+++ b/examples/pi.slurm
@@ -1,43 +1,48 @@
 #!/bin/bash
-# author: gilles foureste (EPFL)
 #
 #SBATCH --nodes=2
 # ntasks per node MUST be one, because multiple slaves per node don't
 # work well with slurm + spark in this script (they would need increasing
 # ports, among other things)
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=24
 #SBATCH --mem=8192
 # Beware! $HOME will not be expanded, and invalid paths will result in Slurm
 # jobs hanging indefinitely with status CG (completing) when calling scancel!
-##SBATCH --time=96:00:00
+#SBATCH --time=00:30:00
 #SBATCH --partition=scitas
-#SBATCH --qos=scitas
 #set -x
 #
+
+module load spark/2.0.2
+
+### modify here with correct path to scripts directory
+export PATH=/home/rezzonic/scitas_sparkservice/scripts:$PATH
+### end modify
+
 echo "---- starting $0 on $HOSTNAME"
 echo
 #
 MASTER_NODE=""
 start-spark.sh
 echo "configuration done..."

 set -x
 #
 #echo $MASTER_NODE
 #echo $MASTER
 #
 #MASTER_IP=$(cat ./sparklogs_${SLURM_JOBID}/spark_master)
 MASTER_IP=$(cat ${SLURM_JOBID}_spark_master)
 #
 echo $MASTER_IP

 time
 time spark-submit \
 --executor-memory 5G \
 --master $MASTER_IP \
-./diffusion.py
+./pi.py
 #
 stop-spark.sh
 #scancel -u foureste
 #spark-submit --master $MASTER ./pi.py
diff --git a/scripts/execute.sh b/scripts/execute.sh
index 75b5197..127571a 100755
--- a/scripts/execute.sh
+++ b/scripts/execute.sh
@@ -1,70 +1,70 @@
 #!/bin/bash
 # author: gilles fourestey (EPFL)
 # based on http://serverfault.com/questions/776687/how-can-i-run-spark-on-a-cluster-using-slurm
 # debug flag
 # set -x
 #echo $0 $1
 #env | grep SPARK

 # This section will be run when started by sbatch
 if [ "x$1" == 'x' ]; then
     this=$0
     # I experienced problems with some nodes not finding the script:
     #   slurmstepd: execve(): /var/spool/slurm/job123/slurm_script:
     #   No such file or directory
     # that's why this script is being copied to a shared location to which
     # all nodes have access:
     #script=./${SLURM_JOBID}_$(basename "$0")
     #cp "$this" "$script"
     script=$this
     #
     # This might not be necessary on all clusters
     # module load scala/2.10.4 java/jdk1.7.0_25 cuda/7.0.28
     #
     export sparkLogs=./sparklogs/
     export sparkTmp=./sparktmp/
     mkdir -p "$sparkLogs" "$sparkTmp"
     #
-    export SPARK_ROOT=/home/foureste/Projects/Spark/spark-2.0.1-bin-hadoop2.7/
+    #export SPARK_ROOT=/home/foureste/Projects/Spark/spark-2.0.1-bin-hadoop2.7/
     export SPARK_WORKER_DIR=$sparkLogs
     export SPARK_LOCAL_DIRS=$sparkLogs
     export SPARK_MASTER_PORT=7077
     export SPARK_MASTER_WEBUI_PORT=8080
     export SPARK_WORKER_CORES=$SLURM_CPUS_PER_TASK
     #export SPARK_DRIVER_MEMORY=60G
     #export SPARK_DAEMON_MEMORY=$(( $SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK / 2 ))m
     #export SPARK_DAEMON_MEMORY=8G
     #export SPARK_WORKER_MEMORY=48G
     #export SPARK_MEM=$SPARK_DAEMON_MEMORY
     #export SPARK_MEM=48G
     #
     #srun --mem=60G "$script" 'srunning'
     srun "$script" 'srunning'
 #
 # If run by srun, then decide by $SLURM_PROCID whether we are master or worker
 elif [ "$1" == 'srunning' ]; then
     #set -x
     #source "$SPARK_ROOT/sbin/spark-config.sh"
     #source "$SPARK_ROOT/bin/load-spark-env.sh"
     if [ $SLURM_PROCID -eq 0 ]; then
         export SPARK_MASTER_IP=$(hostname)
         MASTER_NODE=$(scontrol show hostname $SLURM_NODELIST | head -n 1)
         #
         # The saved IP address + port is necessary later for submitting jobs
         #echo "spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT" > "$sparkLogs/${SLURM_JOBID}_spark_master"
         echo "spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT" > "./${SLURM_JOBID}_spark_master"
         #
         "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.master.Master \
             --ip $SPARK_MASTER_IP \
             --port $SPARK_MASTER_PORT \
             --webui-port $SPARK_MASTER_WEBUI_PORT
     else
         # $(scontrol show hostname) is used to convert e.g. host20[39-40]
         # to host2039 host2040; this step assumes that SLURM_PROCID=0
         # corresponds to the first node in SLURM_NODELIST!
         MASTER_NODE=spark://$(scontrol show hostname $SLURM_NODELIST | head -n 1):7077
         "$SPARK_ROOT/bin/spark-class" org.apache.spark.deploy.worker.Worker $MASTER_NODE
     fi
 fi
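Taken together, the two job scripts and execute.sh form a small pipeline: sbatch runs the .slurm file, start-spark.sh hands execute.sh to srun (one task per node), rank 0 writes the master URL to ${SLURM_JOBID}_spark_master in the submit directory, and the job script reads that file back for spark-submit. A minimal usage sketch, assuming the paths and module loaded above (the job ID 1234567 and hostname are illustrative, not from the source):

    # submit the pi example from the directory holding pi.py
    sbatch examples/pi.slurm

    # once the job is running, the master URL sits in the submit directory
    cat 1234567_spark_master                       # e.g. spark://node0123:7077

    # while the cluster is up, it can also serve hand-submitted applications
    spark-submit --master $(cat 1234567_spark_master) ./pi.py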
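The master/worker split in execute.sh works only because every rank derives the same master hostname from SLURM_NODELIST. A hedged illustration of that expansion (the hostnames are invented):

    # scontrol expands a compressed nodelist, one hostname per line
    $ scontrol show hostname host20[39-40]
    host2039
    host2040

    # rank 0 (SLURM_PROCID=0) starts the master on itself; every other rank
    # takes the first hostname (head -n 1) and starts a worker against
    # spark://host2039:7077, which is correct only if rank 0 really runs on
    # the first node of the list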