diff --git a/Dockerfile b/Dockerfile
index 0cffdb7..405501c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,136 +1,136 @@
FROM renku/singleuser:0.4.3-renku0.8.2
# Uncomment and adapt if code is to be included in the image
# COPY src /code/src
# Uncomment and adapt if your R or python packages require extra linux (ubuntu) software
# e.g. the following installs apt-utils and vim; each pkg on its own line, all lines
# except for the last end with backslash '\' to continue the RUN line
#
# USER root
# RUN apt-get update && \
# apt-get install -y --no-install-recommends \
# apt-utils \
# vim
# USER ${NB_USER}
USER root
# Install hdfs, spark client dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends openjdk-8-jre-headless && \
apt-get clean
# Prepare configuration files
ARG HADOOP_DEFAULT_FS_ARG="hdfs://iccluster044.iccluster.epfl.ch:8020"
ARG YARN_RM_HOSTNAME_ARG="iccluster044.iccluster.epfl.ch"
ARG LIVY_SERVER_ARG="http://iccluster044.iccluster.epfl.ch:8998/"
ENV HADOOP_DEFAULT_FS=${HADOOP_DEFAULT_FS_ARG}
ENV YARN_RM_HOSTNAME=${YARN_RM_HOSTNAME_ARG}
ENV YARN_RM_ADDRESS=${YARN_RM_HOSTNAME_ARG}:8050
ENV YARN_RM_SCHEDULER=${YARN_RM_HOSTNAME_ARG}:8030
ENV YARN_RM_TRACKER=${YARN_RM_HOSTNAME_ARG}:8025
ENV LIVY_SERVER_URL=${LIVY_SERVER_ARG}
ENV HADOOP_HOME=/usr/hdp/current/hadoop-3.1.0/
ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/
ENV SPARK_HOME=/usr/hdp/current/spark2-client/
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${SPARK_HOME}/python
ENV PYSPARK_PYTHON=/opt/conda/bin/python
# Install hdfs, spark packages
RUN mkdir -p /usr/hdp/current && \
cd /usr/hdp/current && \
# Hadoop MapReduce
wget -q https://archive.apache.org/dist/hadoop/core/hadoop-3.1.0/hadoop-3.1.0.tar.gz && \
tar --no-same-owner -xf hadoop-3.1.0.tar.gz && \
rm hadoop-3.1.0.tar.gz && \
# Spark
wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz && \
tar --no-same-owner -xf spark-2.4.5-bin-hadoop2.7.tgz && \
rm spark-2.4.5-bin-hadoop2.7.tgz && \
mv spark-2.4.5-bin-hadoop2.7 spark2-client && \
echo 'export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/' >> ${SPARK_HOME}/conf/spark-env.sh &&\
echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ${SPARK_HOME}/conf/spark-env.sh &&\
echo 'spark.master yarn' >> ${SPARK_HOME}/conf/spark-defaults.conf
# Configure Hadoop core-site.xml
RUN echo '\n\
\n\
\n\
\n\
fs.defaultFS\n\
'${HADOOP_DEFAULT_FS}'\n\
true\n\
\n\
' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/core-site.xml
# Configure Yarn yarn-site.xml
RUN echo '\n\
\n\
\n\
yarn.nodemanager.address\n\
0.0.0.0:45454\n\
\n\
\n\
yarn.nodemanager.bind-host\n\
0.0.0.0\n\
\n\
\n\
yarn.resourcemanager.hostname\n\
- '${YARN_RM_HOSTNAME}\n\
+ '${YARN_RM_HOSTNAME}'\n\
\n\
\n\
yarn.resourcemanager.address\n\
- '${YARN_RM_ADDRESS}\n\
+ '${YARN_RM_ADDRESS}'\n\
\n\
\n\
yarn.resourcemanager.resource-tracker.address\n\
- '${YARN_RM_TRACKER}\n\
+ '${YARN_RM_TRACKER}'\n\
\n\
\n\
yarn.resourcemanager.scheduler.address\n\
- '${YARN_RM_SCHEDULER}\n\
+ '${YARN_RM_SCHEDULER}'\n\
\n\
' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/yarn-site.xml
# Install sparkmagic
RUN pip install sparkmagic && \
# jupyter nbextension enable --py --sys-prefix widgetsnbextension && \
jupyter labextension install @jupyter-widgets/jupyterlab-manager && \
cd "$(pip show sparkmagic|sed -En 's/Location: (.*)$/\1/p')" && \
jupyter-kernelspec install sparkmagic/kernels/sparkkernel && \
jupyter-kernelspec install sparkmagic/kernels/sparkrkernel && \
jupyter-kernelspec install sparkmagic/kernels/pysparkkernel && \
jupyter serverextension enable --py sparkmagic
# Set user environment
USER ${NB_USER}
RUN echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ~/.bashrc && \
echo 'export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin' >> ~/.bashrc && \
mkdir -p ~/.sparkmagic/ && \
echo '{\n\
"kernel_python_credentials" : {\n\
"url": "'${LIVY_SERVER_URL}'"\n\
},\n\n\
"kernel_scala_credentials" : {\n\
"url": "'${LIVY_SERVER_URL}'"\n\
},\n\n\
"custom_headers" : {\n\
"X-Requested-By": "livy"\n\
},\n\n\
"heartbeat_refresh_seconds": 5,\n\
"livy_server_heartbeat_timeout_seconds": 60,\n\
"heartbeat_retry_seconds": 1\n\
}\n' > ~/.sparkmagic/config.json
# switch back to notebook user
USER ${NB_USER}
# install the python dependencies
COPY requirements.txt environment.yml /tmp/
RUN conda env update -q -f /tmp/environment.yml && \
/opt/conda/bin/pip install -r /tmp/requirements.txt && \
conda clean -y --all && \
conda env export -n "root"