diff --git a/Dockerfile b/Dockerfile
index 405501c..660a70d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,136 +1,141 @@
FROM renku/singleuser:0.4.3-renku0.8.2
# Uncomment and adapt if code is to be included in the image
# COPY src /code/src
# Uncomment and adapt if your R or python packages require extra linux (ubuntu) software
# e.g. the following installs apt-utils and vim; each pkg on its own line, all lines
# except for the last end with backslash '\' to continue the RUN line
#
# USER root
# RUN apt-get update && \
# apt-get install -y --no-install-recommends \
# apt-utils \
# vim
# USER ${NB_USER}
USER root
# Install hdfs, spark client dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends openjdk-8-jre-headless && \
apt-get clean
# Prepare configuration files
ARG HADOOP_DEFAULT_FS_ARG="hdfs://iccluster044.iccluster.epfl.ch:8020"
ARG YARN_RM_HOSTNAME_ARG="iccluster044.iccluster.epfl.ch"
ARG LIVY_SERVER_ARG="http://iccluster044.iccluster.epfl.ch:8998/"
ENV HADOOP_DEFAULT_FS=${HADOOP_DEFAULT_FS_ARG}
ENV YARN_RM_HOSTNAME=${YARN_RM_HOSTNAME_ARG}
ENV YARN_RM_ADDRESS=${YARN_RM_HOSTNAME_ARG}:8050
ENV YARN_RM_SCHEDULER=${YARN_RM_HOSTNAME_ARG}:8030
ENV YARN_RM_TRACKER=${YARN_RM_HOSTNAME_ARG}:8025
ENV LIVY_SERVER_URL=${LIVY_SERVER_ARG}
ENV HADOOP_HOME=/usr/hdp/current/hadoop-3.1.0/
ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/
-ENV SPARK_HOME=/usr/hdp/current/spark2-client/
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
-ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${SPARK_HOME}/python
-ENV PYSPARK_PYTHON=/opt/conda/bin/python
+#ENV SPARK_HOME=/usr/hdp/current/spark2-client/
+#ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${SPARK_HOME}/python
+#ENV PYSPARK_PYTHON=/opt/conda/bin/python
# Install hdfs, spark packages
RUN mkdir -p /usr/hdp/current && \
cd /usr/hdp/current && \
# Hadoop MapReduce
wget -q https://archive.apache.org/dist/hadoop/core/hadoop-3.1.0/hadoop-3.1.0.tar.gz && \
tar --no-same-owner -xf hadoop-3.1.0.tar.gz && \
rm hadoop-3.1.0.tar.gz && \
# Spark
- wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz && \
- tar --no-same-owner -xf spark-2.4.5-bin-hadoop2.7.tgz && \
- rm spark-2.4.5-bin-hadoop2.7.tgz && \
- mv spark-2.4.5-bin-hadoop2.7 spark2-client && \
- echo 'export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/' >> ${SPARK_HOME}/conf/spark-env.sh &&\
- echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ${SPARK_HOME}/conf/spark-env.sh &&\
- echo 'spark.master yarn' >> ${SPARK_HOME}/conf/spark-defaults.conf
+ #wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz && \
+ #tar --no-same-owner -xf spark-2.4.5-bin-hadoop2.7.tgz && \
+ #rm spark-2.4.5-bin-hadoop2.7.tgz && \
+ #mv spark-2.4.5-bin-hadoop2.7 spark2-client && \
+ #echo 'export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/' >> ${SPARK_HOME}/conf/spark-env.sh &&\
+ #echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ${SPARK_HOME}/conf/spark-env.sh &&\
+ #echo 'spark.master yarn' >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+ echo "Hadoop installed"
# Configure Hadoop core-site.xml
RUN echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>\n\
<configuration>\n\
    <property>\n\
        <name>fs.defaultFS</name>\n\
        <value>'${HADOOP_DEFAULT_FS}'</value>\n\
        <final>true</final>\n\
    </property>\n\
</configuration>\n' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/core-site.xml
# Configure Yarn yarn-site.xml
RUN echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<configuration>\n\
    <property>\n\
        <name>yarn.nodemanager.address</name>\n\
        <value>0.0.0.0:45454</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.nodemanager.bind-host</name>\n\
        <value>0.0.0.0</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.hostname</name>\n\
        <value>'${YARN_RM_HOSTNAME}'</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.address</name>\n\
        <value>'${YARN_RM_ADDRESS}'</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.resource-tracker.address</name>\n\
        <value>'${YARN_RM_TRACKER}'</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.scheduler.address</name>\n\
        <value>'${YARN_RM_SCHEDULER}'</value>\n\
    </property>\n\
</configuration>\n' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/yarn-site.xml
# Install sparkmagic
-RUN pip install sparkmagic && \
- # jupyter nbextension enable --py --sys-prefix widgetsnbextension && \
- jupyter labextension install @jupyter-widgets/jupyterlab-manager && \
+USER ${NB_USER}
+
+RUN /opt/conda/bin/pip install sparkmagic && \
+ echo "JUPYTERLAB_DIR=${JUPYTERLAB_DIR:-null}" && \
+ echo "JUPYTERLAB_SETTINGS_DIR=${JUPYTERLAB_SETTINGS_DIR:-null}" && \
+ echo "JUPYTERLAB_WORKSPACES_DIR=${JUPYTERLAB_WORKSPACES_DIR:-null}" && \
+ export JUPYTERLAB_DIR=/opt/conda/share/jupyter/lab && \
+ export JUPYTERLAB_SETTINGS_DIR=/home/jovyan/.jupyter/lab/user-settings && \
+ export JUPYTERLAB_WORKSPACES_DIR=/home/jovyan/.jupyter/lab/workspaces && \
+ # /opt/conda/bin/jupyter nbextension enable --py --sys-prefix widgetsnbextension && \
+ /opt/conda/bin/jupyter labextension install -y --log-level=INFO @jupyter-widgets/jupyterlab-manager && \
cd "$(pip show sparkmagic|sed -En 's/Location: (.*)$/\1/p')" && \
- jupyter-kernelspec install sparkmagic/kernels/sparkkernel && \
- jupyter-kernelspec install sparkmagic/kernels/sparkrkernel && \
- jupyter-kernelspec install sparkmagic/kernels/pysparkkernel && \
+ jupyter-kernelspec install sparkmagic/kernels/sparkkernel --user && \
+ jupyter-kernelspec install sparkmagic/kernels/sparkrkernel --user && \
+ jupyter-kernelspec install sparkmagic/kernels/pysparkkernel --user && \
jupyter serverextension enable --py sparkmagic
# Set user environment
-USER ${NB_USER}
RUN echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ~/.bashrc && \
- echo 'export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin' >> ~/.bashrc && \
+ # echo 'export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin' >> ~/.bashrc && \
mkdir -p ~/.sparkmagic/ && \
echo '{\n\
"kernel_python_credentials" : {\n\
"url": "'${LIVY_SERVER_URL}'"\n\
},\n\n\
"kernel_scala_credentials" : {\n\
"url": "'${LIVY_SERVER_URL}'"\n\
},\n\n\
"custom_headers" : {\n\
"X-Requested-By": "livy"\n\
},\n\n\
"heartbeat_refresh_seconds": 5,\n\
"livy_server_heartbeat_timeout_seconds": 60,\n\
"heartbeat_retry_seconds": 1\n\
}\n' > ~/.sparkmagic/config.json
-# switch back to notebook user
-USER ${NB_USER}
-
# install the python dependencies
COPY requirements.txt environment.yml /tmp/
RUN conda env update -q -f /tmp/environment.yml && \
/opt/conda/bin/pip install -r /tmp/requirements.txt && \
conda clean -y --all && \
conda env export -n "root"
diff --git a/README.md b/README.md
index 704a64d..01e63e3 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,36 @@
-# spak-docker
+# spark-docker
This is a Renku project - basically a git repository with some
bells and whistles. You'll find we have already created some
useful things like `data` and `notebooks` directories and
a `Dockerfile`.
## Working with the project
The simplest way to start your project is right from the Renku
platform - just click on the `Environments` tab and start a new session.
This will start an interactive environment right in your browser.
To work with the project anywhere outside the Renku platform,
click the `Settings` tab where you will find the
git repo URLs - use `git` to clone the project on whichever machine you want.
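For example, assuming the HTTPS URL shown on the `Settings` tab (the hostname and
namespace below are placeholders, not this project's real address):

```bash
git clone https://renkulab.io/gitlab/<your-namespace>/spark-docker.git
cd spark-docker
```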
### Changing interactive environment dependencies
Initially we install a very minimal set of packages to keep the images small.
However, you can add Python and conda packages in `requirements.txt` and
`environment.yml` to your heart's content. If you need more fine-grained
control over your environment, please see [the documentation](https://renku.readthedocs.io/en/latest/user/advanced_interfaces.html#dockerfile-modifications).
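For example, pip packages go one per line in `requirements.txt` (the package name
and version pin below are purely illustrative):

```
# requirements.txt -- one pip requirement per line
pandas==1.0.3
```

Conda packages are listed under the `dependencies:` key of `environment.yml` in the
same spirit.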
## Project configuration
Project options can be found in `.renku/renku.ini`. In this
project there is currently only one option, which specifies
the default type of environment to open, in this case `/lab` for
JupyterLab. You may also choose `/tree` to get to the "classic" Jupyter
interface.
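As a rough sketch, that option looks something like the following (check your own
`.renku/renku.ini` for the exact section and key names, which can vary between
Renku versions):

```ini
[renku "interactive"]
default_url = /lab
```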
## Moving forward
Once you feel at home with your project, we recommend that you replace
this README file with your own project documentation! Happy data wrangling!