diff --git a/Dockerfile b/Dockerfile
index 405501c..660a70d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,136 +1,141 @@
 FROM renku/singleuser:0.4.3-renku0.8.2

 # Uncomment and adapt if code is to be included in the image
 # COPY src /code/src

 # Uncomment and adapt if your R or python packages require extra linux (ubuntu) software
 # e.g. the following installs apt-utils and vim; each pkg on its own line, all lines
 # except for the last end with backslash '\' to continue the RUN line
 #
 # USER root
 # RUN apt-get update && \
 #    apt-get install -y --no-install-recommends \
 #    apt-utils \
 #    vim
 # USER ${NB_USER}

 USER root

 # Install hdfs, spark client dependencies
 RUN apt-get update && \
     apt-get install -y --no-install-recommends openjdk-8-jre-headless && \
     apt-get clean

 # Prepare configuration files
 ARG HADOOP_DEFAULT_FS_ARG="hdfs://iccluster044.iccluster.epfl.ch:8020"
 ARG YARN_RM_HOSTNAME_ARG="iccluster044.iccluster.epfl.ch"
 ARG LIVY_SERVER_ARG="http://iccluster044.iccluster.epfl.ch:8998/"

 ENV HADOOP_DEFAULT_FS=${HADOOP_DEFAULT_FS_ARG}
 ENV YARN_RM_HOSTNAME=${YARN_RM_HOSTNAME_ARG}
 ENV YARN_RM_ADDRESS=${YARN_RM_HOSTNAME_ARG}:8050
 ENV YARN_RM_SCHEDULER=${YARN_RM_HOSTNAME_ARG}:8030
 ENV YARN_RM_TRACKER=${YARN_RM_HOSTNAME_ARG}:8025
 ENV LIVY_SERVER_URL=${LIVY_SERVER_ARG}

 ENV HADOOP_HOME=/usr/hdp/current/hadoop-3.1.0/
 ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/
-ENV SPARK_HOME=/usr/hdp/current/spark2-client/
 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
-ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${SPARK_HOME}/python
-ENV PYSPARK_PYTHON=/opt/conda/bin/python
+#ENV SPARK_HOME=/usr/hdp/current/spark2-client/
+#ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${SPARK_HOME}/python
+#ENV PYSPARK_PYTHON=/opt/conda/bin/python

 # Install hdfs, spark packages
 RUN mkdir -p /usr/hdp/current && \
     cd /usr/hdp/current && \
     # Hadoop MapReduce
     wget -q https://archive.apache.org/dist/hadoop/core/hadoop-3.1.0/hadoop-3.1.0.tar.gz && \
     tar --no-same-owner -xf hadoop-3.1.0.tar.gz && \
     rm hadoop-3.1.0.tar.gz && \
     # Spark
-    wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz && \
-    tar --no-same-owner -xf spark-2.4.5-bin-hadoop2.7.tgz && \
-    rm spark-2.4.5-bin-hadoop2.7.tgz && \
-    mv spark-2.4.5-bin-hadoop2.7 spark2-client && \
-    echo 'export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/' >> ${SPARK_HOME}/conf/spark-env.sh &&\
-    echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ${SPARK_HOME}/conf/spark-env.sh &&\
-    echo 'spark.master yarn' >> ${SPARK_HOME}/conf/spark-defaults.conf
+    #wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz && \
+    #tar --no-same-owner -xf spark-2.4.5-bin-hadoop2.7.tgz && \
+    #rm spark-2.4.5-bin-hadoop2.7.tgz && \
+    #mv spark-2.4.5-bin-hadoop2.7 spark2-client && \
+    #echo 'export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/' >> ${SPARK_HOME}/conf/spark-env.sh &&\
+    #echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ${SPARK_HOME}/conf/spark-env.sh &&\
+    #echo 'spark.master yarn' >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "Hadoop installed"

 # Configure Hadoop core-site.xml
 RUN echo '<?xml version="1.0" encoding="UTF-8"?>\n\
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>\n\
 <configuration>\n\
   <property>\n\
     <name>fs.defaultFS</name>\n\
     <value>'${HADOOP_DEFAULT_FS}'</value>\n\
     <final>true</final>\n\
   </property>\n\
 </configuration>' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/core-site.xml

 # Configure Yarn yarn-site.xml
 RUN echo '<?xml version="1.0" encoding="UTF-8"?>\n\
 <configuration>\n\
   <property>\n\
     <name>yarn.nodemanager.address</name>\n\
     <value>0.0.0.0:45454</value>\n\
   </property>\n\
   <property>\n\
     <name>yarn.nodemanager.bind-host</name>\n\
     <value>0.0.0.0</value>\n\
   </property>\n\
   <property>\n\
     <name>yarn.resourcemanager.hostname</name>\n\
     <value>'${YARN_RM_HOSTNAME}'</value>\n\
   </property>\n\
   <property>\n\
     <name>yarn.resourcemanager.address</name>\n\
     <value>'${YARN_RM_ADDRESS}'</value>\n\
   </property>\n\
   <property>\n\
     <name>yarn.resourcemanager.resource-tracker.address</name>\n\
     <value>'${YARN_RM_TRACKER}'</value>\n\
   </property>\n\
   <property>\n\
     <name>yarn.resourcemanager.scheduler.address</name>\n\
     <value>'${YARN_RM_SCHEDULER}'</value>\n\
   </property>\n\
 </configuration>' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/yarn-site.xml

 # Install sparkmagic
-RUN pip install sparkmagic && \
-    # jupyter nbextension enable --py --sys-prefix widgetsnbextension && \
-    jupyter labextension install @jupyter-widgets/jupyterlab-manager && \
+USER ${NB_USER}
+
+RUN /opt/conda/bin/pip install sparkmagic && \
+    echo "JUPYTERLAB_DIR=${JUPYTERLAB_DIR:-null}" && \
+    echo "JUPYTERLAB_SETTINGS_DIR=${JUPYTERLAB_SETTINGS_DIR:-null}" && \
+    echo "JUPYTERLAB_WORKSPACES_DIR=${JUPYTERLAB_WORKSPACES_DIR:-null}" && \
+    export JUPYTERLAB_DIR=/opt/conda/share/jupyter/lab && \
+    export JUPYTERLAB_SETTINGS_DIR=/home/jovyan/.jupyter/lab/user-settings && \
+    export JUPYTERLAB_WORKSPACES_DIR=/home/jovyan/.jupyter/lab/workspaces && \
+    # /opt/conda/bin/jupyter nbextension enable --py --sys-prefix widgetsnbextension && \
+    /opt/conda/bin/jupyter labextension install -y --log-level=INFO @jupyter-widgets/jupyterlab-manager && \
     cd "$(pip show sparkmagic|sed -En 's/Location: (.*)$/\1/p')" && \
-    jupyter-kernelspec install sparkmagic/kernels/sparkkernel && \
-    jupyter-kernelspec install sparkmagic/kernels/sparkrkernel && \
-    jupyter-kernelspec install sparkmagic/kernels/pysparkkernel && \
+    jupyter-kernelspec install sparkmagic/kernels/sparkkernel --user && \
+    jupyter-kernelspec install sparkmagic/kernels/sparkrkernel --user && \
+    jupyter-kernelspec install sparkmagic/kernels/pysparkkernel --user && \
     jupyter serverextension enable --py sparkmagic

 # Set user environment
-USER ${NB_USER}
 RUN echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ~/.bashrc && \
-    echo 'export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin' >> ~/.bashrc && \
+    # echo 'export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin' >> ~/.bashrc && \
     mkdir -p ~/.sparkmagic/ && \
     echo '{\n\
 "kernel_python_credentials" : {\n\
   "url": "'${LIVY_SERVER_URL}'"\n\
 },\n\n\
 "kernel_scala_credentials" : {\n\
   "url": "'${LIVY_SERVER_URL}'"\n\
 },\n\n\
 "custom_headers" : {\n\
   "X-Requested-By": "livy"\n\
 },\n\n\
 "heartbeat_refresh_seconds": 5,\n\
 "livy_server_heartbeat_timeout_seconds": 60,\n\
 "heartbeat_retry_seconds": 1\n\
 }\n' > ~/.sparkmagic/config.json

-# switch back to notebook user
-USER ${NB_USER}
-
 # install the python dependencies
 COPY requirements.txt environment.yml /tmp/
 RUN conda env update -q -f /tmp/environment.yml && \
     /opt/conda/bin/pip install -r /tmp/requirements.txt && \
     conda clean -y --all && \
     conda env export -n "root"
diff --git a/README.md b/README.md
index 704a64d..01e63e3 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,36 @@
-# spak-docker
+# spark-docker

 This is a Renku project - basically a git repository with some bells and
 whistles. You'll find we have already created some useful things like
 `data` and `notebooks` directories and a `Dockerfile`.

 ## Working with the project

 The simplest way to start your project is right from the Renku platform -
 just click on the `Environments` tab and start a new session. This will
 start an interactive environment right in your browser.

 To work with the project anywhere outside the Renku platform, click the
 `Settings` tab where you will find the git repo URLs - use `git` to clone
 the project on whichever machine you want.

 ### Changing interactive environment dependencies

 Initially we install a very minimal set of packages to keep the images small.
 However, you can add python and conda packages in `requirements.txt` and
 `environment.yml` to your heart's content. If you need more fine-grained
 control over your environment, please see
 [the documentation](https://renku.readthedocs.io/en/latest/user/advanced_interfaces.html#dockerfile-modifications).

 ## Project configuration

 Project options can be found in `.renku/renku.ini`. In this project there is
 currently only one option, which specifies the default type of environment to
 open, in this case `/lab` for JupyterLab. You may also choose `/tree` to get
 to the "classic" Jupyter interface.

 ## Moving forward

 Once you feel at home with your project, we recommend that you replace this
 README file with your own project documentation! Happy data wrangling!
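
A minimal usage sketch, assuming the image is built manually with Docker (on Renku the image is normally built by the platform itself, so the exact invocation may differ): the Hadoop, YARN, and Livy endpoints declared as `ARG`s in the Dockerfile above can be overridden at build time. The hostname below is only a placeholder.

```sh
# Override the cluster endpoints declared as ARGs in the Dockerfile.
# "cluster.example.org" is a placeholder; substitute your own cluster hosts.
docker build \
  --build-arg HADOOP_DEFAULT_FS_ARG="hdfs://cluster.example.org:8020" \
  --build-arg YARN_RM_HOSTNAME_ARG="cluster.example.org" \
  --build-arg LIVY_SERVER_ARG="http://cluster.example.org:8998/" \
  -t spark-docker .
```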