FROM renku/renkulab:renku0.9.1-py3.7-0.5.2

# Uncomment and adapt if code is to be included in the image
# COPY src /code/src

# Uncomment and adapt if your R or python packages require extra linux (ubuntu)
# software, e.g. the following installs apt-utils and vim; each pkg goes on its
# own line, and every line except the last ends with a backslash '\' to
# continue the RUN instruction.
#
# USER root
# RUN apt-get update && \
#     apt-get install -y --no-install-recommends \
#     apt-utils \
#     vim
# USER ${NB_USER}

USER root

# Install dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends openjdk-8-jre-headless && \
    apt-get install -y --no-install-recommends libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit && \
    apt-get clean

# Prepare configuration files
ARG HADOOP_DEFAULT_FS_ARG="hdfs://iccluster044.iccluster.epfl.ch:8020"
ARG HIVE_JDBC_ARG="jdbc:hive2://iccluster059.iccluster.epfl.ch:2181,iccluster054.iccluster.epfl.ch:2181,iccluster044.iccluster.epfl.ch:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"
ARG YARN_RM_HOSTNAME_ARG="iccluster044.iccluster.epfl.ch"
ARG LIVY_SERVER_ARG="http://iccluster044.iccluster.epfl.ch:8998/"

ENV HDP_HOME=/usr/hdp/current
ENV HADOOP_DEFAULT_FS=${HADOOP_DEFAULT_FS_ARG}
ENV HADOOP_HOME=${HDP_HOME}/hadoop-3.1.0/
ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/
ENV HIVE_JDBC_URL=${HIVE_JDBC_ARG}
ENV HIVE_HOME=${HDP_HOME}/hive-3.1.0/
ENV YARN_RM_HOSTNAME=${YARN_RM_HOSTNAME_ARG}
ENV YARN_RM_ADDRESS=${YARN_RM_HOSTNAME_ARG}:8050
ENV YARN_RM_SCHEDULER=${YARN_RM_HOSTNAME_ARG}:8030
ENV YARN_RM_TRACKER=${YARN_RM_HOSTNAME_ARG}:8025
ENV LIVY_SERVER_URL=${LIVY_SERVER_ARG}
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/

# Install Hadoop 3.1.0
RUN mkdir -p ${HDP_HOME} && \
    cd ${HDP_HOME} && \
    wget -q https://archive.apache.org/dist/hadoop/core/hadoop-3.1.0/hadoop-3.1.0.tar.gz && \
    tar --no-same-owner -xf hadoop-3.1.0.tar.gz && \
    if [ ! -d ${HADOOP_HOME} ]; then mv hadoop-3.1.0 ${HADOOP_HOME}; fi && \
    mkdir -p ${HADOOP_CONF_DIR} && \
    rm hadoop-3.1.0.tar.gz

# Install Hive 3.1.0
RUN mkdir -p ${HDP_HOME} && \
    cd ${HDP_HOME} && \
    wget -q https://archive.apache.org/dist/hive/hive-3.1.0/apache-hive-3.1.0-bin.tar.gz && \
    tar --no-same-owner -xf apache-hive-3.1.0-bin.tar.gz && \
    if [ ! -d ${HIVE_HOME} ]; then mv apache-hive-3.1.0-bin ${HIVE_HOME}; fi && \
    mkdir -p ${HIVE_HOME}/conf && \
    rm apache-hive-3.1.0-bin.tar.gz

# Configure Hadoop core-site.xml
RUN echo '<?xml version="1.0" encoding="UTF-8"?>\n\
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>\n\
<configuration>\n\
    <property>\n\
        <name>fs.defaultFS</name>\n\
        <value>'${HADOOP_DEFAULT_FS}'</value>\n\
        <final>true</final>\n\
    </property>\n\
</configuration>\n' > ${HADOOP_CONF_DIR}/core-site.xml

# Configure Yarn yarn-site.xml
RUN echo '<?xml version="1.0"?>\n\
<configuration>\n\
    <property>\n\
        <name>yarn.nodemanager.address</name>\n\
        <value>'${YARN_RM_HOSTNAME_ARG}':45454</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.nodemanager.bind-host</name>\n\
        <value>'${YARN_RM_HOSTNAME_ARG}'</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.hostname</name>\n\
        <value>'${YARN_RM_HOSTNAME}'</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.address</name>\n\
        <value>'${YARN_RM_ADDRESS}'</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.resource-tracker.address</name>\n\
        <value>'${YARN_RM_TRACKER}'</value>\n\
    </property>\n\
    <property>\n\
        <name>yarn.resourcemanager.scheduler.address</name>\n\
        <value>'${YARN_RM_SCHEDULER}'</value>\n\
    </property>\n\
</configuration>\n' > ${HADOOP_CONF_DIR}/yarn-site.xml

# Configure Hive beeline-site.xml
RUN echo '<configuration>\n\
    <property>\n\
        <name>beeline.hs2.jdbc.url.container</name>\n\
        <value>'${HIVE_JDBC_URL}'</value>\n\
    </property>\n\
    <property>\n\
        <name>beeline.hs2.jdbc.url.default</name>\n\
        <value>container</value>\n\
    </property>\n\
</configuration>\n' > ${HIVE_HOME}/conf/beeline-site.xml
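
# Usage sketch (not executed at build time): Hive 3 beeline reads the file
# above, and because beeline.hs2.jdbc.url.default points at the "container"
# URL, a plain `beeline` invocation inside a running session resolves
# HiveServer2 through the ZooKeeper quorum in HIVE_JDBC_ARG without an
# explicit -u flag, e.g.:
#
#   beeline -e 'SHOW DATABASES;'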

# Renku-hack: modify entrypoint.sh so it sources ~/.renkurc before running its command
RUN if [ -e /entrypoint.sh ]; then \
        sed -i -Ee 's,^\$\@$,. ${HOME}/.renkurc || true\n\$\@,' /entrypoint.sh; \
    fi

USER ${NB_USER}

# Install sparkmagic
RUN /opt/conda/bin/pip install sparkmagic && \
    export JUPYTERLAB_DIR=/opt/conda/share/jupyter/lab && \
    export JUPYTERLAB_SETTINGS_DIR=/home/jovyan/.jupyter/lab/user-settings && \
    export JUPYTERLAB_WORKSPACES_DIR=/home/jovyan/.jupyter/lab/workspaces && \
    # /opt/conda/bin/jupyter nbextension enable --py --sys-prefix widgetsnbextension && \
    /opt/conda/bin/jupyter labextension install -y --log-level=INFO @jupyter-widgets/jupyterlab-manager && \
    cd "$(pip show sparkmagic | sed -En 's/Location: (.*)$/\1/p')" && \
    jupyter-kernelspec install sparkmagic/kernels/sparkkernel --user && \
    jupyter-kernelspec install sparkmagic/kernels/sparkrkernel --user && \
    jupyter-kernelspec install sparkmagic/kernels/pysparkkernel --user && \
    jupyter serverextension enable --py sparkmagic

# Install bash kernel
RUN /opt/conda/bin/pip install bash_kernel && \
    python -m bash_kernel.install

# Set user environment
# + https://github.com/jupyter-incubator/sparkmagic/blob/master/sparkmagic/example_config.json
RUN echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ~/.bashrc && \
    echo 'export PATH=${PATH}:${HADOOP_HOME}/bin' >> ~/.bashrc && \
    echo 'export PATH=${PATH}:${HIVE_HOME}/bin' >> ~/.bashrc && \
    mkdir -p ~/.sparkmagic/ && \
    echo '{\n\
    "kernel_python_credentials" : {\n\
        "url": "'${LIVY_SERVER_URL}'"\n\
    },\n\n\
    "kernel_scala_credentials" : {\n\
        "url": "'${LIVY_SERVER_URL}'"\n\
    },\n\n\
    "custom_headers" : {\n\
        "X-Requested-By": "livy"\n\
    },\n\n\
    "session_configs" : {\n\
        "driverMemory": "1000M",\n\
        "executorMemory": "4G",\n\
        "executorCores": 4,\n\
        "numExecutors": 10\n\
    },\n\
    "server_extension_default_kernel_name": "pysparkkernel",\n\
    "use_auto_viz": true,\n\
    "coerce_dataframe": true,\n\
    "max_results_sql": 1000,\n\
    "pyspark_dataframe_encoding": "utf-8",\n\
    "heartbeat_refresh_seconds": 5,\n\
    "livy_server_heartbeat_timeout_seconds": 60,\n\
    "heartbeat_retry_seconds": 1\n\
}\n' > ~/.sparkmagic/config.json && \
    mkdir -p ~/.beeline && \
    echo '<?xml version="1.0"?>\n\
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>\n\
<configuration>\n\
    <property>\n\
        <name>beeline.hs2.connection.user</name>\n\
        <value>JUPYTERHUB_USER</value>\n\
    </property>\n\
    <property>\n\
        <name>beeline.hs2.connection.password</name>\n\
        <value>SECRET</value>\n\
    </property>\n\
</configuration>\n' > ~/.beeline/beeline-hs2-connection.xml && \
    echo '#!/usr/bin/env bash\n\
sed -i -e "s,JUPYTERHUB_USER,${JUPYTERHUB_USER},g" ~/.beeline/beeline-hs2-connection.xml\n' > ~/.renkurc

# Install bokeh extensions
RUN /opt/conda/bin/jupyter labextension install -y --log-level=INFO @bokeh/jupyter_bokeh

# Install the python dependencies
COPY requirements.txt environment.yml /tmp/
RUN conda env update -q -f /tmp/environment.yml && \
    /opt/conda/bin/pip install -r /tmp/requirements.txt && \
    conda install hdfs3 -c conda-forge -y -q && \
    conda clean -y --all && \
    conda env export -n "root"
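
# Build sketch: the image tag and ARG values below are illustrative
# placeholders, not project defaults; the build-time ARGs declared above let
# you point the image at a different cluster, e.g.:
#
#   docker build . -t renkulab-bigdata \
#     --build-arg HADOOP_DEFAULT_FS_ARG="hdfs://namenode.example.org:8020" \
#     --build-arg YARN_RM_HOSTNAME_ARG="resourcemanager.example.org" \
#     --build-arg LIVY_SERVER_ARG="http://livy.example.org:8998/"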