diff --git a/Dockerfile b/Dockerfile
index 660a70d..9121aaf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,141 +1,187 @@
-FROM renku/singleuser:0.4.3-renku0.8.2
+FROM renku/renkulab:renku0.9.1-py3.7-0.5.2
 
 # Uncomment and adapt if code is to be included in the image
 # COPY src /code/src
 
 # Uncomment and adapt if your R or python packages require extra linux (ubuntu) software
 # e.g. the following installs apt-utils and vim; each pkg on its own line, all lines
 # except for the last end with backslash '\' to continue the RUN line
-#
+# 
 # USER root
 # RUN apt-get update && \
 #    apt-get install -y --no-install-recommends \
 #    apt-utils \
 #    vim
 # USER ${NB_USER}
 
 USER root
 
-# Install hdfs, spark client dependencies
+# Install dependencies
 RUN apt-get update && \
     apt-get install -y --no-install-recommends openjdk-8-jre-headless && \
+    apt-get install -y --no-install-recommends libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit && \
     apt-get clean
 
 # Prepare configuration files
 ARG HADOOP_DEFAULT_FS_ARG="hdfs://iccluster044.iccluster.epfl.ch:8020"
+ARG HIVE_JDBC_ARG="jdbc:hive2://iccluster059.iccluster.epfl.ch:2181,iccluster054.iccluster.epfl.ch:2181,iccluster044.iccluster.epfl.ch:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"
 ARG YARN_RM_HOSTNAME_ARG="iccluster044.iccluster.epfl.ch"
 ARG LIVY_SERVER_ARG="http://iccluster044.iccluster.epfl.ch:8998/"
 
+ENV HDP_HOME=/usr/hdp/current
 ENV HADOOP_DEFAULT_FS=${HADOOP_DEFAULT_FS_ARG}
+ENV HADOOP_HOME=${HDP_HOME}/hadoop-3.1.0/
+ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/
+ENV HIVE_JDBC_URL=${HIVE_JDBC_ARG}
+ENV HIVE_HOME=${HDP_HOME}/hive-3.1.0/
 ENV YARN_RM_HOSTNAME=${YARN_RM_HOSTNAME_ARG}
 ENV YARN_RM_ADDRESS=${YARN_RM_HOSTNAME_ARG}:8050
 ENV YARN_RM_SCHEDULER=${YARN_RM_HOSTNAME_ARG}:8030
 ENV YARN_RM_TRACKER=${YARN_RM_HOSTNAME_ARG}:8025
 ENV LIVY_SERVER_URL=${LIVY_SERVER_ARG}
-ENV HADOOP_HOME=/usr/hdp/current/hadoop-3.1.0/
-ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/
 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
-#ENV SPARK_HOME=/usr/hdp/current/spark2-client/
-#ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${SPARK_HOME}/python
-#ENV PYSPARK_PYTHON=/opt/conda/bin/python
-
-# Install hdfs, spark packages
-RUN mkdir -p /usr/hdp/current && \
-    cd /usr/hdp/current && \
-    # Hadoop MapReduce
+
+# Install hadoop 3.1.0
+RUN mkdir -p ${HDP_HOME} && \
+    mkdir -p ${HADOOP_CONF_DIR} && \
+    cd ${HDP_HOME} && \
     wget -q https://archive.apache.org/dist/hadoop/core/hadoop-3.1.0/hadoop-3.1.0.tar.gz && \
     tar --no-same-owner -xf hadoop-3.1.0.tar.gz && \
-    rm hadoop-3.1.0.tar.gz && \
-    # Spark
-    #wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz && \
-    #tar --no-same-owner -xf spark-2.4.5-bin-hadoop2.7.tgz && \
-    #rm spark-2.4.5-bin-hadoop2.7.tgz && \
-    #mv spark-2.4.5-bin-hadoop2.7 spark2-client && \
-    #echo 'export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop/' >> ${SPARK_HOME}/conf/spark-env.sh &&\
-    #echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ${SPARK_HOME}/conf/spark-env.sh &&\
-    #echo 'spark.master yarn' >> ${SPARK_HOME}/conf/spark-defaults.conf && \
-    echo "Hadoop installed"
+    if [ ! -d ${HADOOP_HOME} ]; then mv hadoop-3.1.0 ${HADOOP_HOME}; fi && \
+    rm hadoop-3.1.0.tar.gz
+
+# Install Hive 3.1.0
+RUN mkdir -p ${HDP_HOME} && \
+    mkdir -p ${HIVE_HOME}/conf && \
+    cd ${HDP_HOME} && \
+    wget -q https://archive.apache.org/dist/hive/hive-3.1.0/apache-hive-3.1.0-bin.tar.gz && \
+    tar --no-same-owner -xf apache-hive-3.1.0-bin.tar.gz && \
+    if [ ! -d ${HIVE_HOME} ]; then mv apache-hive-3.1.0-bin ${HIVE_HOME}; fi && \
+    rm apache-hive-3.1.0-bin.tar.gz
 
 # Configure Hadoop core-site.xml
 RUN echo '<?xml version="1.0" encoding="UTF-8"?>\n\
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>\n\
 <configuration>\n\
 <property>\n\
   <name>fs.defaultFS</name>\n\
   <value>'${HADOOP_DEFAULT_FS}'</value>\n\
   <final>true</final>\n\
 </property>\n\
-</configuration>' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/core-site.xml
+</configuration>\n' > ${HADOOP_CONF_DIR}/core-site.xml
 
 # Configure Yarn yarn-site.xml
 RUN echo '<?xml version="1.0" encoding="UTF-8"?>\n\
 <configuration>\n\
 <property>\n\
   <name>yarn.nodemanager.address</name>\n\
-  <value>0.0.0.0:45454</value>\n\
+  <value>'${YARN_RM_HOSTNAME_ARG}':45454</value>\n\
 </property>\n\
 <property>\n\
   <name>yarn.nodemanager.bind-host</name>\n\
-  <value>0.0.0.0</value>\n\
+  <value>'${YARN_RM_HOSTNAME_ARG}'</value>\n\
 </property>\n\
 <property>\n\
   <name>yarn.resourcemanager.hostname</name>\n\
   <value>'${YARN_RM_HOSTNAME}'</value>\n\
 </property>\n\
 <property>\n\
   <name>yarn.resourcemanager.address</name>\n\
   <value>'${YARN_RM_ADDRESS}'</value>\n\
 </property>\n\
 <property>\n\
   <name>yarn.resourcemanager.resource-tracker.address</name>\n\
   <value>'${YARN_RM_TRACKER}'</value>\n\
 </property>\n\
 <property>\n\
   <name>yarn.resourcemanager.scheduler.address</name>\n\
   <value>'${YARN_RM_SCHEDULER}'</value>\n\
 </property>\n\
-</configuration>' > /usr/hdp/current/hadoop-3.1.0/etc/hadoop/yarn-site.xml
+</configuration>\n' > ${HADOOP_CONF_DIR}/yarn-site.xml
+
+# Configure Hive beeline-site.xml
+RUN echo '<configuration>\n\
+<property>\n\
+  <name>beeline.hs2.jdbc.url.container</name>\n\
+  <value>'${HIVE_JDBC_URL}'</value>\n\
+</property>\n\
+<property>\n\
+  <name>beeline.hs2.jdbc.url.default</name>\n\
+  <value>container</value>\n\
+</property>\n\
+</configuration>\n' > ${HIVE_HOME}/conf/beeline-site.xml
+
+# Renku-hack modify entrypoint.sh
+RUN if [ -e /entrypoint.sh ]; then \
+      sed -i -Ee 's,^\$\@$,if \[\[ -x ~/.renkurc \]\];then . ~/.renkurc;fi\n\$\@,' /entrypoint.sh; \
+    fi
 
-# Install sparkmagic
 USER ${NB_USER}
+# Install sparkmagic
 RUN /opt/conda/bin/pip install sparkmagic && \
-    echo "JUPYTERLAB_DIR=${JUPYTERLAB_DIR:-null}" && \
-    echo "JUPYTERLAB_SETTINGS_DIR=${JUPYTERLAB_SETTINGS_DIR:-null}" && \
-    echo "JUPYTERLAB_WORKSPACES_DIR=${JUPYTERLAB_WORKSPACES_DIR:-null}" && \
     export JUPYTERLAB_DIR=/opt/conda/share/jupyter/lab && \
     export JUPYTERLAB_SETTINGS_DIR=/home/jovyan/.jupyter/lab/user-settings && \
     export JUPYTERLAB_WORKSPACES_DIR=/home/jovyan/.jupyter/lab/workspaces && \
     # /opt/conda/bin/jupyter nbextension enable --py --sys-prefix widgetsnbextension && \
     /opt/conda/bin/jupyter labextension install -y --log-level=INFO @jupyter-widgets/jupyterlab-manager && \
     cd "$(pip show sparkmagic|sed -En 's/Location: (.*)$/\1/p')" && \
     jupyter-kernelspec install sparkmagic/kernels/sparkkernel --user && \
     jupyter-kernelspec install sparkmagic/kernels/sparkrkernel --user && \
     jupyter-kernelspec install sparkmagic/kernels/pysparkkernel --user && \
     jupyter serverextension enable --py sparkmagic
 
 # Set user environment
+# + https://github.com/jupyter-incubator/sparkmagic/blob/master/sparkmagic/example_config.json
 RUN echo 'export HADOOP_USER_NAME=${JUPYTERHUB_USER}' >> ~/.bashrc && \
-    # echo 'export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin' >> ~/.bashrc && \
+    echo 'export PATH=${PATH}:${HADOOP_HOME}/bin' >> ~/.bashrc && \
+    echo 'export PATH=${PATH}:${HIVE_HOME}/bin' >> ~/.bashrc && \
     mkdir -p ~/.sparkmagic/ && \
     echo '{\n\
   "kernel_python_credentials" : {\n\
     "url": "'${LIVY_SERVER_URL}'"\n\
   },\n\n\
   "kernel_scala_credentials" : {\n\
     "url": "'${LIVY_SERVER_URL}'"\n\
   },\n\n\
   "custom_headers" : {\n\
     "X-Requested-By": "livy"\n\
   },\n\n\
+  "session_configs" : {\n\
+    "driverMemory": "1000M",\n\
+    "executorMemory": "4G",\n\
+    "executorCores": 4,\n\
+    "numExecutors": 10\n\
+  },\n\
+  "server_extension_default_kernel_name": "pysparkkernel",\n\
+  "use_auto_viz": true,\n\
+  "coerce_dataframe": true,\n\
+  "max_results_sql": 1000,\n\
+  "pyspark_dataframe_encoding": "utf-8",\n\
   "heartbeat_refresh_seconds": 5,\n\
   "livy_server_heartbeat_timeout_seconds": 60,\n\
   "heartbeat_retry_seconds": 1\n\
-}\n' > ~/.sparkmagic/config.json
+}\n' > ~/.sparkmagic/config.json && \
+    mkdir -p ~/.beeline && \
+    echo '<?xml version="1.0"?>\n\
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>\n\
+<configuration>\n\
+<property>\n\
+  <name>beeline.hs2.connection.user</name>\n\
+  <value>JUPYTERHUB_USER</value>\n\
+</property>\n\
+<property>\n\
+  <name>beeline.hs2.connection.password</name>\n\
+  <value>SECRET</value>\n\
+</property>\n\
+</configuration>\n' > ~/.beeline/beeline-hs2-connection.xml && \
+    echo '#!/usr/bin/env bash\n\
+sed -ie "s,JUPYTERHUB_USER,${JUPYTERHUB_USER},g" ~/.beeline/beeline-hs2-connection.xml\n' > ~/.renkurc
 
 # install the python dependencies
 COPY requirements.txt environment.yml /tmp/
 RUN conda env update -q -f /tmp/environment.yml && \
     /opt/conda/bin/pip install -r /tmp/requirements.txt && \
     conda clean -y --all && \
     conda env export -n "root"
+
diff --git a/requirements.txt b/requirements.txt
index e69de29..bcd60a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+matplotlib
+pyhive[hive]
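
A minimal usage sketch for the pyhive[hive] dependency added above: querying Hive from a notebook session built on this image. The direct HiveServer2 host and port below are assumptions (the image's beeline configuration resolves HiveServer2 through the ZooKeeper quorum in HIVE_JDBC_ARG instead), so adjust them to the actual cluster endpoint.

    # sketch only; host/port are assumed, not taken from the Dockerfile
    from pyhive import hive   # provided by pyhive[hive] in requirements.txt
    import os

    conn = hive.Connection(
        host="iccluster044.iccluster.epfl.ch",  # assumption: a node running HiveServer2
        port=10000,                             # assumption: default HiveServer2 thrift port
        username=os.environ.get("JUPYTERHUB_USER", "jovyan"),  # mirrors the beeline-hs2-connection.xml substitution
    )
    cursor = conn.cursor()
    cursor.execute("SHOW DATABASES")
    print(cursor.fetchall())

The SASL client libraries installed in the root stage (libsasl2-2, libsasl2-modules-gssapi-mit) are what allow this plain-SASL connection to authenticate with the JupyterHub username.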