-ARG ARG_WORKSPACE_BASE_IMAGE="mltooling/ml-workspace:latest"
+ARG ARG_WORKSPACE_BASE_IMAGE="mltooling/ml-workspace-r:latest"
 # Build from full flavor of workspace with same version
 FROM $ARG_WORKSPACE_BASE_IMAGE
 
@@ -24,14 +24,52 @@ RUN \
     # Cleanup
     clean-layer.sh
 
+# Install Hadoop
+RUN \
+    /bin/bash $RESOURCES_PATH/tools/hadoop-local-cluster.sh --install && \
+    # Cleanup
+    clean-layer.sh
+
+# Needs to be a separate ENV instruction, otherwise HADOOP_HOME does not exist yet for the block below
+ENV HADOOP_HOME="/opt/hadoop"
+
+ENV \
+    HADOOP_INSTALL=$HADOOP_HOME \
+    HADOOP_MAPRED_HOME=$HADOOP_HOME \
+    HADOOP_COMMON_HOME=$HADOOP_HOME \
+    HADOOP_HDFS_HOME=$HADOOP_HOME \
+    HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
+    # HADOOP_CLASSPATH=$HADOOP_HOME/share/hadoop/tools/lib/* \
+    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native \
+    HADOOP_OPTS="-Djava.library.path=$HADOOP_COMMON_LIB_NATIVE_DIR" \
+    HDFS_NAMENODE_USER=$NB_USER \
+    HDFS_DATANODE_USER=$NB_USER \
+    HDFS_SECONDARYNAMENODE_USER=$NB_USER \
+    YARN_HOME=$HADOOP_HOME \
+    YARN_RESOURCEMANAGER_USER=$NB_USER \
+    YARN_NODEMANAGER_USER=$NB_USER \
+    PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
+
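With $HADOOP_HOME/sbin and bin on the PATH and the HDFS_*/YARN_* user variables set, the single-node cluster can be brought up from a workspace terminal. A minimal sketch, assuming hadoop-local-cluster.sh installs a standard layout under /opt/hadoop (formatting the namenode is only needed on first use):

    # one-time: format the namenode, then start the HDFS and YARN daemons
    hdfs namenode -format -nonInteractive
    start-dfs.sh && start-yarn.sh
    # sanity check: running Java daemons and an (initially empty) HDFS root
    jps
    hdfs dfs -ls /
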
 # Install Spark
 RUN \
     /bin/bash $RESOURCES_PATH/tools/spark-local-cluster.sh --install && \
     # Cleanup
     clean-layer.sh
 
 # Configure Spark
-ENV SPARK_HOME=/opt/spark \
+ENV SPARK_HOME="/opt/spark"
+
+ENV \
+    # PYSPARK_DRIVER_PYTHON="jupyter"
+    # PYSPARK_DRIVER_PYTHON_OPTS='notebook'
+    # https://zeppelin.apache.org/docs/latest/interpreter/spark.html
+    # export SPARK_DIST_CLASSPATH=`hadoop classpath`
+    PYSPARK_PYTHON=$CONDA_ROOT/bin/python \
+    PYSPARK_DRIVER_PYTHON=$CONDA_ROOT/bin/python \
+    SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    # http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
+    PYTHONHASHSEED=0 \
+    PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH \
     PATH=$PATH:$SPARK_HOME/bin
 
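Because PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON point at the conda interpreter and PYTHONPATH exposes Spark's bundled py4j, PySpark should work both from a plain python process and from the pyspark shell on the PATH. A quick smoke test against a local master (the local[*] URL is only an example):

    pyspark --master 'local[*]'
    >>> spark.range(10).count()   # driver runs on $CONDA_ROOT/bin/python
    10
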
 # Install Zeppelin
@@ -40,45 +78,15 @@ RUN \
     # Cleanup
     clean-layer.sh
 
-### CONFIGURATION ###
+RUN \
+    # Install almond jupyter scala kernel: https://almond.sh/
+    # TODO: The installation in scala-utils does not seem to work currently
+    curl -Lo coursier https://git.io/coursier-cli && \
+    chmod +x coursier && \
+    ./coursier launch --fork almond -- --install --force && \
+    rm -f coursier
 
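coursier's launch --fork almond -- --install --force registers almond as a Jupyter kernelspec, so the result can be verified like any other kernel. A sketch; "scala" is almond's default kernel id, but the exact name and path depend on the almond version and the Jupyter data directory:

    jupyter kernelspec list
    # expected to include a line along the lines of:
    #   scala    /root/.local/share/jupyter/kernels/scala
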
-ENV \
-    PYSPARK_PYTHON="python" \
-    PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH \
-    SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
-    # http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
-    PYTHONHASHSEED=0
-
-# Todo: Add additional spark configuration:
-# https://spark.apache.org/docs/latest/configuration.html
-# https://zeppelin.apache.org/docs/latest/interpreter/spark.html
-
-# PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS / HADOOP_HOME / HADOOP_CLASSPATH / SPARK_DIST_CLASSPATH
-# export HADOOP_HOME=~/hadoop-2.7.0 export PATH=$HADOOP_HOME/bin:$PATH export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-# export HADOOP_CLASSPATH=$HADOOP_HOME/share/hadoop/tools/lib/*
-# export SPARK_DIST_CLASSPATH=`hadoop classpath`
-# export PYSPARK_DRIVER_PYTHON="jupyter"
-# export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
-# HADOOP_CONF_DIR=/usr/lib/hadoop
-
-# TODO start spark master?
-# https://medium.com/@marcovillarreal_40011/creating-a-spark-standalone-cluster-with-docker-and-docker-compose-ba9d743a157f
-# ENV SPARK_MASTER_PORT 7077
-# ENV SPARK_MASTER_WEBUI_PORT 8080
-# ENV SPARK_WORKER_WEBUI_PORT 8081
-# ENV SPARK_MASTER_LOG /spark/logs
-# ENV SPARK_WORKER_LOG /spark/logs
-# CMD ["/bin/bash", "/start-master.sh"]
-# export SPARK_MASTER_HOST=`hostname`
-# SPARK_WORKER_CORES=1
-# SPARK_WORKER_MEMORY=1G
-# SPARK_DRIVER_MEMORY=128m
-# SPARK_EXECUTOR_MEMORY=256m
-
-# TODO configure spark ui to be proxied with base path:
-# https://stackoverflow.com/questions/45971127/wrong-css-location-of-spark-application-ui
-# https://github.com/jupyterhub/jupyter-server-proxy/issues/57
-# https://github.com/yuvipanda/jupyter-sparkui-proxy/blob/master/jupyter_sparkui_proxy/__init__.py
+### CONFIGURATION ###
 
 # Add supervisor config to start zeppelin on port 8072
 COPY resources/zeppelin-service.conf /etc/supervisor/conf.d/
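
Once supervisord loads zeppelin-service.conf, the service can be checked from inside the container. A sketch, assuming the conf names the program "zeppelin" (the file's contents are not part of this diff):

    supervisorctl status zeppelin       # should report RUNNING
    curl -sI http://localhost:8072/     # Zeppelin UI on the configured port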