Spark on Jupyterでexternal jarを使う


  • jupyterのspark kernel toreeは内部でspark-submitを実行している。なので、そのオプションに外部jar設定を追加する。
  • kernel.jsonへの追記でも対応できるが、毎度書き換えるのは嫌なので、run.shと同じkernelディレクトリ配下のextlibディレクトリにjarを放り込めば使えるようにrun.shを変更する。
/usr/local/share/jupyter/kernels/toree/bin/run.sh
#!/usr/bin/env bash                                                                                                                                      

#                                                                                                                                                        
# Licensed to the Apache Software Foundation (ASF) under one or more                                                                                     
# contributor license agreements.  See the NOTICE file distributed with                                                                                  
# this work for additional information regarding copyright ownership.                                                                                    
# The ASF licenses this file to You under the Apache License, Version 2.0                                                                                
# (the "License"); you may not use this file except in compliance with                                                                                   
# the License.  You may obtain a copy of the License at                                                                                                  
#                                                                                                                                                        
#     http://www.apache.org/licenses/LICENSE-2.0                                                                                                         
#                                                                                                                                                        
# Unless required by applicable law or agreed to in writing, software                                                                                    
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                      
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.                                                                               
# See the License for the specific language governing permissions and                                                                                    
# limitations under the License                                                                                                                          
#                                                                                                                                                        
                                           ``
# Resolve the kernel installation root: the parent of the directory that
# contains this script. CDPATH is cleared for the cd so that a relative
# script path cannot make cd print an extra line into the captured output.
PROG_HOME="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"

# spark-submit is looked up under SPARK_HOME, so refuse to continue without it.
if [ -z "$SPARK_HOME" ]; then
  echo "SPARK_HOME must be set to the location of a Spark distribution!"
  exit 1
fi

echo "Starting Spark Kernel with SPARK_HOME=$SPARK_HOME"

# File name (not the full path) of the Toree assembly jar found in lib/.
# "cd ... &&" prevents ls from listing the caller's working directory when
# lib/ does not exist; the quotes keep install paths with spaces intact.
KERNEL_ASSEMBLY="$(cd "${PROG_HOME}/lib" && ls -1 toree-kernel-assembly-*.jar)"

# Without an assembly jar the launch command would point at the bare lib/
# directory, so stop here with an explicit message instead.
if [ -z "$KERNEL_ASSEMBLY" ]; then
  echo "No toree-kernel-assembly-*.jar found in ${PROG_HOME}/lib"
  exit 1
fi

# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0

# added to handle extjars.
# Every *.jar found in ${PROG_HOME}/extlib is passed to spark-submit through a
# single comma-separated --jars option, so jars can be added by dropping them
# into that directory.
extjars=""
for jar in "${PROG_HOME}"/extlib/*.jar; do
  # An unmatched glob is left as the literal pattern; skip it so that an empty
  # or missing extlib directory contributes nothing.
  [ -e "$jar" ] || continue
  # Prepend a comma only when the list already holds an entry.
  extjars="${extjars:+${extjars},}${jar}"
done
# Start a fresh debug log containing the collected jar list.
printf '%s\n' "$extjars" >/tmp/toree.log

# Add --jars only when at least one jar was found: a bare "--jars" would
# consume the following "--class" as its value and break the launch.
if [ -n "$extjars" ]; then
  SPARK_OPTS="$SPARK_OPTS --jars $extjars"
fi

# Record the command line that is about to be executed.
echo "${SPARK_HOME}/bin/spark-submit ${SPARK_OPTS} --class org.apache.toree.Main $PROG_HOME/lib/${KERNEL_ASSEMBLY}" >> /tmp/toree.log

# SPARK_OPTS is deliberately left unquoted: it is a whitespace-separated option
# string that has to split into individual arguments. The script's own
# arguments are forwarded unchanged after the application jar.
exec "$SPARK_HOME"/bin/spark-submit \
    ${SPARK_OPTS} \
    --class org.apache.toree.Main "$PROG_HOME/lib/${KERNEL_ASSEMBLY}" "$@"