Step 0: Prerequisites
- Existing Java, Hadoop, and Hive installations
- Find a Spark version that is compatible with your Hive release as its execution engine; refer to the Hive and Spark compatibility matrix (documented on the Hive on Spark wiki) to pick a compatible pair.
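Before proceeding, it helps to confirm the existing installations and their versions (a quick sanity check, assuming java, hadoop, and hive are already on the PATH):
java -version
hadoop version
hive --version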
Step 1: Configure Environment Variables
#JAVA Related Options
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export PATH=$PATH:$JAVA_HOME/bin
#Hadoop Related Options
export HADOOP_HOME=/home/hdoop/hadoop
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
#HIVE Related Options
export HIVE_HOME=/home/hdoop/hive
export PATH=$PATH:$HIVE_HOME/bin
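These exports can live in the shell startup file of the user that runs Hadoop and Hive (assumed here to be ~/.bashrc of the hdoop user). Reload the file so the variables take effect in the current session:
source ~/.bashrc
echo $HADOOP_HOME $HIVE_HOME   #quick check that the variables are set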
Step 2: Download Spark 2.4.8 (the "without Hadoop" build)
wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-without-hadoop.tgz
tar xvf spark-*.tgz
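Optionally, verify the download against the .sha512 checksum that the Apache archive publishes next to the tarball (the checksum file's layout varies between releases, so compare the digests by eye if sha512sum -c rejects the format):
wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-without-hadoop.tgz.sha512
sha512sum spark-2.4.8-bin-without-hadoop.tgz
cat spark-2.4.8-bin-without-hadoop.tgz.sha512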
Step 3: Add the Spark Dependencies to Hive
Create symbolic links in $HIVE_HOME/lib to the following JARs, each pointing to the corresponding JAR in spark-2.4.8-bin-without-hadoop/jars. Execute the commands below to create the links.
cd $HIVE_HOME/lib
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/scala-library*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/spark-core*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/spark-network-common*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/spark-network-shuffle*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/jersey-server*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/jersey-container-servlet-core*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/jackson-module*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/chill*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/json4s-ast*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/kryo-shaded*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/minlog*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/scala-xml*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/spark-launcher*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/spark-unsafe*.jar
ln -s /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/xbean-asm*.jar
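After creating the links, confirm that none of them are dangling; a broken link usually means a wildcard did not match any JAR in the Spark distribution (find -xtype l is a GNU find feature):
ls -l $HIVE_HOME/lib | grep ' -> '   #list the symlinks and their targets
find $HIVE_HOME/lib -xtype l         #prints any broken symlinks; expect no output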
Step 4: Configure Spark to Access the Hadoop Classpath
Edit spark-env.sh (create it from spark-env.sh.template if it does not exist) and add the following configuration.
nano /home/hdoop/spark-2.4.8-bin-without-hadoop/conf/spark-env.sh
#Add the below lines:
export SPARK_DIST_CLASSPATH=$(/home/hdoop/hadoop/bin/hadoop classpath)
#Spark related options
export SPARK_HOME=/home/hdoop/spark-2.4.8-bin-without-hadoop
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
export PYSPARK_PYTHON=/usr/bin/python3
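A quick way to confirm that this without-hadoop build now finds the Hadoop classes is to ask Spark for its version; if SPARK_DIST_CLASSPATH is not being picked up, this typically fails with a NoClassDefFoundError for Hadoop or SLF4J classes:
/home/hdoop/spark-2.4.8-bin-without-hadoop/bin/spark-submit --version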
Step 5: Configure Hive to Access the Spark JARs
Edit hive-env.sh (create it from hive-env.sh.template if it does not exist):
nano /home/hdoop/hive/conf/hive-env.sh
#Add the below lines
export HADOOP_HOME=/home/hdoop/hadoop
# Hive Configuration Directory can be controlled by:
export HIVE_CONF_DIR=$HIVE_HOME/conf
export SPARK_HOME=/home/hdoop/spark-2.4.8-bin-without-hadoop
#Build a colon-separated list of all Spark JARs and expose it to Hive
SPARK_JARS=""
for jar in $SPARK_HOME/jars/*.jar; do
  SPARK_JARS=$SPARK_JARS:$jar
done
export HIVE_AUX_JARS_PATH=$SPARK_JARS
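To sanity-check the loop, source the file in a throwaway shell and inspect the resulting list (the exact JAR names will vary with the Spark build):
bash -c 'source /home/hdoop/hive/conf/hive-env.sh; echo "$HIVE_AUX_JARS_PATH" | tr ":" "\n" | head'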
Step 6: Configure Hive to Use the Spark Engine in YARN Mode
Add the following entries to hive-site.xml, inside the <configuration> element.
nano /home/hdoop/hive/conf/hive-site.xml
# Add the following lines
<property>
<name>hive.execution.engine</name>
<value>spark</value>
</property>
<property>
<name>spark.master</name>
<value>yarn</value>
</property>
<property>
<name>spark.eventLog.enabled</name>
<value>true</value>
</property>
<property>
<name>spark.eventLog.dir</name>
<value>/tmp</value>
</property>
<property>
<name>spark.driver.memory</name>
<value>2g</value>
</property>
<property>
<name>spark.executor.memory</name>
<value>2g</value>
</property>
<property>
<name>spark.serializer</name>
<value>org.apache.spark.serializer.KryoSerializer</value>
</property>
<property>
<name>spark.yarn.jars</name>
<value>hdfs://127.0.0.1:9000/spark/jars/*</value>
<!-- <value>hdfs:///spark/jars/*.jar</value> -->
</property>
<property>
<name>spark.submit.deployMode</name>
<value>client</value>
<!-- <value>cluster</value> -->
</property>
<!--
<property>
<name>spark.yarn.queue</name>
<value>default</value>
</property>
-->
<property>
<name>hive.spark.job.monitor.timeout</name>
<value>600</value>
</property>
<property>
<name>hive.server2.enable.doAs</name>
<value>true</value>
</property>
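A single malformed tag in hive-site.xml will prevent Hive from starting, so it is worth checking that the file is still well-formed XML (assuming xmllint from libxml2 is available):
xmllint --noout /home/hdoop/hive/conf/hive-site.xml   #no output means the XML parses cleanly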
Step 7: Copy the Spark JARs to hdfs:///spark/jars/
Copy all the JARs from /home/hdoop/spark-2.4.8-bin-without-hadoop/jars to hdfs:///spark/jars/ (an HDFS path). In the previous step, spark.yarn.jars was pointed at hdfs:///spark/jars/ in hive-site.xml; YARN will look for the Spark JARs at this HDFS location.
hdfs dfs -mkdir -p /spark/jars/
hdfs dfs -put /home/hdoop/spark-2.4.8-bin-without-hadoop/jars/* /spark/jars/
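Comparing a count of the uploaded files with the local directory confirms that the copy completed (the exact number depends on the Spark build):
ls /home/hdoop/spark-2.4.8-bin-without-hadoop/jars | wc -l
hdfs dfs -ls /spark/jars/ | grep -c 'jar$'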
Step 8: Restart Hadoop and Hive Services
cd /home/hdoop/hadoop/sbin
./stop-all.sh
./start-all.sh
jps
hive
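As a final smoke test, confirm that the Spark engine is active and run a query that actually launches a Spark job on YARN; the same statements can be typed at the interactive prompt opened above (your_table is a placeholder for any existing table):
hive -e "SET hive.execution.engine;"   #should print hive.execution.engine=spark
hive -e "SELECT COUNT(*) FROM your_table;"   #replace your_table with an existing table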