Hive API
Create a Hive Table for Read and Write
CREATE EXTERNAL TABLE IF NOT EXISTS pcatalog.pc_sampleHiveTable
(
id string,
name string,
rev string
)
STORED AS parquet
LOCATION '/tmp/pcatalog/pc_sampleHiveTable';
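The DDL above can be run from the Hive CLI or Beeline. If you prefer to issue it from a Spark shell instead, here is a minimal sketch, assuming a Hive-enabled SparkSession named sparkSession (see the setup sketch in the next section):
// Create the pcatalog database first if it does not already exist
sparkSession.sql("CREATE DATABASE IF NOT EXISTS pcatalog")
// Same DDL as above, issued through SparkSQL
sparkSession.sql(
  """CREATE EXTERNAL TABLE IF NOT EXISTS pcatalog.pc_sampleHiveTable
    |(
    |id string,
    |name string,
    |rev string
    |)
    |STORED AS parquet
    |LOCATION '/tmp/pcatalog/pc_sampleHiveTable'""".stripMargin)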
Common Imports for All Hive API Usages
import com.paypal.gimel._
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.sql._
import spray.json._
import spray.json.DefaultJsonProtocol._
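The snippets that follow assume an active SparkSession named sparkSession with Hive support enabled. In spark-shell one is created for you; outside the shell, a minimal sketch (the app name is illustrative):
import org.apache.spark.sql.SparkSession

// SparkSession with Hive support so Hive tables can be resolved
val sparkSession: SparkSession = SparkSession
  .builder()
  .appName("GimelHiveExample")
  .enableHiveSupport()
  .getOrCreate()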
Hive Write API
Prepare Test Data for the Hive Write
// Generate JSON test records whose fields match the table columns: id, name, rev
def stringed(n: Int) = s"""{"id": "${n}", "name": "MAC-${n}", "rev": "${n * 10000}"}"""
val numberOfRows = 10000
val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) }.toSeq
val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts)
val dataFrameToWrite: DataFrame = sparkSession.read.json(rdd)
dataFrameToWrite.show
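Before writing, it helps to confirm that the generated DataFrame's columns line up with the target table (id, name, rev); a quick check:
// Inspect the inferred schema; with the JSON generator above, all three fields come in as strings
dataFrameToWrite.printSchema()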
Write Data to Hive
//Initialize the Gimel DataSet API with the active SparkSession
val dataSet = DataSet(sparkSession)
//write - dataset-specific options can be passed as an optional third argument; none are needed for a basic Hive write
val dataFrameWritten = dataSet.write("pcatalog.pc_sampleHiveTable", dataFrameToWrite)
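One way to sanity-check the write is to query the table directly through SparkSQL; the count should match numberOfRows above:
// Cross-check the written data with a plain SparkSQL query
sparkSession.sql("SELECT COUNT(*) AS row_count FROM pcatalog.pc_sampleHiveTable").show()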
Hive Read API
Read Data from Hive
val dataFrameRead = dataSet.read("pcatalog.pc_sampleHiveTable")
dataFrameRead.show
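The returned DataFrame is a regular Spark DataFrame, so standard transformations and SparkSQL apply; for example (the view name here is illustrative):
// Register the read result as a temp view and query it with SparkSQL
dataFrameRead.createOrReplaceTempView("pc_sample_hive_read")
sparkSession.sql("SELECT id, name, rev FROM pc_sample_hive_read LIMIT 10").show()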