Contents

  • Spark SQL reading from MySQL
  • Spark SQL reading from Hive

Spark SQL reading from MySQL

Add the MySQL JDBC driver dependency in pom.xml:

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.27</version>
</dependency>
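The code below also assumes Spark SQL itself is already on the classpath. If it is not, a dependency roughly like the following can be added as well (version 2.1.1 with Scala 2.11 is an assumption, chosen to match the spark-hive dependency used later in this post):

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.1</version>
</dependency>

Read/write example: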
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.{DataFrame, Dataset, SaveMode, SparkSession}

object SparkSQL08_IO_MySQL {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQL08_IO_MySQL")
    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._
    // Read data from MySQL
    val df: DataFrame = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://hadoop102:3306/testSpark") // testSpark is the database name
      .option("driver", "com.mysql.jdbc.Driver")
      .option("user", "root")
      .option("password", "000000")
      .option("dbtable", "student") // student is the table name
      .load()
    df.show()
    // Write data to MySQL
    val rdd: RDD[User07] = spark.sparkContext.makeRDD(List(User07("x", 20), User07("y", 18)))
    // Convert the RDD to a Dataset
    val ds: Dataset[User07] = rdd.toDS()
    ds.write.format("jdbc")
      .option("url", "jdbc:mysql://hadoop102:3306/testSpark")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("user", "root")
      .option("password", "000000")
      .option(JDBCOptions.JDBC_TABLE_NAME, "student")
      .mode(SaveMode.Append)
      .save()
    // Release resources
    spark.stop()
  }
}

case class User07(name: String, age: Int)
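By default the read above pulls the whole table through a single JDBC connection. A minimal sketch of a partitioned read (reusing the spark session from the example above; the id column and the 1..10000 bounds are assumptions, not part of the original schema):

// Partitioned JDBC read: Spark splits the id range into numPartitions slices
// and opens one connection per slice (column name and bounds are assumed).
val partitionedDF: DataFrame = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://hadoop102:3306/testSpark")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("user", "root")
  .option("password", "000000")
  .option("dbtable", "student")
  .option("partitionColumn", "id")
  .option("lowerBound", "1")
  .option("upperBound", "10000")
  .option("numPartitions", "4")
  .load()

Note that lowerBound and upperBound only control how the range is split across partitions; rows outside the range are still read.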

Spark SQL reading from Hive

Configuration in IDEA

  1. Add the dependencies in pom.xml:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>1.2.1</version>
</dependency>
  2. Copy hive-site.xml into the resources directory

After adding the dependencies, it is best to delete the target directory in IDEA and rebuild the project, so that hive-site.xml is guaranteed to end up under the classes directory.
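A quick way to confirm that hive-site.xml really made it onto the classpath is to resolve it from the application itself (just a small sketch, not part of the original setup; drop it into main()):

// Prints the resolved URL of hive-site.xml, or null if it is not on the classpath
println(getClass.getClassLoader.getResource("hive-site.xml"))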

My Hive metastore is deployed in remote mode (see also: the three ways of configuring the Hive metastore). The contents of hive-site.xml are as follows:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <!-- The four settings for connecting to the MySQL database that stores the Hive metadata -->
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://hadoop102:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false&amp;useUnicode=true&amp;characterEncoding=UTF-8</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>000000</value>
    </property>

    <!-- Host that HiveServer2 binds to -->
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>hadoop102</value>
    </property>

    <!-- Metastore service address for the remote-mode deployment -->
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hadoop102:9083</value>
    </property>

    <!-- Disable authorization for the metastore event notification API -->
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>

    <!-- Disable metastore schema version verification -->
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>

    <!-- HDFS location of the Hive warehouse (table data) -->
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>

    <!-- Enable automatic local mode -->
    <property>
        <name>hive.exec.mode.local.auto</name>
        <value>true</value>
        <description>Let Hive determine whether to run in local mode automatically</description>
    </property>

</configuration>
With the configuration in place, Hive can be queried from code:

package com.xcu.bigdata.spark.sql

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/**
 * @note : Besides the dependencies, hive-site.xml must also be copied into the resources directory
 */
object SparkSQL09_IO_Hive {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQL09_IO_Hive")
    // Create a SparkSession with Hive support enabled
    val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate()
    // Select which database to use
    spark.sql("use default")
    // List the tables in that database
    spark.sql("show tables").show()
    // Query a specific table
    spark.sql(
      """
        |select
        |  *
        |from
        |  demo02
        |""".stripMargin).show()
    // Release resources
    spark.stop()
  }
}
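Writing back to Hive goes through the same SparkSession. A minimal sketch using saveAsTable, reusing the spark session above (the target table name demo02_copy is an assumption for illustration):

import org.apache.spark.sql.{DataFrame, SaveMode}

// Read an existing Hive table and save the result as a managed Hive table
// (demo02_copy is a hypothetical table name).
val result: DataFrame = spark.sql("select * from demo02")
result.write
  .mode(SaveMode.Overwrite) // replace the table if it already exists
  .saveAsTable("default.demo02_copy")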


Note: if HDFS cannot be reached, also copy core-site.xml and hdfs-site.xml from the cluster into the IDEA project's resources directory, keeping the same file names (this usually happens because the Hadoop configuration environment variables have not been set globally).
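Another workaround is to point Spark at HDFS explicitly through spark.hadoop.* properties, which Spark forwards into the Hadoop configuration. A sketch only, and the NameNode address hadoop102:8020 is an assumption; adjust it to your cluster:

import org.apache.spark.SparkConf

// spark.hadoop.fs.defaultFS is forwarded to Hadoop's Configuration as fs.defaultFS
// (the NameNode host/port below is assumed)
val conf: SparkConf = new SparkConf()
  .setMaster("local[*]")
  .setAppName("SparkSQL09_IO_Hive")
  .set("spark.hadoop.fs.defaultFS", "hdfs://hadoop102:8020")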