1. Data preparation: 4 tables
- Student table (student): (sid, sname, ssex, sbirth, sclazz)
student id, name, sex, birthday, class
1 孙福龙 m 1989-03-09 1
2 王婷婷 w 1992-02-24 1
3 王丹丹 w 1989-05-20 2
4 孙尚文 m 2020-12-20 2
5 孙静雅 w 2022-02-20 1
6 王珞丹 w 1986-07-26 1
7 张三 w 1975-01-01 3
8 李四 m 1979-06-06 3

- Course table (course): (cid, cname, tid)
course id, course name, teacher id
1 J2SE t003
2 Java Web t002
3 SSH t001
4 Oracle t001

- Score table (score): (sid, cid, score)
student id, course id, score
1 1 56
1 2 78
1 3 67
1 4 58
2 1 79
2 2 81
2 3 92
2 4 68
3 1 91
3 2 47
3 3 88
3 4 56
4 2 88
4 3 90
4 4 93
5 1 46
5 3 78
5 4 53
6 1 35
6 2 68
6 4 71
7 1 66
7 2 88
7 3 77
7 4 98
8 1 49
8 2 11
8 3 22
8 4 38

- Teacher table (teacher): (tid, tname, tsex, tbirth, tlevel, tdepart)
teacher id, name, sex, birth date, title, department
1 叶平 m 1987-05-20 level01 computer
2 贺高 m 1985-06-12 level01 computer
3 杨艳 w 1982-03-16 level02 programer
4 周磊 w 1975-11-17 level03 find

2. Upload the data to S3:
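A minimal sketch, assuming the four tab-separated files sit in the working directory; it uses the Hadoop FileSystem API with the same fs.s3a.* settings the Spark job reads:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object UploadToS3 {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    conf.set("fs.s3a.access.key", "YOUR_ACCESS_KEY") // placeholder credentials
    conf.set("fs.s3a.secret.key", "YOUR_SECRET_KEY")
    conf.set("fs.s3a.endpoint", "YOUR_ENDPOINT")
    val fs = FileSystem.get(new URI("s3a://lifecyclebigdata"), conf)
    // Local file names are assumptions; the target path matches the reads below
    Seq("student.csv", "course.csv", "score.csv", "teacher.csv").foreach { f =>
      fs.copyFromLocalFile(new Path(f), new Path(s"s3a://lifecyclebigdata/test/data03/$f"))
    }
    fs.close()
  }
}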

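The job below loads its credentials from conf/s3.properties on the classpath; a minimal sketch of that file, with placeholder values for the keys the code reads:

fs.s3a.access.key=YOUR_ACCESS_KEY
fs.s3a.secret.key=YOUR_SECRET_KEY
fs.s3a.endpoint=YOUR_ENDPOINT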
3. Note the date conversion: java.sql.Date in SQL
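Spark SQL's DateType expects java.sql.Date in a Row, not the java.util.Date that SimpleDateFormat.parse returns, so the parsed value is rewrapped through its epoch milliseconds. The conversion in isolation:

import java.text.SimpleDateFormat
import java.sql.Date

val format = new SimpleDateFormat("yyyy-MM-dd")
// SimpleDateFormat.parse returns java.util.Date; getTime gives epoch millis
val sqlDate: Date = new Date(format.parse("1989-03-09").getTime)

The full program: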
package com.SparkSqlTest
import java.io.FileInputStream
import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import java.sql.Date
object Demo01 {
def main(args: Array[String]): Unit = {
// 1. Build the Spark session
val spark = SparkSession.builder()
.master("local[2]")
.config("spark.eventLog.enabled", "false")
.config("spark.driver.memory", "2g")
.config("spark.executor.memory", "2g")
.appName("SparkDemoFromS3")
.getOrCreate()
// 2. Set the log level
spark.sparkContext.setLogLevel("WARN")
// 3. Load the resource file with the S3 credentials
val properties = new Properties()
val path = Thread.currentThread().getContextClassLoader.getResource("conf/s3.properties").getPath
properties.load(new FileInputStream(path))
// 4. Configure the S3 data source
val sc: SparkContext = spark.sparkContext
sc.hadoopConfiguration.set("fs.s3a.access.key", properties.getProperty("fs.s3a.access.key"))
sc.hadoopConfiguration.set("fs.s3a.secret.key", properties.getProperty("fs.s3a.secret.key"))
sc.hadoopConfiguration.set("fs.s3a.endpoint", properties.getProperty("fs.s3a.endpoint"))
// 5. Implicit conversions and a date parser
import spark.implicits._
val format: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
// 6. Build the student table (read file, row RDD, schema)
val rddStudent: RDD[String] = spark.sparkContext.textFile("s3a://lifecyclebigdata/test/data03/student.csv")
val rddRowStudent: RDD[Row] = rddStudent.map(line => {
val arr: Array[String] = line.split("\t")
Row(arr(0).toInt, arr(1), arr(2), new Date(format.parse(arr(3)).getTime), arr(4).toInt)
})
val fieldsStudent: Array[StructField] = Array(
StructField("id", IntegerType, true),
StructField("name", StringType, true),
StructField("sex", StringType, true),
StructField("birth", DateType, true),
StructField("clazz", IntegerType, true)
)
val schemaStudent: StructType = StructType(fieldsStudent)
val dfStudent: DataFrame = spark.createDataFrame(rddRowStudent,schemaStudent)
// 7. Register the temp view (student)
dfStudent.createOrReplaceTempView("student")
spark.sql("select * from student").show()
spark.close()
}
}

4. Create all 4 tables
- Code:
package com.SparkSqlTest
import java.io.FileInputStream
import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import java.sql.Date
object Demo01 {
def main(args: Array[String]): Unit = {
// 1. Build the Spark session
val spark = SparkSession.builder()
.master("local[2]")
.config("spark.eventLog.enabled", "false")
.config("spark.driver.memory", "2g")
.config("spark.executor.memory", "2g")
.appName("SparkDemoFromS3")
.getOrCreate()
// 2. Set the log level
spark.sparkContext.setLogLevel("WARN")
// 3. Load the resource file with the S3 credentials
val properties = new Properties()
val path = Thread.currentThread().getContextClassLoader.getResource("conf/s3.properties").getPath
properties.load(new FileInputStream(path))
// 4. Configure the S3 data source
val sc: SparkContext = spark.sparkContext
sc.hadoopConfiguration.set("fs.s3a.access.key", properties.getProperty("fs.s3a.access.key"))
sc.hadoopConfiguration.set("fs.s3a.secret.key", properties.getProperty("fs.s3a.secret.key"))
sc.hadoopConfiguration.set("fs.s3a.endpoint", properties.getProperty("fs.s3a.endpoint"))
// 5. Implicit conversions and a date parser
import spark.implicits._
val format: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
// 6. Build the student table (read file, row RDD, schema)
val rddStudent: RDD[String] = spark.sparkContext.textFile("s3a://lifecyclebigdata/test/data03/student.csv")
val rddRowStudent: RDD[Row] = rddStudent.map(line => {
val arr: Array[String] = line.split("\t")
Row(arr(0).toInt, arr(1), arr(2), new Date(format.parse(arr(3)).getTime), arr(4).toInt)
})
val fieldsStudent: Array[StructField] = Array(
StructField("sid", IntegerType, true),
StructField("sname", StringType, true),
StructField("ssex", StringType, true),
StructField("sbirth", DateType, true),
StructField("sclazz", IntegerType, true)
)
val schemaStudent: StructType = StructType(fieldsStudent)
val dfStudent: DataFrame = spark.createDataFrame(rddRowStudent,schemaStudent)
// Register the temp view (student)
dfStudent.createOrReplaceTempView("student")
// 7. Build the course table (read file, row RDD, schema)
val rddCourse: RDD[String] = spark.sparkContext.textFile("s3a://lifecyclebigdata/test/data03/course.csv")
val rddRowCourse: RDD[Row] = rddCourse.map(line => {
val arr: Array[String] = line.split("\t")
Row(arr(0).toInt, arr(1), arr(2))
})
val fieldsCourse: Array[StructField] = Array(
StructField("cid", IntegerType, true),
StructField("cname", StringType, true),
StructField("tid", StringType, true)
)
val schemaCourse: StructType = StructType(fieldsCourse)
val dfCourse: DataFrame = spark.createDataFrame(rddRowCourse,schemaCourse)
// Register the temp view (course)
dfCourse.createOrReplaceTempView("course")
// 8. Build the score table (read file, row RDD, schema)
val rddScore: RDD[String] = spark.sparkContext.textFile("s3a://lifecyclebigdata/test/data03/score.csv")
val rddRowScore: RDD[Row] = rddScore.map(line => {
val arr: Array[String] = line.split("\t")
Row(arr(0).toInt, arr(1).toInt, arr(2).toInt)
})
val fieldsScore: Array[StructField] = Array(
StructField("sid", IntegerType, true),
StructField("cid", IntegerType, true),
StructField("score", IntegerType, true)
)
val schemaScore: StructType = StructType(fieldsScore)
val dfScore: DataFrame = spark.createDataFrame(rddRowScore,schemaScore)
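// Register the temp view (score)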
dfScore.createOrReplaceTempView("score")
// 9. Build the teacher table (read file, row RDD, schema)
val rddTeacher: RDD[String] = spark.sparkContext.textFile("s3a://lifecyclebigdata/test/data03/teacher.csv")
val rddRowTeacher: RDD[Row] = rddTeacher.map(line => {
val arr: Array[String] = line.split("\t")
Row(arr(0).toInt, arr(1), arr(2), new Date(format.parse(arr(3)).getTime), arr(4), arr(5))
})
val fieldsTeacher: Array[StructField] = Array(
StructField("tid", IntegerType, true),
StructField("tname", StringType, true),
StructField("tsex", StringType, true),
StructField("tbirth", DateType, true),
StructField("tlevel", StringType, true),
StructField("tdepart", StringType, true)
)
val schemaTeacher: StructType = StructType(fieldsTeacher)
val dfTeacher: DataFrame = spark.createDataFrame(rddRowTeacher,schemaTeacher)
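// Register the temp view (teacher)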
dfTeacher.createOrReplaceTempView("teacher")
// 10. SQL: select everything from each view
spark.sql("select * from student").show()
spark.sql("select * from course").show()
spark.sql("select * from score").show()
spark.sql("select * from teacher").show()
spark.close()
}
}

- Results: student, course, score, teacher
+---+------+----+----------+------+
|sid| sname|ssex| sbirth|sclazz|
+---+------+----+----------+------+
| 1|孙福龙| m|1989-03-09| 1|
| 2|王婷婷| w|1992-02-24| 1|
| 3|王丹丹| w|1989-05-20| 2|
| 4|孙尚文| m|2020-12-20| 2|
| 5|孙静雅| w|2022-02-20| 1|
| 6|王珞丹| w|1986-07-26| 1|
| 7| 张三| w|1975-01-01| 3|
| 8| 李四| m|1979-06-06| 3|
+---+------+----+----------+------+
+---+--------+----+
|cid| cname| tid|
+---+--------+----+
| 1| J2SE|t003|
| 2|Java Web|t002|
| 3| SSH|t001|
| 4| Oracle|t001|
+---+--------+----+
+---+---+-----+
|sid|cid|score|
+---+---+-----+
| 1| 1| 56|
| 1| 2| 78|
| 1| 3| 67|
| 1| 4| 58|
| 2| 1| 79|
| 2| 2| 81|
| 2| 3| 92|
| 2| 4| 68|
| 3| 1| 91|
| 3| 2| 47|
| 3| 3| 88|
| 3| 4| 56|
| 4| 2| 88|
| 4| 3| 90|
| 4| 4| 93|
| 5| 1| 46|
| 5| 3| 78|
| 5| 4| 53|
| 6| 1| 35|
| 6| 2| 68|
+---+---+-----+
only showing top 20 rows
+---+-----+----+----------+-------+---------+
|tid|tname|tsex| tbirth| tlevel| tdepart|
+---+-----+----+----------+-------+---------+
| 1| 叶平| m|1987-05-20|level01| computer|
| 2| 贺高| m|1985-06-12|level01| computer|
| 3| 杨艳| w|1982-03-16|level02|programer|
| 4| 周磊| w|1975-11-17|level03| find|
+---+-----+----+----------+-------+---------+
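With all four views registered, any Spark SQL statement can run against them. A small sketch (same session, placed before spark.close()) joining student and score to rank students by average score:

spark.sql(
  """
    |select s.sid, s.sname, round(avg(sco.score), 1) as avg_score
    |from student s
    |join score sco on s.sid = sco.sid
    |group by s.sid, s.sname
    |order by avg_score desc
  """.stripMargin).show()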