1. Creating a Dataset
(1) Basic creation
scala> spark.createDataset(1 to 3).show
+-----+
|value|
+-----+
| 1|
| 2|
| 3|
+-----+
scala> spark.createDataset(List(("a",1),("b",2),("c",3))).show
+---+---+
| _1| _2|
+---+---+
| a| 1|
| b| 2|
| c| 3|
+---+---+
scala> spark.createDataset(sc.parallelize(List(("a",1,1),("b",2,2)))).show
+---+---+---+
| _1| _2| _3|
+---+---+---+
| a| 1| 1|
| b| 2| 2|
+---+---+---+
1. createDataset() accepts a Seq, an Array, or an RDD as its argument.
2. The three calls above produce Dataset[Int], Dataset[(String,Int)], and Dataset[(String,Int,Int)] respectively.
3. A Dataset supports many of the same transformations as an RDD, such as map and filter (see the sketch right after this list).
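A small sketch of point 3 (assuming a spark-shell session, so spark.implicits._ and the needed encoders are already in scope):
val ds = spark.createDataset(1 to 10)            // Dataset[Int]
val evens = ds.filter(_ % 2 == 0).map(_ * 10)    // map/filter work just like on an RDD
evens.show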
scala> spark.createDataset(sc.makeRDD(List((1,2,3,4),(2,3,4,5)))).show
+---+---+---+---+
| _1| _2| _3| _4|
+---+---+---+---+
| 1| 2| 3| 4|
| 2| 3| 4| 5|
+---+---+---+---+
scala> sc.makeRDD(List((1,2,3,4),(2,3,4,5)))
res14: org.apache.spark.rdd.RDD[(Int, Int, Int, Int)] = ParallelCollectionRDD[25] at makeRDD at <console>:25
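The tuple fields show up as columns _1 to _4. If named columns are preferred, the Dataset can be turned into a DataFrame with toDF (the column names below are only examples):
spark.createDataset(sc.makeRDD(List((1,2,3,4),(2,3,4,5)))).toDF("c1","c2","c3","c4").show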
(2) Creating a Dataset from a case class
scala> case class Student(name:String,age:Int)
defined class Student
scala> Seq(Student("zhangsan",15),Student("lisi",20))
res18: Seq[Student] = List(Student(zhangsan,15), Student(lisi,20))
scala> Seq(Student("zhangsan",15),Student("lisi",20)).toDS
res19: org.apache.spark.sql.Dataset[Student] = [name: string, age: int]
scala> Seq(Student("zhangsan",15),Student("lisi",20)).toDS.show
+--------+---+
| name|age|
+--------+---+
|zhangsan| 15|
| lisi| 20|
+--------+---+
Differences between a case class and an ordinary class (checked in the sketch after this list):
(1) Objects can be created without the new keyword.
(2) The Serializable interface is implemented by default.
(3) toString(), equals(), and hashCode() are overridden automatically.
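These points can be verified directly in the shell, reusing the Student case class defined above:
val s1 = Student("zhangsan", 15)   // (1) no new needed: the generated apply() builds the object
val s2 = Student("zhangsan", 15)
println(s1)                        // (3) prints Student(zhangsan,15) via the generated toString
println(s1 == s2)                  // (3) true: the generated equals/hashCode compare by value
s1.isInstanceOf[Serializable]      // (2) true: case classes implement Serializable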
2. RDD -> DataFrame
Method 1: infer the schema via reflection
case class Person(name:String,age:Int)
import spark.implicits._
val people=sc.textFile("file:///opt/datas/people.txt")
.map(_.split(","))
.map(p => Person(p(0), p(1).trim.toInt)).toDF()
people.show
people.createOrReplaceTempView("people")
val teenagers = spark.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
teenagers.show()
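Both methods assume people.txt contains comma-separated name,age records; judging from the query results shown in the walkthrough below, the file looks roughly like this (shown only as an assumption):
Michael, 19
Andy, 30
Justin, 19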
Method 2: specify the schema programmatically
case class Person(name: String, age: Int)   // note: not actually needed for this approach, since the schema is built explicitly below
val people=sc.textFile("file:///opt/datas/people.txt")
val schemaString = "name age"
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType}
val schema = StructType(schemaString.split(" ").map(fieldName =>StructField(fieldName,StringType, true)))
val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
val peopleDataFrame = spark.createDataFrame(rowRDD, schema)
peopleDataFrame.createOrReplaceTempView("people")
val results = spark.sql("SELECT * FROM people")
results.show
// Convert a DataFrame to an RDD
val df = spark.read.json("file:///opt/datas/people.json")
df.rdd.collect
// Write Parquet files with Spark SQL
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType, ArrayType, IntegerType}
val schema=StructType(Array(StructField("name",StringType),
StructField("favorite_color",StringType),
StructField("favorite_numbers",ArrayType(IntegerType))))
val rdd=sc.parallelize(List(("Alyssa",null,Array(3,9,15,20)),("Ben","red",null)))
val rowRDD=rdd.map(p=>Row(p._1,p._2,p._3))
val df=spark.createDataFrame(rowRDD,schema)
df.write.parquet("/data/users")   // Parquet files are written under this directory
// Read Parquet files with Spark SQL
val df=spark.read.parquet("/data/users")   // this directory already contains the Parquet files written above
df.show
df.printSchema
3. Hands-on demonstration
(1) Specifying the schema programmatically
scala> case class Person(name String,age Int)
<console>:1: error: ':' expected but identifier found.
case class Person(name String,age Int)
                       ^
(The definition fails because each parameter type must be preceded by a colon, i.e. name: String, age: Int. A case class is not needed for the programmatic approach anyway, so the session simply continues without it.)
scala> val people=sc.textFile("file:///opt/datas/people.txt")
people: org.apache.spark.rdd.RDD[String] = file:///opt/datas/people.txt MapPartitionsRDD[36] at textFile at <console>:26
scala> val schemaString = "name age"
schemaString: String = name age
scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row
scala> import org.apache.spark.sql.types.{StructType, StructField, StringType}
import org.apache.spark.sql.types.{StructType, StructField, StringType}
scala> val schema = StructType(schemaString.split(" ").map(fieldName =>StructField(fieldName,StringType, true)))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,StringType,true))
scala> val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[38] at map at <console>:30
scala> val peopleDataFrame = spark.createDataFrame(rowRDD, schema)
peopleDataFrame: org.apache.spark.sql.DataFrame = [name: string, age: string]
scala> peopleDataFrame.createOrReplaceTempView("people")
scala> val results = spark.sql("SELECT name FROM people")
results: org.apache.spark.sql.DataFrame = [name: string]
scala> results.show
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
scala> val results = spark.sql("SELECT * FROM people")
results: org.apache.spark.sql.DataFrame = [name: string, age: string]
scala> results.show
+-------+---+
| name|age|
+-------+---+
|Michael| 19|
| Andy| 30|
| Justin| 19|
+-------+---+
(2) Inferring the schema via reflection
scala> case class Person(name:String,age:Int)
defined class Person
scala> import spark.implicits._
import spark.implicits._
scala> val people=sc.textFile("file:///opt/datas/people.txt")
people: org.apache.spark.rdd.RDD[String] = file:///opt/datas/people.txt MapPartitionsRDD[48] at textFile at <console>:32
scala> .map(_.split(","))
<console>:1: error: illegal start of definition
.map(_.split(","))
^
scala> .map(p => Person(p(0), p(1).trim.toInt)).toDF()
<console>:1: error: illegal start of definition
.map(p => Person(p(0), p(1).trim.toInt)).toDF()
^
scala> people.show
<console>:34: error: value show is not a member of org.apache.spark.rdd.RDD[String]
people.show
^
scala> people.createOrReplaceTempView("people")
<console>:34: error: value createOrReplaceTempView is not a member of org.apache.spark.rdd.RDD[String]
people.createOrReplaceTempView("people")
^
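The errors above occur because each .map(...) line was entered into the shell as a separate statement, so the transformation chain was never applied and people stayed an RDD[String]. The whole chain has to be submitted as one statement, for example via :paste (peopleDF is just a name chosen for this sketch; spark.implicits._ is assumed to be imported as above):
val peopleDF = sc.textFile("file:///opt/datas/people.txt")
  .map(_.split(","))
  .map(p => Person(p(0), p(1).trim.toInt))
  .toDF()
peopleDF.show
peopleDF.createOrReplaceTempView("people")
Note that in the recorded transcript the chain never succeeded, so the teenagers query below is actually answered by the "people" temp view registered in (1), which is why age still shows up there as a string rather than an int.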
scala> val teenagers = spark.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
teenagers: org.apache.spark.sql.DataFrame = [name: string, age: string]
scala> teenagers.show()
+-------+---+
| name|age|
+-------+---+
|Michael| 19|
| Justin| 19|
+-------+---+
Converting a DataFrame to an RDD
scala> val df = spark.read.json("file:///opt/datas/people.json")
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
scala> df.rdd.collect
res17: Array[org.apache.spark.sql.Row] = Array([null,Michael], [30,Andy], [19,Justin])
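df.rdd returns an RDD[Row]; individual fields can then be pulled out of each Row, for example (a small sketch using the field names from the JSON file):
df.rdd.map(row => row.getAs[String]("name")).collect
// Array(Michael, Andy, Justin)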
Writing Parquet files with Spark SQL
scala> import org.apache.spark.sql.types.{StructType, StructField, StringType,ArrayType,IntegerType}
import org.apache.spark.sql.types.{StructType, StructField, StringType, ArrayType, IntegerType}
scala> val schema=StructType(Array(StructField("name",StringType),
| StructField("favorite_color",StringType),
| StructField("favorite_numbers",ArrayType(IntegerType))))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(favorite_color,StringType,true), StructField(favorite_numbers,ArrayType(IntegerType,true),true))
scala> val rdd=sc.parallelize(List(("Alyssa",null,Array(3,9,15,20)),("Ben","red",null)))
rdd: org.apache.spark.rdd.RDD[(String, String, Array[Int])] = ParallelCollectionRDD[60] at parallelize at <console>:33
scala> val rowRDD=rdd.map(p=>Row(p._1,p._2,p._3))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[61] at map at <console>:34
scala> val df=spark.createDataFrame(rowRDD,schema)
df: org.apache.spark.sql.DataFrame = [name: string, favorite_color: string ... 1 more field]
scala> df.write.parquet("/data/users")
Reading Parquet files with Spark SQL
scala> val df=spark.read.parquet("/data/users")
df: org.apache.spark.sql.DataFrame = [name: string, favorite_color: string ... 1 more field]
scala> df.show
+------+--------------+----------------+
| name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa| null| [3, 9, 15, 20]|
| Ben| red| null|
+------+--------------+----------------+
scala> df.printSchema
root
|-- name: string (nullable = true)
|-- favorite_color: string (nullable = true)
|-- favorite_numbers: array (nullable = true)
| |-- element: integer (containsNull = true)