1. Creating a Dataset

(1) Basic creation

scala> spark.createDataset(1 to 3).show
+-----+
|value|
+-----+
| 1|
| 2|
| 3|
+-----+


scala> spark.createDataset(List(("a",1),("b",2),("c",3))).show
+---+---+
| _1| _2|
+---+---+
| a| 1|
| b| 2|
| c| 3|
+---+---+

scala> spark.createDataset(sc.parallelize(List(("a",1,1),("b",2,2)))).show
+---+---+---+
| _1| _2| _3|
+---+---+---+
| a| 1| 1|
| b| 2| 2|
+---+---+---+

1. createDataset() accepts a Seq, an Array, or an RDD as its argument.
2. The three calls above produce Dataset[Int], Dataset[(String,Int)], and Dataset[(String,Int,Int)] respectively.
3. A Dataset shares many operators with an RDD, such as map and filter (see the sketch after the next example).

scala> spark.createDataset(sc.makeRDD(List((1,2,3,4),(2,3,4,5)))).show
+---+---+---+---+
| _1| _2| _3| _4|
+---+---+---+---+
| 1| 2| 3| 4|
| 2| 3| 4| 5|
+---+---+---+---+
scala> sc.makeRDD(List((1,2,3,4),(2,3,4,5)))
res14: org.apache.spark.rdd.RDD[(Int, Int, Int, Int)] = ParallelCollectionRDD[25] at makeRDD at <console>:25
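
As a quick illustration of point 3 above, a Dataset supports the familiar RDD-style operators directly. A minimal sketch, assuming the same spark-shell session (so spark and spark.implicits._ are already in scope):

val ds = spark.createDataset(1 to 10)      // Dataset[Int]
ds.filter(_ % 2 == 0).map(_ * 10).show()   // RDD-style filter and map; shown as a single "value" column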

(2) Creating a Dataset with a case class

scala> case class Student(name:String,age:Int)
defined class Student

scala> Seq(Student("zhangsan",15),Student("lisi",20))
res18: Seq[Student] = List(Student(zhangsan,15), Student(lisi,20))

scala> Seq(Student("zhangsan",15),Student("lisi",20)).toDS
res19: org.apache.spark.sql.Dataset[Student] = [name: string, age: int]

scala> Seq(Student("zhangsan",15),Student("lisi",20)).toDS.show
+--------+---+
| name|age|
+--------+---+
|zhangsan| 15|
| lisi| 20|
+--------+---+
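
Because the element type is the Student case class, the typed API gives direct access to its fields. A small sketch under the same assumptions (spark-shell, so toDS is available via spark.implicits._):

val students = Seq(Student("zhangsan",15),Student("lisi",20)).toDS
students.filter(_.age >= 18).map(_.name).show()   // typed access to the case-class fields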

Differences between a case class and a regular class:
(1) Instances can be created without new (the generated apply method builds the object).
(2) A case class implements the serialization interface by default.
(3) toString(), equals(), and hashCode() are overridden automatically.
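
A short sketch illustrating these three points, using the Student case class defined above:

val s1 = Student("zhangsan", 15)        // no `new` needed: the generated apply() builds the object
val s2 = Student("zhangsan", 15)
println(s1)                             // Student(zhangsan,15) -- auto-generated toString
println(s1 == s2)                       // true: equals/hashCode compare field values
println(s1.isInstanceOf[Serializable])  // true: case classes are serializable by default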

2. RDD -> DataFrame

Method 1: Infer the schema via reflection

case class Person(name: String, age: Int)
import spark.implicits._
// each line of people.txt is "name, age"; mapping it to the Person case class lets Spark infer the schema by reflection
val people = sc.textFile("file:///opt/datas/people.txt")
  .map(_.split(","))
  .map(p => Person(p(0), p(1).trim.toInt)).toDF()
people.show
people.createOrReplaceTempView("people")
val teenagers = spark.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
teenagers.show()
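
Because Person is a case class, the DataFrame built this way can also be turned back into a strongly typed Dataset with as[Person]. A minimal sketch, reusing the people DataFrame and the spark.implicits._ import from above:

val peopleDS = people.as[Person]                  // typed view over the same data
peopleDS.filter(_.age >= 13).map(_.name).show()   // field access is checked at compile time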

Method 2: Specify the schema programmatically

val people = sc.textFile("file:///opt/datas/people.txt")
val schemaString = "name age"
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType}
// build the schema from the field-name string; every field is declared as a nullable StringType
val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
// turn each text line into a Row that matches the schema
val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
val peopleDataFrame = spark.createDataFrame(rowRDD, schema)
peopleDataFrame.createOrReplaceTempView("people")
val results = spark.sql("SELECT * FROM people")
results.show
// read a JSON file into a DataFrame
val df = spark.read.json("file:///opt/datas/people.json")
// convert the DataFrame back to an RDD[Row]
df.rdd.collect
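
The recovered RDD has element type org.apache.spark.sql.Row; individual fields can be pulled out with getAs. A short sketch against the df read from people.json above:

val names = df.rdd.map(row => row.getAs[String]("name"))   // RDD[String] holding the name column
names.collect.foreach(println)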


// Writing parquet files with Spark SQL
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, StringType, ArrayType, IntegerType}
val schema = StructType(Array(StructField("name", StringType),
  StructField("favorite_color", StringType),
  StructField("favorite_numbers", ArrayType(IntegerType))))
val rdd = sc.parallelize(List(("Alyssa", null, Array(3, 9, 15, 20)), ("Ben", "red", null)))
val rowRDD = rdd.map(p => Row(p._1, p._2, p._3))
val df = spark.createDataFrame(rowRDD, schema)
df.write.parquet("/data/users")   // parquet files are written under this directory

// Reading parquet files with Spark SQL
val df = spark.read.parquet("/data/users")   // the directory written above
df.show
df.printSchema
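
Re-running the write step fails if /data/users already exists; a save mode controls this behaviour. A minimal sketch, applied to the df built from rowRDD in the write example above:

df.write.mode("overwrite").parquet("/data/users")                  // replaces any existing output; other modes: "append", "ignore", "errorifexists" (the default)
df.write.format("parquet").mode("overwrite").save("/data/users")   // equivalent generic form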

3. Hands-on demonstration

(1) Specifying the schema programmatically (Method 2 above)


scala> val people=sc.textFile("file:///opt/datas/people.txt")
people: org.apache.spark.rdd.RDD[String] = file:///opt/datas/people.txt MapPartitionsRDD[36] at textFile at <console>:26

scala> val schemaString = "name age"
schemaString: String = name age

scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row

scala> import org.apache.spark.sql.types.{StructType, StructField, StringType}
import org.apache.spark.sql.types.{StructType, StructField, StringType}

scala> val schema = StructType(schemaString.split(" ").map(fieldName =>StructField(fieldName,StringType, true)))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,StringType,true))

scala> val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[38] at map at <console>:30

scala> val peopleDataFrame = spark.createDataFrame(rowRDD, schema)
peopleDataFrame: org.apache.spark.sql.DataFrame = [name: string, age: string]

scala> peopleDataFrame.createOrReplaceTempView("people")

scala> val results = spark.sql("SELECT name FROM people")
results: org.apache.spark.sql.DataFrame = [name: string]

scala> results.show
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+


scala> val results = spark.sql("SELECT * FROM people")
results: org.apache.spark.sql.DataFrame = [name: string, age: string]

scala> results.show
+-------+---+
| name|age|
+-------+---+
|Michael| 19|
| Andy| 30|
| Justin| 19|
+-------+---+
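
Both columns come back as strings because the schema above declared every field as StringType. If a numeric age is wanted, the column can be cast (or IntegerType can be used in the schema). A brief sketch against the peopleDataFrame built above, assuming the spark-shell session so the $ column syntax is in scope:

import org.apache.spark.sql.types.IntegerType
val typed = peopleDataFrame.withColumn("age", $"age".cast(IntegerType))
typed.printSchema   // age is now integer instead of string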

(2) Inferring the schema via reflection (Method 1 above)

scala> case class Person(name:String,age:Int)
defined class Person

scala> import spark.implicits._
import spark.implicits._

scala> val people=sc.textFile("file:///opt/datas/people.txt")
people: org.apache.spark.rdd.RDD[String] = file:///opt/datas/people.txt MapPartitionsRDD[48] at textFile at <console>:32

(Note: the two .map continuation lines below fail because each line was entered into the shell separately; to run the chained expression, use :paste mode or write it on one line. Since the mapping never ran, people is still an RDD[String], which is why people.show and people.createOrReplaceTempView also fail. The SQL query that follows only succeeds because the "people" temp view registered in part (1) is still present, which is also why age comes back as a string.)

scala> .map(_.split(","))
<console>:1: error: illegal start of definition
.map(_.split(","))
^

scala> .map(p => Person(p(0), p(1).trim.toInt)).toDF()
<console>:1: error: illegal start of definition
.map(p => Person(p(0), p(1).trim.toInt)).toDF()
^

scala> people.show
<console>:34: error: value show is not a member of org.apache.spark.rdd.RDD[String]
people.show
^

scala> people.createOrReplaceTempView("people")
<console>:34: error: value createOrReplaceTempView is not a member of org.apache.spark.rdd.RDD[String]
people.createOrReplaceTempView("people")
^

scala> val teenagers = spark.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
teenagers: org.apache.spark.sql.DataFrame = [name: string, age: string]

scala> teenagers.show()
+-------+---+
| name|age|
+-------+---+
|Michael| 19|
| Justin| 19|
+-------+---+

Converting a DataFrame to an RDD

scala> val df = spark.read.json("file:///opt/datas/people.json")
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]

scala> df.rdd.collect
res17: Array[org.apache.spark.sql.Row] = Array([null,Michael], [30,Andy], [19,Justin])

Writing parquet files with Spark SQL

scala> import org.apache.spark.sql.types.{StructType, StructField, StringType,ArrayType,IntegerType}
import org.apache.spark.sql.types.{StructType, StructField, StringType, ArrayType, IntegerType}

scala> val schema=StructType(Array(StructField("name",StringType),
| StructField("favorite_color",StringType),
| StructField("favorite_numbers",ArrayType(IntegerType))))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(favorite_color,StringType,true), StructField(favorite_numbers,ArrayType(IntegerType,true),true))

scala> val rdd=sc.parallelize(List(("Alyssa",null,Array(3,9,15,20)),("Ben","red",null)))
rdd: org.apache.spark.rdd.RDD[(String, String, Array[Int])] = ParallelCollectionRDD[60] at parallelize at <console>:33

scala> val rowRDD=rdd.map(p=>Row(p._1,p._2,p._3))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[61] at map at <console>:34

scala> val df=spark.createDataFrame(rowRDD,schema)
df: org.apache.spark.sql.DataFrame = [name: string, favorite_color: string ... 1 more field]

scala> df.write.parquet("/data/users")

Reading parquet files with Spark SQL

scala> val df=spark.read.parquet("/data/users")
df: org.apache.spark.sql.DataFrame = [name: string, favorite_color: string ... 1 more field]

scala> df.show
+------+--------------+----------------+
| name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa| null| [3, 9, 15, 20]|
| Ben| red| null|
+------+--------------+----------------+


scala> df.printSchema
root
|-- name: string (nullable = true)
|-- favorite_color: string (nullable = true)
|-- favorite_numbers: array (nullable = true)
| |-- element: integer (containsNull = true)
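
The parquet-backed DataFrame can be queried with SQL just like the earlier examples. A short sketch, reusing the df just read from /data/users:

df.createOrReplaceTempView("users")
spark.sql("SELECT name, favorite_color FROM users WHERE favorite_color IS NOT NULL").show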