SparkSQL Basics in Practice:
1. Creating DataFrames and Datasets:
import org.apache.spark.sql.{DataFrame, DataFrameReader, Dataset, SparkSession}
object MyDataFrame {
case class Person(name: String, age: Long)
def main(args: Array[String]): Unit = {
//1.build a SparkSession, the entry point to SparkSQL
//2.getOrCreate() returns an existing session if one is already running
val spark = SparkSession.builder().master("local").appName("MyDataFrame").getOrCreate()
readFile(spark)
runBasicDataFrameExample(spark)
runDatasetCreationExample(spark)
runInferSchemaExample(spark)
spark.stop()//stop the session once all examples have run
}
private def readFile(spark: SparkSession): Unit = {
//1.get a DataFrameReader
val res0: DataFrameReader = spark.read
//2.the input format is plain text
val res2: DataFrameReader = res0.format("text")
//3.load the file; load() returns a DataFrame, not another reader
val res3: DataFrame = res2.load("F:\\people.txt")
res3.show()
//4.text input always yields a single string column named "value":
/*
+------------------+
| value|
+------------------+
| name,Michael|
| name,Andy,age,30|
|name,Justin,age,19|
+------------------+
*/
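//Going further (a sketch, not part of the original example): text input gives only the
//raw "value" column, so structured fields have to be parsed out manually, e.g. with split():
import org.apache.spark.sql.functions.{col, split}
res3.select(split(col("value"), ",").as("fields")).show()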
}
private def runBasicDataFrameExample(spark: SparkSession): Unit = {
val res0: DataFrameReader = spark.read//get a DataFrameReader
val res1: DataFrame = res0.json("F:\\spark\\src\\main\\resources\\people.json")//read a json file
res1.show()
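/*
expected output, assuming the standard people.json that ships with Spark (the same three
people used throughout this post):
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
*/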
res1.select(col = "name").show
/*
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
*/
res1.select("name","age").show
/*
+-------+----+
| name| age|
+-------+----+
|Michael|null|
| Andy| 30|
| Justin| 19|
+-------+----+
*/
import spark.implicits._
res1.select($"name").show//should use a implicit transformation
/*
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
*/
res1.select($"name",$"age"+1).show//get column(age)+1
/*
If a column (here "hh") appears in at least one row of the JSON file, you can select it
even though it is missing from the other rows: the missing values simply show as null.
Selecting a column that appears in no row at all throws an AnalysisException.
*/
res1.select($"hh").show//assumes this people.json contains a "hh" field in at least one row
res1.select("name","hh").show
res1.select($"age">20).show
res1.createOrReplaceTempView("people")//register a temporary view; the method returns Unit, so there is nothing to assign
val sqlFrame: DataFrame = spark.sql("select * from people")//sql() returns a DataFrame
sqlFrame.show
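//A temporary view is scoped to this session. To share a view across sessions (it lives
//until the application ends), Spark also offers global temporary views; a brief sketch,
//with "people_global" as an illustrative name:
res1.createOrReplaceGlobalTempView("people_global")
spark.sql("select * from global_temp.people_global").show//global views live in the global_temp database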
}
private def runDatasetCreationExample(spark: SparkSession): Unit = {
import spark.implicits._
// Encoders are created for case classes
/*
1.Seq here means scala.collection.Seq
2.in Seq[A], A can be a common type (Int, String, ...) or a case class you defined yourself
*/
val intSeq: Seq[Int] = Seq(1,2,3)//a plain Seq; toDS() turns it into a Dataset[Int]
intSeq.toDS().show()
val stringDS1: Dataset[String] = Seq("LittleLawson","great").toDS()
stringDS1.show()
val caseClassDS: Dataset[Person] = Seq(Person("Andy", 32)).toDS()
caseClassDS.show()
// +----+---+
// |name|age|
// +----+---+
// |Andy| 32|
// +----+---+
// Encoders for most common types are automatically provided by importing spark.implicits._
val primitiveDS = Seq(1, 2, 3).toDS()
primitiveDS.map(_ + 1).collect() // Returns: Array(2, 3, 4)
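// Other typed transformations work the same way; a small illustrative addition,
// not in the original listing:
primitiveDS.filter(_ % 2 == 1).collect() // Returns: Array(1, 3)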
// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name
val path = "F:\\people.json"
val peopleDS = spark.read.json(path).as[Person]
peopleDS.show()
}
private def runInferSchemaExample(spark: SparkSession): Unit = {
// For implicit conversions from RDDs to DataFrames
import spark.implicits._
// Create an RDD of Person objects from a text file, convert it to a DataFrame.
// Note: this expects lines like "Michael, 29", not the key-value style file shown in readFile() above
val peopleDF = spark.sparkContext.textFile("F:\\people.txt")
.map(_.split(",")).map(attributes => Person(attributes(0), attributes(1).trim.toInt)).toDF()
peopleDF.createOrReplaceTempView("people") // Register the DataFrame as a temporary view
// SQL statements can be run by using the sql methods provided by Spark
val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
// The columns of a row in the result can be accessed by field index
teenagersDF.map(teenager => "Name: " + teenager(0)).show()
// +------------+
// | value|
// +------------+
// |Name: Justin|
// +------------+
// or by field name
teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()
// +------------+
// | value|
// +------------+
// |Name: Justin|
// +------------+
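// A third access pattern, taken from the official Spark SQL guide: fetch several columns
// at once as a Map. No encoder is pre-defined for Dataset[Map[K, V]], so one must be
// declared explicitly:
implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
teenagersDF.map(teenager => teenager.getValuesMap[Any](List("name", "age"))).collect()
// Returns: Array(Map("name" -> "Justin", "age" -> 19))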
}
}
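2. Specifying the schema programmatically:
When case classes cannot be defined ahead of time, the schema can also be built by hand with StructType. The sketch below follows the standard Spark SQL guide; it is not part of the original listing, so add the method to MyDataFrame and call it from main to run it:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
import spark.implicits._
//1.create an RDD of raw lines (again expects lines like "Michael, 29")
val peopleRDD = spark.sparkContext.textFile("F:\\people.txt")
//2.the schema is encoded in a plain string
val schemaString = "name age"
//3.generate the schema from the string: one nullable string field per name
val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
//4.convert the records of the RDD to Rows
val rowRDD = peopleRDD.map(_.split(",")).map(attributes => Row(attributes(0), attributes(1).trim))
//5.apply the schema to the RDD of Rows
val peopleDF = spark.createDataFrame(rowRDD, schema)
//6.query it through a temporary view, just like the inferred-schema example
peopleDF.createOrReplaceTempView("people")
spark.sql("SELECT name FROM people").map(attributes => "Name: " + attributes(0)).show()
}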