SparkSQL Basics in Practice:
1. Creating DataFrames and Datasets:
import org.apache.spark.sql.{DataFrame, DataFrameReader, Dataset, SparkSession}
object MyDataFrame {
case class Person(name: String, age: Long)
def main(args: Array[String]): Unit = {
//1.build a SparkSession, the entry point to SparkSQL
//2.getOrCreate() returns an existing session if one is already running
val spark = SparkSession.builder().master("local").appName("MyDataFrame").getOrCreate()
readFile(spark)
runBasicDataFrameExample(spark)
runDatasetCreationExample(spark)
runInferSchemaExample(spark)
spark.stop()//stop the session once all examples have run
}
private def readFile(spark: SparkSession): Unit = {
//1.get a DataFrameReader
val res0: DataFrameReader = spark.read
//2.the input format is plain text
val res2: DataFrameReader = res0.format("text")
//3.load the file; load() returns a DataFrame, not another reader
val res3: DataFrame = res2.load("F:\\people.txt")
res3.show()
//4.text input always yields a single string column named "value":
/*
+------------------+
| value|
+------------------+
| name,Michael|
| name,Andy,age,30|
|name,Justin,age,19|
+------------------+
*/
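//Going further (a sketch, not part of the original example): text input gives only the
//raw "value" column, so structured fields have to be parsed out manually, e.g. with split():
import org.apache.spark.sql.functions.{col, split}
res3.select(split(col("value"), ",").as("fields")).show()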
}
private def runBasicDataFrameExample(spark: SparkSession): Unit = {
val res0: DataFrameReader = spark.read//get a DataFrameReader
val res1: DataFrame = res0.json("F:\\spark\\src\\main\\resources\\people.json")//read a json file
res1.show()
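/*
expected output, assuming the standard people.json that ships with Spark (the same three
people used throughout this post):
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
*/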
res1.select(col = "name").show
/*
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
*/
res1.select("name","age").show
/*
+-------+----+
| name| age|
+-------+----+
|Michael|null|
| Andy| 30|
| Justin| 19|
+-------+----+
*/
import spark.implicits._
res1.select($"name").show//should use a implicit transformation
/*
+-------+
| name|
+-------+
|Michael|
| Andy|
| Justin|
+-------+
*/
res1.select($"name",$"age"+1).show//get column(age)+1
/*
If a column (here "hh") appears in at least one row of the JSON file, you can select it
even though it is missing from the other rows: the missing values simply show as null.
Selecting a column that appears in no row at all throws an AnalysisException.
*/
res1.select($"hh").show//assumes this people.json contains a "hh" field in at least one row
res1.select("name","hh").show
res1.select($"age">20).show
res1.createOrReplaceTempView("people")//register a temporary view; the method returns Unit, so there is nothing to assign
val sqlFrame: DataFrame = spark.sql("select * from people")//sql() returns a DataFrame
sqlFrame.show
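//A temporary view is scoped to this session. To share a view across sessions (it lives
//until the application ends), Spark also offers global temporary views; a brief sketch,
//with "people_global" as an illustrative name:
res1.createOrReplaceGlobalTempView("people_global")
spark.sql("select * from global_temp.people_global").show//global views live in the global_temp database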
}
private def runDatasetCreationExample(spark: SparkSession): Unit = {
import spark.implicits._
// Encoders are created for case classes
/*
1.Seq here means scala.collection.Seq
2.in Seq[A], A can be a common type (Int, String, ...) or a case class you defined yourself
*/
val intSeq: Seq[Int] = Seq(1,2,3)//a plain Seq; toDS() turns it into a Dataset[Int]
intSeq.toDS().show()
val stringDS1: Dataset[String] = Seq("LittleLawson","great").toDS()
stringDS1.show()
val caseClassDS: Dataset[Person] = Seq(Person("Andy", 32)).toDS()
caseClassDS.show()
// +----+---+
// |name|age|
// +----+---+
// |Andy| 32|
// +----+---+
// Encoders for most common types are automatically provided by importing spark.implicits._
val primitiveDS = Seq(1, 2, 3).toDS()
primitiveDS.map(_ + 1).collect() // Returns: Array(2, 3, 4)
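// Other typed transformations work the same way; a small illustrative addition,
// not in the original listing:
primitiveDS.filter(_ % 2 == 1).collect() // Returns: Array(1, 3)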
// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name
val path = "F:\\people.json"
val peopleDS = spark.read.json(path).as[Person]
peopleDS.show()
}
private def runInferSchemaExample(spark: SparkSession): Unit = {
// For implicit conversions from RDDs to DataFrames
import spark.implicits._
// Create an RDD of Person objects from a text file, convert it to a DataFrame.
// Note: this expects lines like "Michael, 29", not the key-value style file shown in readFile() above
val peopleDF = spark.sparkContext.textFile("F:\\people.txt")
.map(_.split(",")).map(attributes => Person(attributes(0), attributes(1).trim.toInt)).toDF()
peopleDF.createOrReplaceTempView("people") // Register the DataFrame as a temporary view
// SQL statements can be run by using the sql methods provided by Spark
val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
// The columns of a row in the result can be accessed by field index
teenagersDF.map(teenager => "Name: " + teenager(0)).show()
// +------------+
// | value|
// +------------+
// |Name: Justin|
// +------------+
// or by field name
teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()
// +------------+
// | value|
// +------------+
// |Name: Justin|
// +------------+
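// A third access pattern, taken from the official Spark SQL guide: fetch several columns
// at once as a Map. No encoder is pre-defined for Dataset[Map[K, V]], so one must be
// declared explicitly:
implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
teenagersDF.map(teenager => teenager.getValuesMap[Any](List("name", "age"))).collect()
// Returns: Array(Map("name" -> "Justin", "age" -> 19))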
}
}
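2. Specifying the schema programmatically:
When case classes cannot be defined ahead of time, the schema can also be built by hand with StructType. The sketch below follows the standard Spark SQL guide; it is not part of the original listing, so add the method to MyDataFrame and call it from main to run it:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
import spark.implicits._
//1.create an RDD of raw lines (again expects lines like "Michael, 29")
val peopleRDD = spark.sparkContext.textFile("F:\\people.txt")
//2.the schema is encoded in a plain string
val schemaString = "name age"
//3.generate the schema from the string: one nullable string field per name
val fields = schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
//4.convert the records of the RDD to Rows
val rowRDD = peopleRDD.map(_.split(",")).map(attributes => Row(attributes(0), attributes(1).trim))
//5.apply the schema to the RDD of Rows
val peopleDF = spark.createDataFrame(rowRDD, schema)
//6.query it through a temporary view, just like the inferred-schema example
peopleDF.createOrReplaceTempView("people")
spark.sql("SELECT name FROM people").map(attributes => "Name: " + attributes(0)).show()
}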