手动配置 Schema

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame

// Sample data: a list of (name, age) tuples, parallelized into an RDD
val seq: Seq[(String, Int)] = Seq(("Bob", 14), ("Alice", 18))
val rdd: RDD[(String, Int)] = sc.parallelize(seq)

// Explicitly declare the schema: "name" is a string column, "age" an integer column
val schema: StructType = StructType(Array(
  StructField("name", StringType),
  StructField("age", IntegerType)
))

// createDataFrame takes two arguments: an RDD[Row] and a Schema.
// Convert each tuple to a Row via pattern matching before handing it over.
val rowRDD: RDD[Row] = rdd.map { case (name, age) => Row(name, age) }

// Build the DataFrame from the row RDD plus the hand-written schema
val dataFrame: DataFrame = spark.createDataFrame(rowRDD, schema)


dataFrame.show

RDD自动转DataFrame_apache

toDF自动转DataFrame

`spark.implicits._` 包提供了各种隐式转换方法。

import spark.implicits._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

// Sample data: a list of (name, age) tuples, parallelized into an RDD
val seq: Seq[(String, Int)] = Seq(("Bob", 14), ("Alice", 18))
val rdd: RDD[(String, Int)] = sc.parallelize(seq)

// toDF comes from spark.implicits._; the schema (column types) is
// derived automatically from the tuple's element types
val dataFrame: DataFrame = rdd.toDF
// Print the inferred schema, then the contents
dataFrame.printSchema

dataFrame.show

RDD自动转DataFrame_spark_02