Scala
package blog

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * @Author Daniel
 * @Description Three ways to create a DataFrame in Scala
 **/
object CreateDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("CreateDataFrame")
      .getOrCreate()
    import spark.implicits._

    // 1. Create via the toDF method
    val df1 = Seq(
      (1, "Karol", 19),
      (2, "Abby", 20),
      (3, "Zena", 18)
    ).toDF("id", "name", "age")
    df1.show()

    // 2. Create via spark.createDataFrame with an explicit schema
    val schema = StructType(List(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true)
    ))
    val rdd = spark
      .sparkContext
      .parallelize(Seq(
        Row(1, "Karol", 19),
        Row(2, "Abby", 20),
        Row(3, "Zena", 18)
      ))
    val df2 = spark.createDataFrame(rdd, schema)
    df2.show()

    // 3. Create by reading a file
    val df3 = spark
      .read
      .schema(schema)
      .csv("file:///C:/info.txt")
    df3.show()

    spark.stop()
  }
}
Python
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, IntegerType

spark = SparkSession.builder \
    .master("local") \
    .appName("create_DataFrame") \
    .getOrCreate()

# Table schema
schema = StructType([StructField("id", LongType(), True),
                     StructField("name", StringType(), True),
                     StructField("age", IntegerType(), True)])

# 1. Create an RDD via sparkContext.parallelize
def rdd_create():
    rdd = spark.sparkContext.parallelize([
        (1, "Karol", 19),
        (2, "Abby", 20),
        (3, "Zena", 18)])
    # Build the DataFrame from the RDD plus the schema
    df = spark.createDataFrame(rdd, schema)
    df.show()

# 2. Create a DataFrame from a CSV (plain-text) file
def csv_create():
    # Set the separator and the path
    df = spark \
        .read \
        .option("sep", ",") \
        .csv('info.txt', schema=schema)
    df.show()
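# A hedged variant (this function is my own addition, not from the original
# post): instead of passing an explicit schema, Spark's CSV reader can infer
# column types via the inferSchema option; column names are then assigned
# afterwards with DataFrame.toDF. Not called from main below.
def csv_infer_create():
    df = spark \
        .read \
        .option("sep", ",") \
        .option("inferSchema", "true") \
        .csv('info.txt') \
        .toDF('id', 'name', 'age')
    df.show()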
# 3. Create a DataFrame with pandas (pandas offers many constructors; two common ones are shown)
def pandas_create():
    # list of tuples
    res = [(1, 'Karol', 19), (2, 'Abby', 20), (3, 'Zena', 18)]
    # Option 1: build a pandas DataFrame from a dict or a list
    # dict form:
    # df = pd.DataFrame({'id': (1, 2, 3), 'name': ('Karol', 'Abby', 'Zena'), 'age': (19, 20, 18)})
    # list form (note: the second positional argument of pd.DataFrame is the
    # index, so column names must be passed via the columns keyword)
    df = pd.DataFrame(res, columns=['id', 'name', 'age'])
    print(df)
    # Option 2: convert a pandas DataFrame into a Spark DataFrame
    pdf = pd.DataFrame(res)
    df = spark.createDataFrame(pdf, schema)
    df.show()

if __name__ == '__main__':
    rdd_create()
    csv_create()
    pandas_create()
    spark.stop()
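One more option worth noting (a minimal sketch of my own, reusing the spark session and schema defined above): spark.createDataFrame also accepts a plain Python list of tuples or Row objects directly, with no intermediate RDD or pandas DataFrame.

from pyspark.sql import Row

def list_create():
    # createDataFrame takes the list as-is; Spark distributes it internally
    data = [(1, "Karol", 19), (2, "Abby", 20), (3, "Zena", 18)]
    spark.createDataFrame(data, schema).show()
    # Row objects carry their own field names, so no schema is required
    rows = [Row(id=1, name="Karol", age=19), Row(id=2, name="Abby", age=20)]
    spark.createDataFrame(rows).show()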