// Entry point for the job: obtain a SparkSession for the application named "Youtube"
// (getOrCreate hands back an already-active session when there is one).
val spark = SparkSession.builder().appName("Youtube").getOrCreate()
import spark.implicits._
/* Load the source data */
// The source data was downloaded from https://netsg.cs.sfu.ca/youtubedata/
// Load the video data: the text file is read from HDFS as an RDD with one String per line.
val videoRDD = spark.sparkContext.textFile("hdfs:///SparkLearning/youtube_video.txt")
// Schema of one video record. Column order matches the order in which the fields are
// read from each input line; every column is declared nullable.
val videoSchema = new StructType()
  .add("video_id", StringType, nullable = true)
  .add("uploader", StringType, nullable = true)
  .add("age", IntegerType, nullable = true)
  .add("category", ArrayType(StringType), nullable = true)
  .add("length", IntegerType, nullable = true)
  .add("views", IntegerType, nullable = true)
  .add("rate", DoubleType, nullable = true)
  .add("ratings", IntegerType, nullable = true)
  .add("comments", IntegerType, nullable = true)
  .add("related_ids", ArrayType(StringType), nullable = true)
// Turn every tab-separated line of videoRDD into a Row laid out like videoSchema.
//
// Lines with fewer than nine fields cannot populate the mandatory columns (indices 0-8)
// and are skipped; indexing them directly would throw ArrayIndexOutOfBoundsException and
// fail the whole job. The tenth field (related ids) is optional: String.split drops
// trailing empty fields, so a record without related videos yields only nine elements and
// gets an empty array instead.
//
// NOTE(review): the "&" splits assume the category and related-id lists are stored as
// single "&"-joined fields in this file — confirm against the actual input format.
val rowVideoRDD = videoRDD
  .map(_.split("\t"))
  .filter(_.length >= 9)
  .map { attributes =>
    val relatedIds =
      if (attributes.length > 9) attributes(9).split("&") else Array.empty[String]
    Row(
      attributes(0), // video_id
      attributes(1), // uploader
      attributes(2).toInt, // age
      attributes(3).split("&"), // category
      attributes(4).toInt, // length
      attributes(5).toInt, // views
      attributes(6).toDouble, // rate
      attributes(7).toInt, // ratings
      attributes(8).toInt, // comments
      relatedIds // related_ids
    )
  }
// Pair the rows with the explicit schema to obtain the video DataFrame.
val videoDF = spark.createDataFrame(rowVideoRDD, videoSchema)
// Load the user data: the text file is read from HDFS as an RDD with one String per line.
val userRDD = spark.sparkContext.textFile("hdfs:///SparkLearning/youtube_user.txt")
// Schema of one user record: the uploader name followed by two integer counters.
// Every column is declared nullable.
val userSchema = new StructType()
  .add("uploader", StringType, nullable = true)
  .add("videos", IntegerType, nullable = true)
  .add("friends", IntegerType, nullable = true)
// Turn every tab-separated line of userRDD into a Row laid out like userSchema.
//
// Lines with fewer than three fields (for example a blank line) are skipped; indexing
// them directly would throw ArrayIndexOutOfBoundsException and fail the whole job.
val rowUserRDD = userRDD
  .map(_.split("\t"))
  .filter(_.length >= 3)
  .map { attributes =>
    Row(
      attributes(0), // uploader
      attributes(1).toInt, // videos
      attributes(2).toInt // friends
    )
  }
// Pair the rows with the explicit schema to obtain the user DataFrame.
val userDF = spark.createDataFrame(rowUserRDD, userSchema)