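表结构(各文件均以 "::" 分隔字段,以下仅列出代码中用到的字段):
movies.dat:电影ID::电影名::…
ratings.dat:用户ID::电影ID::评分::…
users.dat:用户ID::性别::…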
求被评分次数最多的 10 部电影,并给出评分次数(电影名,评分次数)
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Prints the 10 most frequently rated movies, one per line, as
 * "title<TAB>number of ratings".
 *
 * Both input files are text files whose fields are separated by "::".
 * Only the first two fields of each line are used:
 *   - movies file:  movie id, title
 *   - ratings file: (user id), movie id
 *
 * The file locations default to the original hard-coded paths and can be
 * overridden with args(0) (movies file) and args(1) (ratings file).
 */
object test{
  def main(args: Array[String]): Unit = {
    val moviesPath = args.lift(0).getOrElse("C:\\Users\\吴悠\\Desktop\\movies.dat")
    val ratingsPath = args.lift(1).getOrElse("C:\\Users\\吴悠\\Desktop\\ratings.dat")

    val conf = new SparkConf().setAppName("movie rate").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // (movie id, title). RDD.collect with a partial function is a
      // transformation (filter + map); lines with fewer than two fields,
      // e.g. blank lines, are skipped instead of failing the job.
      val movie = sc.textFile(moviesPath)
        .map(_.split("::"))
        .collect { case Array(movieId, title, _*) => (movieId, title) }

      // (movie id, number of ratings). No sort here: the join below shuffles
      // by key and would discard any ordering established at this point.
      val rate = sc.textFile(ratingsPath)
        .map(_.split("::"))
        .collect { case Array(_, movieId, _*) => (movieId, 1) }
        .reduceByKey(_ + _)

      // Joined shape: (movie id, (number of ratings, title)).
      val result = rate.join(movie)
        .sortBy({ case (_, (count, _)) => count }, ascending = false, numPartitions = 1)
        .take(10)

      result.foreach { case (_, (count, title)) => println(s"$title\t$count") }
    } finally {
      // Release the Spark resources even when the job fails.
      sc.stop()
    }
  }
}
result元组是<电影ID,(评分次数,电影名)>
分别求男性,女性当中评分最高的 10 部电影(性别,电影名,影评分)
package demo
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Prints, first for male ("M") and then for female ("F") users, the 10 movies
 * with the highest average rating, one per line, as
 * "gender<TAB>title<TAB>average rating".
 *
 * All input files are text files whose fields are separated by "::".
 * Fields used:
 *   - movies file:  movie id, title
 *   - ratings file: user id, movie id, rating (parsed as Int)
 *   - users file:   user id, gender
 *
 * The ranking uses the mean rating per movie (sum of ratings / number of
 * ratings) rather than the plain sum, which would rank movies by how often
 * they were rated instead of how well. No minimum number of ratings is
 * required, so a movie with a single top rating can appear in the result.
 *
 * The file locations default to the original hard-coded paths and can be
 * overridden with args(0) (movies), args(1) (ratings) and args(2) (users).
 */
object WordCount{
  def main(args:Array[String]) : Unit ={
    val moviesPath = args.lift(0).getOrElse("C:\\Users\\Administrator\\Desktop\\movies.dat")
    val ratingsPath = args.lift(1).getOrElse("C:\\Users\\Administrator\\Desktop\\ratings.dat")
    val usersPath = args.lift(2).getOrElse("C:\\Users\\Administrator\\Desktop\\users.dat")

    val conf = new SparkConf().setAppName("movie rate").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // RDD.collect with a partial function is a transformation
      // (filter + map); lines with too few fields are skipped.

      // (movie id, title)
      val movie = sc.textFile(moviesPath)
        .map(_.split("::"))
        .collect { case Array(movieId, title, _*) => (movieId, title) }

      // (user id, (movie id, rating))
      val rate = sc.textFile(ratingsPath)
        .map(_.split("::"))
        .collect { case Array(userId, movieId, score, _*) => (userId, (movieId, score.toInt)) }

      // (user id, gender)
      val user = sc.textFile(usersPath)
        .map(_.split("::"))
        .collect { case Array(userId, gender, _*) => (userId, gender) }

      // Top 10 (title, average rating) pairs among users of the given gender.
      def topRated(gender: String): Array[(String, Double)] = {
        val sameGender = user.filter { case (_, g) => g == gender }

        // rate.join(sameGender) has shape (user id, ((movie id, rating), gender)).
        // Carry (sum, count) per movie so the mean can be derived after reducing.
        val averageByMovie = rate.join(sameGender)
          .map { case (_, ((movieId, score), _)) => (movieId, (score, 1)) }
          .reduceByKey { case ((sumA, countA), (sumB, countB)) => (sumA + sumB, countA + countB) }
          .mapValues { case (sum, count) => sum.toDouble / count }

        // movie.join(averageByMovie) has shape (movie id, (title, average)).
        movie.join(averageByMovie)
          .map { case (_, (title, average)) => (title, average) }
          .sortBy(_._2, ascending = false, numPartitions = 1)
          .take(10)
      }

      Seq("M", "F").foreach { gender =>
        topRated(gender).foreach { case (title, average) =>
          println(s"$gender\t$title\t$average")
        }
      }
    } finally {
      // Release the Spark resources even when the job fails.
      sc.stop()
    }
  }
}
rate.join(man)元组是<用户ID,((电影ID,评分),性别)>
movie.join(ManMovieRate)元组是<电影ID,(电影名,评分)>