Link to an earlier article: 如何使用Spark ALS实现协同过滤 (How to implement collaborative filtering with Spark ALS): http://www.aboutyun.com/forum.php?mod=viewthread&tid=13620 (source: about云开发). That post was based on the example in the Spark 0.9 documentation at the time and differs somewhat from this article; the end of this article also contains the revised code and a DataFrame implementation.
/**
* Created by raini on 16-2-26.
* spark-shell --master yarn --jars /home/raini/spark/lib/mysql-connector-java-5.1.38-bin.jar --driver-memory 4g --executor-memory 3g --executor-cores 1
*
* Recommendation-engine concepts in brief:
* Typical setting: 1. a very large catalog of candidate items; 2. choices driven by personal taste.
*
* 1. Content-based filtering: use an item's content or attribute information, together with some similarity measure, to find items similar to it.
* 2. User-based collaborative filtering: use a large body of existing user preferences to estimate how much a user will like items they have not yet interacted with. The core idea is the definition of similarity.
* 3. Both approaches score items from a set of similar users or items, i.e. a nearest-neighbor model.
*
* 1. To predict a given user's rating for an item: take the corresponding row of the user-factor matrix and the corresponding row of the item-factor matrix, and compute their dot product.
* 2. Item-to-item similarity can reuse the similarity measures from nearest-neighbor models; here it reduces to computing the similarity between two item-factor vectors.
*
* How ALS works: it iteratively solves a sequence of least-squares regression problems, alternately updating the user and item factors until the model converges; it is an optimization method for solving matrix factorization.
*/
import org.apache.spark.{SparkConf, SparkContext}

object Recommend {
def main(args: Array[String]) {
if (args.length != 1) {
println("Usage: spark-shell --master yarn ...")
sys.exit(1)
}
// setup environment
val jarFile = System.getenv("SPARK_TEST_JAR")
val sparkHome = "/home/biyuzhe/spark"
val master = "spark://Raini:7077"
val masterHostname = "Raini" //Source.fromFile("/root/spark/masters").mkString.trim
val conf = new SparkConf()
.setMaster("local")
.setSparkHome(System.getenv("SPARK_HOME"))
.setAppName("MovieLensRecommend")
.set("spark.executor.memory", "3g")
.setJars(Seq(jarFile))
val sc = new SparkContext(conf)
/** Extract features (user ID, movie ID, rating, timestamp) */
val rawData = sc.textFile("file:///home/raini/data/ml-100k/u.data")
rawData.first()
// val ratingsList_Tuple = sc.textFile("file:///home/raini/data/ml-10M/ratings.dat").map { lines =>
// val fields = lines.split("::")
// (fields(0).toInt, fields(1).toInt, fields(2).toDouble, fields(3).toLong % 10)
// }
// ratingsList_Tuple.first
//
val rawRatings = rawData.map(_.split("\t").take(3))
rawRatings.first()
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble)}
ratings.first()
/** Train the recommendation model */
val rank = 50 // number of latent factors
val iterations = 10 // number of iterations
val lambda = 0.01 // regularization parameter
val model = ALS.train(ratings, rank, iterations, lambda)
// model.userFeatures
// model.userFeatures.count()
// model.productFeatures.count()
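// Side note (a sketch, not in the original post): for implicit-feedback data
// (clicks, play counts) MLlib offers ALS.trainImplicit, which adds a
// confidence parameter alpha:
// val implicitModel = ALS.trainImplicit(ratings, rank, iterations, lambda, 0.01)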
/** Use the recommendation model */
// per-user recommendation (predicted score of a given user for a given item = dot product)
// org.apache.spark.mllib.recommendation.MatrixFactorizationModel
val predictedRating123 = model.predict(789, 123)
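// Sanity check (a sketch, not in the original post): the same score can be
// reproduced by hand as the dot product of user 789's and item 123's factor vectors.
val userFactor789 = model.userFeatures.lookup(789).head
val itemFactor123 = model.productFeatures.lookup(123).head
val manualScore = userFactor789.zip(itemFactor123).map { case (u, p) => u * p }.sum
// manualScore should match predictedRating123 up to floating-point error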
val userID = 789
val K = 10
val topKRecs = model.recommendProducts(userID,K)
println(topKRecs.mkString("\n"))
/** Inspect the recommended items */
val movies = sc.textFile("file:///home/raini/data/ml-100k/u.item")
val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
titles(123)
// keyBy builds a key-value RDD; lookup returns only the values for the given key
val moviesForUser = ratings.keyBy(_.user).lookup(789)
println(moviesForUser.size) // how many movies this user has rated
// the 10 movies user 789 rated highest
moviesForUser.sortBy(-_.rating).take(10).map(rating => (titles(rating.product),rating.rating)).foreach(println)
// the top 10 movies recommended to user 789
topKRecs.map(rating => (titles(rating.product), rating.rating)).foreach(println)
/** Item recommendations */
import org.jblas.DoubleMatrix
val aMatrix = new DoubleMatrix(Array(1.0,2.0,3.0))
// Cosine similarity: the dot product of two vectors divided by the product of their norms, i.e. an L2-normalized dot product; 1 means identical direction, 0 means no correlation
def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
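// Quick sanity check of the definition (toy vectors, not from the model):
val vA = new DoubleMatrix(Array(1.0, 0.0))
val vB = new DoubleMatrix(Array(0.0, 1.0))
cosineSimilarity(vA, vA) // = 1.0: identical vectors
cosineSimilarity(vA, vB) // = 0.0: orthogonal vectors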
val itemId = 567
val itemFactor = model.productFeatures.lookup(itemId).head
val itemVector = new DoubleMatrix(itemFactor)
// similarity of item 567 with itself (should be 1.0)
cosineSimilarity(itemVector, itemVector)
// cosine similarity of every item to item 567
val sims = model.productFeatures.map{ case (id, factor) =>
val factorVector = new DoubleMatrix(factor)
val sim = cosineSimilarity(factorVector, itemVector)
(id, sim)
}
val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })
println(sortedSims.mkString("\n"))
/*
(567,0.9999999999999998)
(719,0.7483422118140968)
(940,0.7278745617680672)
(109,0.7252762702844262)
(1376,0.7140740036831982)
(201,0.7108942599667418)
(184,0.7071733229661871)
(1007,0.7027410411824709)
(670,0.7001937255541564)
(288,0.6987844388998511)
*/
/* Inspect the recommended similar items */
// println(titles(itemId))
val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })
sortedSims2.slice(1, 11).map{ case (id, sim) => (titles(id), sim) }.mkString("\n")
/*
String =
(Canadian Bacon (1994),0.7483422118140968)
(Airheads (1994),0.7278745617680672)
(Mystery Science Theater 3000: The Movie (1996),0.7252762702844262)
(Meet Wally Sparks (1997),0.7140740036831982)
(Evil Dead II (1987),0.7108942599667418)
(Army of Darkness (1993),0.7071733229661871)
(Waiting for Guffman (1996),0.7027410411824709)
(Body Snatchers (1993),0.7001937255541564)
(Scream (1996),0.6987844388998511)
(Nightmare on Elm Street, A (1984),0.6976928817165885)
*/
/** Evaluate the recommendation model: mean squared error and mean average precision at K
*
* MSE: the sum of the squared errors divided by the total count, i.e. MSE = (1/n) * Σ (actual_i - predicted_i)^2; the squared error is the square of the difference between the predicted and the actual rating. */
// take user 789's first rating: actual rating vs. predicted rating
val actualRating = moviesForUser.take(1)(0)
// actualRating: org.apache.spark.mllib.recommendation.Rating = Rating(789,1012,4.0)
val predictedRating = model.predict(789, actualRating.product)
// predictedRating: Double = 3.984609159015388
// compute the squared error
val squaredError = math.pow(predictedRating - actualRating.rating, 2.0)
// squaredError: Double = 2.368779862136182E-4
/** To compute the Mean Squared Error over the whole dataset, the squared error must be computed for every (user, product, actualRating, predictedRating) tuple */
val usersProducts = ratings.map{ case Rating(user, product, rating) => (user, product)}
val predictions = model.predict(usersProducts).map{
case Rating(user, product, rating) => ((user, product), rating)
}
val ratingsAndPredictions = ratings.map{
case Rating(user, product, rating) => ((user, product), rating)
}.join(predictions) // keyed by (user, item); the values are the actual and predicted ratings
val MSE = ratingsAndPredictions.map{
case ((user, product), (actual, predicted)) => math.pow((actual - predicted), 2)
}.reduce(_ + _) / ratingsAndPredictions.count
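// Equivalent sketch (not in the original post): the same value can be obtained
// with the built-in mean() instead of reduce/count:
// val MSE2 = ratingsAndPredictions.map { case ((u, p), (actual, predicted)) => math.pow(actual - predicted, 2) }.mean()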
println("Mean Squared Error = " + MSE)
// Mean Squared Error = 0.0854164620097481
val RMSE = math.sqrt(MSE)
println("Root Mean Squared Error = " + RMSE)
// Root Mean Squared Error = 0.29226094848567796
/** Compute Mean Average Precision at K
* MAPK is the mean over the whole dataset of the per-user Average Precision at K (APK).
* APK is a common information-retrieval metric that measures the average relevance of the top K documents returned for a query. Since the ranking of the documents matters, it is also well suited to evaluating recommendations on implicit datasets. */
/* APK as an evaluation: each user plays the role of a query, and that user's top-K recommended items play the role of the returned document set. It tries to measure how well the model predicts the items the user is interested in and would actually go on to interact with. */
// Code for this function is based on: https://github.com/benhamner/Metrics
def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
val predK = predicted.take(k)
var score = 0.0
var numHits = 0.0
for ((p, i) <- predK.zipWithIndex) {
if (actual.contains(p)) {
numHits += 1.0
score += numHits / (i.toDouble + 1.0) // hits so far / (rank index + 1)
}
}
if (actual.isEmpty) {
1.0
} else {
score / scala.math.min(actual.size, k).toDouble
}
}
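// A worked example (toy ids, not from the original post): with actual = Seq(1, 2, 3)
// and predicted = Seq(1, 4, 2), the hits land at ranks 1 and 3, so
// APK@3 = (1/1 + 2/3) / min(3, 3) ≈ 0.556
avgPrecisionK(Seq(1, 2, 3), Seq(1, 4, 2), 3) // ≈ 0.5556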
val actualMovies = moviesForUser.map(_.product)
// actualMovies: Seq[Int] = ArrayBuffer(1012, 127, 475, 93, 1161, 286, 293, 9, 50, 294, 181, 1, 1008, 508, 284, 1017, 137, 111, 742, 248, 249, 1007, 591, 150, 276, 151, 129, 100, 741, 288, 762, 628, 124)
val predictedMovies = topKRecs.map(_.product)
// predictedMovies: Array[Int] = Array(156, 192, 482, 346, 1019, 23, 201, 479, 603, 269)
val apk10 = avgPrecisionK(actualMovies, predictedMovies, 10)
// apk10: Double = 0.0 — clearly not a great result
/** Compute the global MAPK: compute each user's APK score, then average them. Compute recommendations for all users */
val itemFactors = model.productFeatures.map { case (id, factor) => factor }.collect()
val itemMatrix = new DoubleMatrix(itemFactors)
println(itemMatrix.rows, itemMatrix.columns)
// (1682,50) - number of movies, factor dimension
val imBroadcast = sc.broadcast(itemMatrix)
// compute every user's recommendations (predicted rating = dot product of the user and movie factors), then sort them by predicted rating
val allRecs = model.userFeatures.map{ case (userId, array) =>
val userVector = new DoubleMatrix(array)
val scores = imBroadcast.value.mmul(userVector)
val sortedWithId = scores.data.zipWithIndex.sortBy(-_._1)
val recommendedIds = sortedWithId.map(_._2 + 1).toSeq
(userId, recommendedIds)
}
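// Side note (an assumption, not from the original post): on newer Spark
// versions (1.4+) the same per-user top-K lists can be produced with the
// built-in model.recommendProductsForUsers(K); the manual broadcast version
// above is kept to show the mechanics.
// val allRecs2 = model.recommendProductsForUsers(K).mapValues(_.map(_.product).toSeq)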
// next get all the movie ids per user, grouped by user id
val userMovies = ratings.map{ case Rating(user, product, rating) => (user, product) }.groupBy(_._1)
// userMovies: org.apache.spark.rdd.RDD[(Int, Iterable[(Int, Int)])] = ShuffledRDD[244] at groupBy at <console>:30
// finally, compute the APK for each user, and average them to find MAPK
// K = 10, as defined above
val MAPK = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
val actual = actualWithIds.map(_._2).toSeq
avgPrecisionK(actual, predicted, K)
}.reduce(_ + _) / allRecs.count
println("Mean Average Precision at K = " + MAPK)
// Mean Average Precision at K = 0.041147174670504494 — scores on recommendation tasks are usually low, especially when the number of items is very large
/** Use MLlib's built-in evaluation functions (MSE, RMSE and MAPK) */
import org.apache.spark.mllib.evaluation.RegressionMetrics
val predictedAndTrue = ratingsAndPredictions.map { case ((user, product), (actual, predicted)) => (actual, predicted) }
val regressionMetrics = new RegressionMetrics(predictedAndTrue)
println("Mean Squared Error = " + regressionMetrics.meanSquaredError)
println("Root Mean Squared Error = " + regressionMetrics.rootMeanSquaredError)
// Mean Squared Error = 0.08541646200974809
// Root Mean Squared Error = 0.2922609484856779 — exactly the same as the results computed earlier
// MAPK
import org.apache.spark.mllib.evaluation.RankingMetrics
val predictedAndTrueForRanking = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
val actual = actualWithIds.map(_._2)
(predicted.toArray, actual.toArray) // (predicted item ids, actual item ids)
}
val rankingMetrics = new RankingMetrics(predictedAndTrueForRanking)
println("Mean Average Precision = " + rankingMetrics.meanAveragePrecision)
// Mean Average Precision = 0.08447647846869293
// Computing MAPK the old way with a very large K matches the result of the built-in metric above, which does not cap the ranking at a threshold K
val MAPK2000 = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
val actual = actualWithIds.map(_._2).toSeq
avgPrecisionK(actual, predicted, 2000)
}.reduce(_ + _) / allRecs.count
println("Mean Average Precision = " + MAPK2000)
// Mean Average Precision = 0.0844764784686929
}
}
------------------------------------------------------------------------------------------------
Divider: the revised code and the DataFrame implementation follow
------------------------------------------------------------------------------------------------
package RecommendModel
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object MoviesRecommend {
def main(args: Array[String]) {
if (args.length < 2) {
System.err.println("Usage : <master> <hdfs dir path>")
System.exit(1)
}
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
// create the entry-point objects
val conf = new SparkConf().setMaster(args(0)).setAppName("MoviesRecommend")
val sc = new SparkContext(conf)
// the full ratings dataset used for training, in tuple format (args(1) + "/ratings.dat")
val ratingsList_Tuple = sc.textFile("file:///home/raini/data/ml-1m/ratings.dat").map { lines =>
val fields = lines.split("::")
(fields(0).toInt, fields(1).toInt, fields(2).toDouble, fields(3).toLong % 10) // the timestamp column is taken modulo 10, leaving a digit 0-9 on every rating (e.g. a timestamp ending in 0 lands in bucket 0); what is that for? See the train/validation/test split below
}
// the full ratings dataset as simulated key-value pairs: the key is a digit 0-9, the value is a Rating
val ratingsTrain_KV = ratingsList_Tuple.map(x =>
(x._4, Rating(x._1, x._2, x._3)))
// print how many ratings we obtained, from how many users and movies, out of ratings.dat
println("get " + ratingsTrain_KV.count()
+ " ratings from " + ratingsTrain_KV.map(_._2.user).distinct().count()
+ " users on " + ratingsTrain_KV.map(_._2.product).distinct().count() + " movies")
// get 1000209 ratings from 6040 users on 3706 movies
// my own ratings, in RDD[Rating] format (args(2))
val myRatedData_Rating = sc.textFile("file:///home/raini/data/ml-100k/u5.test").map { lines =>
val fields = lines.split("\t")
Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
}
// split the full dataset by bucket key: keys < 8 (about 80%) form the training set, keys 6-7 (20%) the validation set, and keys 8-9 (20%) the test set; the earlier timestamp-mod-10 trick exists precisely to carve out these subsets. Note that buckets 6-7 also satisfy the training filter below, so training and validation overlap here; the canonical split uses keys < 6 for training to keep the three sets disjoint
// set the number of partitions
val numPartitions = 3
// ratings whose key is less than 8 become the training data
val trainingData_Rating = ratingsTrain_KV.filter(_._1 < 8)
.values // the original set is in pseudo key-value form, while training only needs RDD[Rating], i.e. the values
.union(myRatedData_Rating) // union my own ratings into the training set so the model is trained against them
.repartition(numPartitions)
.cache()
// same format and meaning as above; validation data does not need my ratings, so no union
val validateData_Rating = ratingsTrain_KV.filter(x => x._1 >= 6 && x._1 < 8)
.values
.repartition(numPartitions)
.cache()
val testData_Rating = ratingsTrain_KV.filter(_._1 >= 8)
.values
.cache()
// print how many records go to training, validation and test
println("training data's num : " + trainingData_Rating.count()
+ "\nvalidate data's num : " + validateData_Rating.count()
+ "\ntest data's num : " + testData_Rating.count())
// training data's num : 821160 validate data's num : 198919 test data's num : 199049
// train the candidate models and pick the best one by its RMSE on the validation set
val ranks = List(8, 22)
val lambdas = List(0.1, 10.0)
val iters = List(5, 7) // choose iteration counts to match your cluster hardware; my machine manages at most 7 iterations before running out of memory
var bestModel: MatrixFactorizationModel = null
var bestValidateRnse = Double.MaxValue
var bestRank = 0
var bestLambda = -1.0
var bestIter = -1
// a three-level nested loop yielding 8 combinations of ranks, lambdas and iters; each combination trains a model, and the one with the smallest validation RMSE is kept as the best model
for (rank <- ranks; lam <- lambdas; iter <- iters) {
val model = ALS.train(trainingData_Rating, rank, iter, lam)
// rnse computes the RMSE; it is defined at the bottom of the file
val validateRnse = rnse(model, validateData_Rating, validateData_Rating.count())
println("validation = " + validateRnse
+ " for the model trained with rank = " + rank
+ " lambda = " + lam
+ " and numIter" + iter)
if (validateRnse < bestValidateRnse) {
bestModel = model
bestValidateRnse = validateRnse
bestRank = rank
bestLambda = lam
bestIter = iter
}
} // validation = 2.1705701973851252 for the model trained with rank = 22 lambda = 10.0 and numIter7
// apply the best model to the test set
val testDataRnse = rnse(bestModel, testData_Rating, testData_Rating.count())
println("the best model was trained with rank = " + bestRank + " and lambda = " + bestLambda
+ " and numIter = " + bestIter + " and Rnse on the test data is " + testDataRnse)
// the best model was trained with rank = 22 and lambda = 0.1 and numIter = 7 and Rnse on the test data is NaN (the NaN was caused by the reduce(_ - _) bug in the original rnse function, fixed below)
// compute how much the model improves on a naive baseline that always predicts the mean rating
val meanRating = trainingData_Rating.union(validateData_Rating).map(_.rating).mean()
val baseLineRnse = math.sqrt(testData_Rating.map(x => (meanRating - x.rating) * (meanRating - x.rating)).mean())
val improvement = (baseLineRnse - testDataRnse) / baseLineRnse * 100
println("the best model improves the baseline by " + "%2.2f".format(improvement) + "%")
// movie data in the format (1,Toy Story (1995),Animation|Children's|Comedy) (args(1) + "/movies.dat")
val movieList_Tuple = sc.textFile("file:///home/raini/data/ml-1m/movies.dat").map { lines =>
val fields = lines.split("::")
(fields(0).toInt, fields(1), fields(2))
}
// a Map keyed by id, with the movie name as the value
val movies_Map = movieList_Tuple.map(x =>
(x._1, x._2)).collect().toMap
// a Map keyed by id, with the movie genre as the value
val moviesType_Map = movieList_Tuple.map(x =>
(x._1, x._3)).collect().toMap
var i = 1
println("movies recommended for you:")
// ids of the movies I have already seen
val myRatedMovieIds = myRatedData_Rating.map(_.product).collect().toSet
// filter those movies out of the full list; the rest are fed to the model to predict the rating I might give each one
val recommendList = sc.parallelize(movies_Map.keys.filter(!myRatedMovieIds.contains(_)).toSeq)
// sort the predictions by rating in descending order and print the top 10
bestModel.predict(recommendList.map((0, _))) // note: user id 0 assumes the personal ratings were recorded under user id 0, as in the original Spark tutorial; u5.test uses real MovieLens user ids, so substitute a real id here if needed
.collect()
.sortBy(-_.rating)
.take(10)
.foreach { r =>
println("%2d".format(i) + "----------> : \nmovie name --> "
+ movies_Map(r.product) + " \nmovie type --> "
+ moviesType_Map(r.product))
i += 1
}
// find people I might be interested in
println("you may be interested in these people : ")
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
// convert the movie, user and rating data into DataFrames for Spark SQL queries
val movies = movieList_Tuple
.map(m => Movies(m._1.toInt, m._2, m._3))
.toDF() // [1,Toy Story (1995),Animation|Children's|Comedy],
val ratings = ratingsList_Tuple
.map(r => Ratings(r._1.toInt, r._2.toInt, r._3.toInt))
.toDF() // [1,1193,5]
val users = sc.textFile("file:///home/raini/data/ml-1m/users.dat").map { lines => //args(1) + "/users.dat"
val fields = lines.split("::")
Users(fields(0).toInt, fields(2).toInt, fields(3).toInt)
}.toDF()
ratings.filter('rating >= 5) // keep only 5-star ratings
.join(movies, ratings("movieId") === movies("id")) // join the movies DataFrame
.filter(movies("mType") === "Drama") // keep 5-star ratings of Drama movies, matching the genre in my own ratings
.join(users, ratings("userId") === users("id")) // join the users DataFrame
.filter(users("age") === 23) // keep users of the same age as me
.filter(users("occupation") === 15) // keep users whose occupation is 15, matching my profile
.select(users("id")) // keep only the user id; the result is the set of users whose profile roughly matches mine and whose movie taste is similar
.take(10)
.foreach(println)
// the same query again, this time for Comedy
ratings.filter('rating >= 5)
.join(movies, ratings("movieId") === movies("id"))
.filter(movies("mType") === "Comedy")
.join(users, ratings("userId") === users("id"))
.filter(users("age") === 23)
.filter(users("occupation") === 15)
.select(users("id"))
.take(10)
.foreach(println)
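// Equivalent SQL sketch (an assumption, not from the original post): the same
// query can also be expressed through registered temp tables:
// movies.registerTempTable("movies"); ratings.registerTempTable("ratings"); users.registerTempTable("users")
// sqlContext.sql("SELECT u.id FROM ratings r " +
//   "JOIN movies m ON r.movieId = m.id " +
//   "JOIN users u ON r.userId = u.id " +
//   "WHERE r.rating >= 5 AND m.mType = 'Comedy' AND u.age = 23 AND u.occupation = 15")
//   .take(10).foreach(println)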
}
// RMSE helper (the original comment called this "variance", but it computes the root-mean-square error)
def rnse(model: MatrixFactorizationModel, predictionData: RDD[Rating], n: Long): Double = {
// use the model to predict ratings for the evaluation data
val prediction = model.predict(predictionData.map(x => (x.user, x.product)))
// join the predictions with the actual ratings, then return sqrt(sum of squared differences / n)
val predictionAndOldRatings = prediction.map(x => ((x.user, x.product), x.rating))
.join(predictionData.map(x => ((x.user, x.product), x.rating))).values
math.sqrt(predictionAndOldRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) // the original used reduce(_ - _), which produces a negative sum and hence NaN under sqrt
}
// case classes
case class Ratings(userId: Int, movieId: Int, rating: Int)
case class Movies(id: Int, name: String, mType: String)
case class Users(id: Int, age: Int, occupation: Int)
}