协同过滤介绍
协同过滤简单来说是利用某兴趣相投、拥有共同经验之群体的喜好来推荐用户感兴趣的信息,个人通过合作的机制给予信息相当程度的回应(如评分)并记录下来以达到过滤的目的进而帮助别人筛选信息,回应不一定局限于特别感兴趣的,特别不感兴趣信息的记录也相当重要。
协同过滤算法是一种较为著名和常用的推荐算法,它基于对用户历史行为数据的挖掘发现用户的喜好偏向,并预测用户可能喜好的产品进行推荐。也就是常见的“猜你喜欢”,和“购买了该商品的人也喜欢”等功能。
协同过滤之Spark实例
官方案例链接
https://spark.apache.org/docs/2.2.2/ml-collaborative-filtering.html
Maven依赖
<properties>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.2</spark.version>
<hadoop.version>2.7.6</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
CollaborativeFiltering.scala
package util
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
/**
* @Author Daniel
* @Description Spark协同过滤算法
* 输入数据为30个用户对100个电影的评分
* 数据格式:
* user::movie::rating::timestamp
* 通过对用户已评分的电影来推荐用户看一些他没有看过的电影
**/
/**
 * One record of the ratings input, whose lines have the form
 * `user::movie::rating::timestamp`.
 *
 * @param userId    id of the user who gave the rating
 * @param movieId   id of the rated movie
 * @param rating    the score the user assigned
 * @param timestamp the timestamp field of the input line
 */
case class Rating(
  userId: Int,
  movieId: Int,
  rating: Float,
  timestamp: Long
)
/**
 * Spark ML collaborative-filtering example.
 *
 * Reads `user::movie::rating::timestamp` lines, trains an ALS model on a random
 * 80% split, prints the RMSE measured on the remaining 20%, and then shows the
 * top-10 movie recommendations per user and the top-10 user recommendations per movie.
 */
object CollaborativeFiltering {

  /** Ratings file read when no path is given on the command line. */
  private val DefaultRatingsPath = "sample_movielens_ratings.txt"

  /** Number of `::`-separated fields expected on every input line. */
  private val FieldCount = 4

  /**
   * Parses one `user::movie::rating::timestamp` line into a [[Rating]].
   *
   * @param str raw input line
   * @return the parsed rating
   * @throws IllegalArgumentException if the line does not have exactly four fields, or
   *                                  (as NumberFormatException) if a field is not numeric
   */
  private def parseRating(str: String): Rating = {
    val fields = str.split("::")
    // `require` is used instead of `assert`: assertions can be elided at compile time,
    // which would let malformed lines through to the index accesses below.
    require(
      fields.length == FieldCount,
      s"Expected $FieldCount '::'-separated fields but found ${fields.length} in line: $str"
    )
    Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
  }

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the path of the ratings file. When absent,
   *             `sample_movielens_ratings.txt` is used.
   */
  def main(args: Array[String]): Unit = {
    val ratingsPath = args.headOption.getOrElse(DefaultRatingsPath)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("CollaborativeFiltering")
      .getOrCreate()

    // Make sure the session is released even when loading, training or evaluation fails.
    try {
      import spark.implicits._

      // Load the raw text lines and turn each one into a Rating row.
      val ratings = spark.read.textFile(ratingsPath)
        .map(parseRating)
        .toDF()

      // 80% of the data trains the model, 20% is held out for testing.
      val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

      // Build the recommendation model on the training set using ALS.
      val als = new ALS()
        // Maximum number of iterations.
        .setMaxIter(5)
        // Regularization parameter. NOTE(review): the ALS source sets the default to 0.1
        // (the 2.2 guide text says 1.0) -- confirm against the Spark version in use.
        .setRegParam(0.01)
        .setUserCol("userId")
        .setItemCol("movieId")
        .setRatingCol("rating")

      // Train the model.
      val model = als.fit(training)

      // Evaluate the model by computing the root-mean-square error on the test data.
      // The "drop" cold-start strategy is set so that the metric does not come out as NaN.
      model.setColdStartStrategy("drop")
      val predictions = model.transform(test)
      val evaluator = new RegressionEvaluator()
        .setMetricName("rmse")
        .setLabelCol("rating")
        .setPredictionCol("prediction")
      val rmse = evaluator.evaluate(predictions)
      println(s"Root-mean-square error(均方根误差) = $rmse")

      // Top 10 movie recommendations for every user.
      val userRecs = model.recommendForAllUsers(10)
      // Top 10 user recommendations for every movie.
      val movieRecs = model.recommendForAllItems(10)

      userRecs.orderBy("userId").show(30, truncate = false)
      movieRecs.orderBy("movieId").show(100, truncate = false)
    } finally {
      spark.stop()
    }
  }
}
部分结果
|userId|recommendations
-------------------------------------------------------
|0 |[[25,4.247095], [92,3.6386964], [2,3.0459275], [58,2.9969811], [32,2.8946023], [81,2.4610684], [26,2.428734], [95,2.3305135], [93,2.2790494], [61,2.2649255]] |
|1 |[[38,2.8552358], [49,2.5752294], [46,2.555421], [77,2.4367332], [51,2.2868218], [4,2.2531743], [28,2.2295506], [30,2.160061], [90,2.091165], [24,2.086698]] |
|2 |[[83,4.980396], [8,4.9696956], [93,4.961116], [39,4.917714], [37,4.855087], [89,3.990784], [34,3.9526663], [19,3.9220076], [40,3.8683553], [16,3.636133]] |
|3 |[[51,5.0957375], [18,4.156398], [76,3.9523013], [80,3.898552], [25,3.3176482], [75,3.1826537], [29,3.1740842], [79,3.0311487], [8,2.9666855], [17,2.9110801]] |
|4 |[[53,5.5128264], [77,4.1807184], [52,4.0699162], [41,4.0324464], [29,3.736351], [70,3.7025547], [62,3.6998646], [38,3.3742704], [87,3.189577], [40,3.0725355]] |
|movieId|recommendations
-------------------------------------------------------
|25 |[[7,4.6209264], [28,4.569703], [0,4.247095], [16,3.7311819], [3,3.3176482], [10,2.8684502], [12,2.7194147], [6,2.5982194], [26,2.3475862], [14,2.321721]] |
|26 |[[16,2.851448], [15,2.7439156], [0,2.428734], [28,2.377817], [25,2.3317814], [11,2.1339734], [12,2.0025218], [29,1.9792455], [17,1.8496197], [7,1.6573409]] |
|27 |[[22,5.577275], [11,5.1906443], [12,4.805975], [23,4.752778], [6,3.6258726], [2,2.875728], [24,2.82525], [25,2.1150773], [9,1.9219421], [8,1.8820488]] |
|28 |[[18,5.0727634], [26,4.240342], [25,3.8337395], [24,3.5129216], [28,3.2137554], [4,2.5407252], [22,2.433772], [20,2.3825085], [11,2.3464804], [15,2.302505]] |
|29 |[[8,5.835305], [14,5.0880346], [21,4.8710938], [16,3.9197617], [7,3.8690348], [4,3.736351], [3,3.1740842], [22,3.111862], [13,2.704935], [19,2.4294906]] |
结果解释
可以看到,在用户推荐表中,id为0的用户的第一条推荐是id为25的电影(预测评分为4.247095);相应地,在电影推荐表中,id为25的电影的推荐用户里也包含id为0的用户,预测评分同样是4.247095。