Content-Based Movie Recommendation with MapReduce (Part 2)
Because this MapReduce solution is fairly complex, I have split it into three parts, which makes it easier to read and to manage.
Following up on the previous post, Content-Based Movie Recommendation with MapReduce (Part 1), we now use MapReduce to measure how similar two movies are by computing their correlation.
This post focuses on using MapReduce to gather the terms that the correlation computation needs.
Computing the correlation between two movies with MapReduce
- 1. For every pair of movies A and B, find all the users who rated both A and B.
- 2. Use these ratings to build a vector for movie A and a vector for movie B.
- 3. Compute the correlation between the two vectors (a reference formula follows this list).
- 4. Whenever someone watches a movie, we can then recommend the movies most correlated with it.
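To make step 3 concrete, here is a reference sketch of the formula this series appears to be building toward (an assumption on my part; the actual computation is deferred to Part 3). If $x_i$ and $y_i$ are the ratings the $i$-th common rater gave to movies A and B, and $n$ is the number of common raters, the Pearson correlation can be computed from simple sums:

$$
r_{AB} = \frac{n\sum_i x_i y_i - \sum_i x_i \sum_i y_i}{\sqrt{n\sum_i x_i^2 - \left(\sum_i x_i\right)^2}\,\sqrt{n\sum_i y_i^2 - \left(\sum_i y_i\right)^2}}
$$

The per-pair terms emitted by the reducer in this stage ($x_i$, $y_i$, $x_i y_i$, $x_i^2$, $y_i^2$) are exactly the summands this formula needs.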
MapReduce Stage 2
This stage carries out the first two steps above and ends by producing the rating vectors for each pair of movies A and B.
The mapper's task
In the mapper phase, the ratings input needs to be joined with itself so that the vectors can be built later. To enable this self-join, the mapper collects all of the movies rated by each user, as shown below.
Users and movies: sample input

| User | Movie | Rating | Number of raters |
| --- | --- | --- | --- |
| User1 | Movie1 | 1 | 10 |
| User1 | Movie2 | 2 | 20 |
| User1 | Movie3 | 3 | 30 |
| User2 | Movie1 | 1 | 10 |
| … | … | … | … |
Mapper output

| Key | Value |
| --- | --- |
| User1 | `<Movie1,1,10>` |
| User1 | `<Movie2,2,20>` |
| User1 | `<Movie3,3,30>` |
| User2 | `<Movie1,1,10>` |
| … | … |
Mapper code
```java
package com.deng.MovieRecommend;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class VectorMapper extends Mapper<LongWritable, Text, Text, Tuple3> {
    public Tuple3<String, Integer, Integer> t3;

    @Override
    public void map(LongWritable key, Text value, Context context) {
        String line = value.toString();
        String[] inform = line.split("\t");
        // When chaining jobs, custom data types do not carry over from one job
        // to the next; the previous job's output arrives as plain text, so we
        // convert the string back into the custom type here.
        t3 = toTuple3(inform[1]);
        try {
            context.write(new Text(inform[0]), t3);
        } catch (IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }

    // Parses a string of the form "Tuple3[Movie1,1,10]" back into a Tuple3.
    public Tuple3 toTuple3(String s) {
        String[] line = s.split(",");
        String movie = line[0].substring(7);  // strip the leading "Tuple3["
        Integer rating = Integer.parseInt(line[1]);
        Integer numberOfRaters =
                Integer.parseInt(line[2].substring(0, line[2].length() - 1)); // strip the trailing "]"
        return new Tuple3(movie, rating, numberOfRaters);
    }
}
```
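The `substring(7)` call strips the `Tuple3[` prefix that the previous job's `toString()` wrote (the `Tuple7.toString()` shown later uses the same `TupleN[...]` format). As a quick sanity check of the parsing logic, here is a minimal sketch; the sample line is my assumption about what stage 1 emits:

```java
// Hypothetical standalone check; the sample string format is an assumption.
public class ToTuple3Check {
    public static void main(String[] args) {
        VectorMapper mapper = new VectorMapper();
        // "Tuple3[Movie1,1,10]" should parse to movie=Movie1, rating=1, numberOfRaters=10
        Tuple3 t3 = mapper.toTuple3("Tuple3[Movie1,1,10]");
        System.out.println(t3);
    }
}
```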
The reducer's task
The reduce() function receives a user (the key) together with that user's Tuple3(movie, rating, numOfRaters) values, emits every unique combination of movies the user has rated, and computes the terms needed to build the correlation vectors.
The reducer's input is shown in the table below.
Reducer input

| Key | Value |
| --- | --- |
| User1 | `[<Movie1,1,10>, <Movie2,2,20>, <Movie3,3,30>]` |
| User2 | `[<Movie1,1,10>, <Movie2,2,20>, <Movie3,3,30>]` |
| … | … |
Reducer output

| Key | Value |
| --- | --- |
| `<Movie1,Movie2>` | `<1,10,2,20,2,1,4>` |
| `<Movie1,Movie3>` | `<1,10,3,30,3,1,9>` |
| … | … |

Each value holds, in order: ratingA, numRatersA, ratingB, numRatersB, ratingA·ratingB, ratingA², and ratingB².
Reducer code
```java
package com.deng.MovieRecommend;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class VectorReducer extends Reducer<Text, Tuple3, Tuple2, Tuple7> {
    public Tuple2 reduceKey;
    public Tuple7 reduceValue;
    // The correlation function later uses these additional precomputed terms
    public Integer ratingProduct, rating1Squared, rating2Squared,
            m1rating, m2rating, m1NumOfRaters, m2NumOfRaters;

    @Override
    public void reduce(Text key, Iterable<Tuple3> values, Context context) {
        // Build a list of unique pairs such as <Movie1,Movie2>, <Movie1,Movie3>
        List<Tuple2<Tuple3, Tuple3>> list = generateUniqueCombinations(values);
        for (Tuple2<Tuple3, Tuple3> pair : list) {
            Tuple3 m1 = pair.first();
            Tuple3 m2 = pair.second();
            // Build the key
            reduceKey = new Tuple2(m1.first(), m2.first());
            // Compute the per-pair terms
            m1rating = Integer.parseInt(m1.second().toString());
            m1NumOfRaters = Integer.parseInt(m1.third().toString());
            m2rating = Integer.parseInt(m2.second().toString());
            m2NumOfRaters = Integer.parseInt(m2.third().toString());
            ratingProduct = m1rating * m2rating;
            rating1Squared = m1rating * m1rating;
            rating2Squared = m2rating * m2rating;
            // Build the value: (ratingA, numRatersA, ratingB, numRatersB,
            // ratingA*ratingB, ratingA^2, ratingB^2)
            reduceValue = new Tuple7(m1rating, m1NumOfRaters, m2rating, m2NumOfRaters,
                    ratingProduct, rating1Squared, rating2Squared);
            try {
                context.write(reduceKey, reduceValue);
            } catch (IOException | InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    public List<Tuple2<Tuple3, Tuple3>> generateUniqueCombinations(Iterable<Tuple3> values) {
        List<Tuple3> scan = new ArrayList<Tuple3>();
        Map<Tuple2<Tuple3, Tuple3>, Integer> mp = new HashMap<Tuple2<Tuple3, Tuple3>, Integer>();
        Tuple2<Tuple3, Tuple3> ls;
        // Clone each value into a fresh object before storing it (see the note
        // at the end of this post)
        for (Tuple3 value : values) {
            scan.add(new Tuple3(value.first(), value.second(), value.third()));
        }
        List<Tuple2<Tuple3, Tuple3>> list = new ArrayList<Tuple2<Tuple3, Tuple3>>();
        for (Tuple3 item : scan) {
            for (Tuple3 item2 : scan) {
                if (item.equal(item2)) continue;  // skip pairing a movie with itself
                // Order each pair consistently so <A,B> and <B,A> collapse to one key
                if (item.compareTo(item2) == 1) {
                    ls = new Tuple2(item, item2);
                } else {
                    ls = new Tuple2(item2, item);
                }
                if (mp.get(ls) == null) {
                    list.add(ls);
                    mp.put(ls, 1);
                }
            }
        }
        return list;
    }
}
```
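Note that deduplication through the HashMap above only works if Tuple2 overrides equals() and hashCode() consistently, so that two pairs holding equal tuples collide. Tuple2 is not shown in this post; the following is a hypothetical sketch of what those overrides might look like (the same requirement extends to Tuple3, since the map keys here are pairs of Tuple3 objects):

```java
// Hypothetical sketch only: the real Tuple2 in this project is not shown here,
// and it would also need Writable/WritableComparable plumbing like Tuple7 below.
public class Tuple2<T1, T2> {
    private T1 _1;
    private T2 _2;

    public Tuple2(T1 _1, T2 _2) { this._1 = _1; this._2 = _2; }

    public T1 first()  { return _1; }
    public T2 second() { return _2; }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof Tuple2)) return false;
        Tuple2<?, ?> other = (Tuple2<?, ?>) o;
        return _1.equals(other._1) && _2.equals(other._2);
    }

    @Override
    public int hashCode() {
        // Combine both fields so equal pairs land in the same bucket
        return 31 * _1.hashCode() + _2.hashCode();
    }
}
```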
The Tuple7 type used above is defined as follows:
```java
package com.deng.MovieRecommend;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Tuple7<T1,T2,T3,T4,T5,T6,T7>
        implements Writable, WritableComparable<Tuple7<T1,T2,T3,T4,T5,T6,T7>> {
    private T1 _1;
    private T2 _2;
    private T3 _3;
    private T4 _4;
    private T5 _5;
    private T6 _6;
    private T7 _7;

    public Tuple7() {
    }

    public Tuple7(T1 _1, T2 _2, T3 _3, T4 _4, T5 _5, T6 _6, T7 _7) {
        set(_1, _2, _3, _4, _5, _6, _7);
    }

    private void set(T1 _1, T2 _2, T3 _3, T4 _4, T5 _5, T6 _6, T7 _7) {
        this._1 = _1;
        this._2 = _2;
        this._3 = _3;
        this._4 = _4;
        this._5 = _5;
        this._6 = _6;
        this._7 = _7;
    }

    public T1 get_1() { return _1; }
    public void set_1(T1 _1) { this._1 = _1; }
    public T2 get_2() { return _2; }
    public void set_2(T2 _2) { this._2 = _2; }
    public T3 get_3() { return _3; }
    public void set_3(T3 _3) { this._3 = _3; }
    public T4 get_4() { return _4; }
    public void set_4(T4 _4) { this._4 = _4; }
    public T5 get_5() { return _5; }
    public void set_5(T5 _5) { this._5 = _5; }
    public T6 get_6() { return _6; }
    public void set_6(T6 _6) { this._6 = _6; }
    public T7 get_7() { return _7; }
    public void set_7(T7 _7) { this._7 = _7; }

    @Override
    public int compareTo(Tuple7<T1, T2, T3, T4, T5, T6, T7> o) {
        // Placeholder: this stage never needs to sort Tuple7 values
        return 0;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // All fields are serialized as strings
        Text.writeString(dataOutput, String.valueOf(_1));
        Text.writeString(dataOutput, String.valueOf(_2));
        Text.writeString(dataOutput, String.valueOf(_3));
        Text.writeString(dataOutput, String.valueOf(_4));
        Text.writeString(dataOutput, String.valueOf(_5));
        Text.writeString(dataOutput, String.valueOf(_6));
        Text.writeString(dataOutput, String.valueOf(_7));
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Fields come back as strings; downstream code re-parses them as needed
        _1 = (T1) Text.readString(dataInput);
        _2 = (T2) Text.readString(dataInput);
        _3 = (T3) Text.readString(dataInput);
        _4 = (T4) Text.readString(dataInput);
        _5 = (T5) Text.readString(dataInput);
        _6 = (T6) Text.readString(dataInput);
        _7 = (T7) Text.readString(dataInput);
    }

    public String toString() {
        StringBuilder sb = new StringBuilder("Tuple7[");
        sb.append(_1).append(",").append(_2).append(",").append(_3)
          .append(",").append(_4).append(",").append(_5).append(",").append(_6)
          .append(",").append(_7);
        return sb.append("]").toString();
    }
}
```
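For completeness, here is a minimal sketch of how this stage's job might be wired up. The driver class name and the input/output paths are my assumptions, not part of the original post:

```java
package com.deng.MovieRecommend;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver: class name and paths are assumptions
public class VectorDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "movie vector stage");
        job.setJarByClass(VectorDriver.class);
        job.setMapperClass(VectorMapper.class);
        job.setReducerClass(VectorReducer.class);
        // Mapper output types differ from the reducer's, so set both explicitly
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Tuple3.class);
        job.setOutputKeyClass(Tuple2.class);
        job.setOutputValueClass(Tuple7.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // output of stage 1
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```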
One final note: the Iterable<Tuple3> values handed to the reducer can only be traversed safely once, and Hadoop reuses the same object for each element, so the data must first be cloned into new objects and stored in a list before the pairwise computation above can run.
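To illustrate the pitfall, a minimal sketch of code inside a reduce() body:

```java
// Wrong: Hadoop reuses one Tuple3 instance during iteration,
// so every list entry would alias the same object (the last value read).
List<Tuple3> broken = new ArrayList<Tuple3>();
for (Tuple3 value : values) {
    broken.add(value);
}

// Right: clone each value into a fresh object before storing it,
// as generateUniqueCombinations() does above.
List<Tuple3> copied = new ArrayList<Tuple3>();
for (Tuple3 value : values) {
    copied.add(new Tuple3(value.first(), value.second(), value.third()));
}
```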