Before a news feed can make recommendations, the incoming news has to be preprocessed, and deduplicating the incremental news is one of the key steps. There are of course many deduplication methods; given the practical requirements of our scenario, we chose to deduplicate the text with the SimHash algorithm.
Our streaming job first reads the news from Kafka in one-minute batches, i.e. it processes one batch per minute. The incoming news is in JSON format, so each record has to be parsed into four fields: the news id (docid), title, summary and date. For deduplication, the title and summary are concatenated into one text, on which SimHash is then computed.
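For illustration, an incoming record roughly looks like the hypothetical sample below (the field values are made up), and pulling out the four fields amounts to a small parser along the lines of the funAnalysisJson helper in the code further down; parseNews is just an illustrative name:

// a hypothetical incoming message (sample values are made up):
// {"docid":"news_0001","title":"...","summary":"...","date":"2021-01-01 08:00:00"}
import org.json.JSONObject

def parseNews(record: String): Array[String] = {
  val json = new JSONObject(record)
  Array("docid", "title", "summary", "date").map { field =>
    try json.getString(field).trim catch { case _: Exception => "" }
  }
}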
The SimHash algorithm first extracts TF-IDF features from the text to obtain a bag of words with weight scores. For word segmentation we use the open-source HanLP project; its advantage is that, besides the core dictionary, users can add their own dictionaries. After segmentation, every word of a document is hashed, and the hash is combined with the word's weight bit by bit: if a bit is 1, its contribution is 1 * weight; if it is 0, the contribution is -1 * weight. The per-bit contributions of all keywords are then summed, and every bit whose sum is greater than 0 is set to 1, otherwise to 0. In the end each document gets a 64-bit hash value, its fingerprint.

For deduplicating the incremental content we could simply compute the distance between every pair of documents and decide by a threshold whether they are duplicates. But since we later also need to deduplicate the increment against the existing stock, we rely on Redis and first store every incremental batch of news in the Redis database. To cut down the amount of computation we use the following trick: each fingerprint is split into 4 segments and the documents are grouped by the value of each segment, so documents that are very likely similar end up in the same list; the distance computation is then only done within each list, which saves a great deal of work.

Finally, the deduplicated news is written back to Kafka for the next streaming job, and the data is also persisted to a Hive table.
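Before the full job, here is a minimal sketch of the two core ideas, under the assumption that every word already comes with a 64-bit hash and a TF-IDF weight (fingerprint and bandKeys are illustrative names only; the real job below hashes HanLP tokens with MD5):

// bit-weighted fingerprint: a 1 bit contributes +weight, a 0 bit contributes -weight,
// and each of the 64 result bits is set iff its summed contribution is positive
def fingerprint(weightedHashes: Seq[(BigInt, Double)]): BigInt = {
  val bits = Array.fill(64)(0.0)
  for ((hash, weight) <- weightedHashes; i <- 0 until 64)
    bits(i) += (if (hash.testBit(i)) weight else -weight)
  (0 until 64).foldLeft(BigInt(0))((acc, i) => if (bits(i) > 0) acc | (BigInt(1) << i) else acc)
}

// banding: split the 64-bit fingerprint into 4 segments of 16 bits and build one key per segment
def bandKeys(fp: BigInt): Seq[String] = {
  val mask = BigInt(0xFFFF)
  (0 until 4).map(i => List("hw", ((fp >> (16 * i)) & mask).toString, i.toString).mkString(":"))
}

Grouping by band key is safe for a Hamming-distance threshold of 3: two 64-bit fingerprints that differ in at most 3 bits must agree exactly on at least one of the 4 segments, so comparing only documents that share a band key cannot miss a duplicate.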
package com.simhash
import java.text.SimpleDateFormat
import java.util.{Date, HashMap}
import com.hankcs.hanlp.HanLP.Config
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary
import com.hankcs.hanlp.tokenizer.NLPTokenizer
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.json.JSONObject
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization.write
import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
import scala.collection.{immutable, mutable}
import scala.math.BigInt
object Simhash {
//As Object field, global, static, no need to serialize
implicit val formats = DefaultFormats
/**
* Read the stream data from Kafka,
* deduplicate the content with the SimHash algorithm,
* and send the result back to Kafka while persisting it to HDFS.
* @param args
*/
def main(args: Array[String]): Unit = {
val checkpointPath = args(0)
val vTopics = args(1)
val vBrokers = args(2)
val vDocPartition = args(3).toInt
val vSend2Topic = args(4)
val tableName = args(5)
val conAnlyDataPath = args(6)
val sc = SparkContext.getOrCreate()
//function that creates the StreamingContext
def funCreateStreamingContext(): StreamingContext = {
val newSsc = new StreamingContext(sc, Seconds(60))
println("Creating new StreamingContext ")
newSsc
}
val ssc = StreamingContext.getActiveOrCreate(checkpointPath, funCreateStreamingContext)
//read data from Kafka
val vMessageStreamPer = loadDataAsJsonStrFromKafka(vTopics, vBrokers, ssc, vDocPartition)
//basic processing: parse the raw stream
val vMessageStreamBase = vMessageStreamPer.transform(message => {
val rddsc = message.sparkContext
//parse each JSON record into an array of field values
if (!message.isEmpty()) {
val message1 = message.filter { x => !x.isEmpty }
val stringRDD = message1.map { record => {
//fields to extract from the JSON record, in this fixed order
val fieldArray = Array(("docid", "field"),
("title", "field"),
("summary", "field"),
("date", "field"))
funAnalysisJson(record, fieldArray)
}
}
val messageRdd = stringRDD.filter(x => x.head != null && x.head.nonEmpty)
messageRdd
}
else {
rddsc.emptyRDD[Array[String]]
}
})
vMessageStreamBase.foreachRDD(rdd => {
//drop records with a duplicate docid, keeping the first occurrence
val linesRdd = rdd.map(x => (x.head, x)).reduceByKey((x, y) => x)
//extract text features and compute each article's SimHash fingerprint
val simhashRdd = tfIdf(linesRdd)
//split the fingerprint into 4 segments of 16 bits each and group by segment value,
//so that potentially similar articles land in the same group
val simhashRdd2 = splitHash(simhashRdd)
//collect the ids of the articles identified as duplicates
val removeList = dedup(simhashRdd2)
//filter out the duplicate articles
val incrementRdd = linesRdd.filter(x =>
!removeList.contains(x._1)
)
//send the deduplicated articles to Kafka
send2Kafka(incrementRdd, vBrokers, vSend2Topic)
//persist to HDFS and register the partition in the Hive table
write2Hive(incrementRdd, tableName, conAnlyDataPath)
})
ssc.start() // Start the computation
ssc.awaitTermination()
ssc.stop()
}
/**
* Persist the data to the Hive table partition
* @param rdd
* @param tableName
* @param conAnlyDataPath
* @return
*/
def write2Hive(rdd: RDD[(String, Array[String])], tableName: String, conAnlyDataPath: String): Unit ={
val vdate = getrealTime("yyyyMMdd")
val vhour = getrealTime("HH")
val vminute = getrealTime("mm")
val sqlContext = new HiveContext(rdd.sparkContext)
import sqlContext.implicits._
val hiveOutput = rdd.map { case (docid, doc) =>
(doc(0), doc(1), doc(2), doc(3))
}.toDF("docid", "title", "summary", "date")
val sqlCode = "alter table " + tableName + " add if not exists partition (pt_d='" + vdate + "', pt_h='" + vhour + "', pt_min='" + vminute + "')"
val simAnlyDataPathPartition = conAnlyDataPath + "/pt_d=" + vdate + "/pt_h=" + vhour + "/pt_min=" + vminute
hiveOutput.write.mode("append").format("parquet").save(simAnlyDataPathPartition)
sqlContext.sql("use biads")
sqlContext.sql(sqlCode)
}
/**
* Get the current time formatted with the given pattern
* @param pattern pattern
* @return
*/
def getrealTime(pattern: String): String = {
val timeTag = System.currentTimeMillis()
val changeTime = new Date(timeTag)
val dataFormat = new SimpleDateFormat(pattern)
dataFormat.format(changeTime)
}
/**
* Send the deduplicated articles to Kafka as JSON messages
* @param rdd
* @param brokers
* @param send2Topic
*/
def send2Kafka(rdd: RDD[(String, Array[String])], brokers: String, send2Topic: String)={
val resultOut = rdd.map { case (docid, doc) =>
Map("docid" -> doc(0),
"title" -> doc(1),
"summary" -> doc(2),
"date" -> doc(3))
}
val props = new HashMap[String, Object]()
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
"org.apache.kafka.common.serialization.StringSerializer")
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
"org.apache.kafka.common.serialization.StringSerializer")
//send to Kafka from each partition (one producer per partition)
resultOut.foreachPartition(it =>{
val producer = new KafkaProducer[String, String](props)
it.foreach(x =>{
// serialize the record (a Map of the four fields) to a JSON string with json4s' write()
val str = write(x)
// wrap the JSON string in a Kafka record for the target topic
val message = new ProducerRecord[String, String](send2Topic, null, str)
// send the record to Kafka
producer.send(message)
})
//flush any buffered messages and release the producer before the partition task finishes
producer.close()
})
}
/**
* Turn each text into a SimHash fingerprint via TF-IDF weighted word hashes
* @param rdd
*/
def tfIdf(rdd: RDD[(String, Array[String])]): RDD[(String, String, Array[String])] = {
val rdd1 = rdd.map {
case (docid, line) =>
//concatenate title and summary as the text and segment it with HanLP
val text = line(1) + line(2)
Config.IOAdapter = new HadoopFileIoAdapter()
val tmpWord = NLPTokenizer.segment(text)
CoreStopWordDictionary.apply(tmpWord)
val tf = mutable.Map[String, Double]()
val idfMap = mutable.Map[String, Double]()
//term frequency within this document
tmpWord.foreach(x => {
if (tf.contains(x.word))
tf(x.word) += 1
else
tf += x.word -> 1
})
//IDF-like weight derived from HanLP's corpus frequency: more common words get a lower weight
tmpWord.foreach(x => {
if (!idfMap.contains(x.word))
idfMap += x.word -> 1 / (1 * math.log10(x.getFrequency + 1) + 1)
})
//BM25-style term weighting (k1 = 10, b = 0, i.e. no document-length normalization), keeping the top 200 terms
val tfidf = tf.map {
case (word, tf) =>
val weight = if (idfMap.contains(word)) ((10 + 1) * tf) / (tf + 10 * (1 - 0 + 0 * text.length / 150.0)) * idfMap(word) else 1.0
word -> weight
}.toList.sortWith(_._2 > _._2).take(200)
val tfidfHash = tfidf.map{ case (word, weight) =>
(BigInt(java.security.MessageDigest.getInstance("MD5").
digest(word.getBytes("UTF-8")).map(0xFF & _).
map("%02x".format(_)).foldLeft("")(_ + _), 16), weight)
}
/**
* Compute the SimHash value:
* combine each word hash with its weight bit by bit (a 1 bit contributes +weight,
* a 0 bit contributes -weight), sum the contributions of all keywords per bit,
* and set every bit whose sum is > 0 to 1 in the final 64-bit fingerprint, otherwise to 0.
*/
val BIT_LEN = 64
val tmp = BigInt(1)
val bitList = ListBuffer((0 until BIT_LEN).map(_ => 0.0) : _*)
tfidfHash.foreach{
case (hash, weight) =>
(0 until BIT_LEN).foreach(i =>{
if(((tmp << i) & hash) != 0) bitList(i) += weight else bitList(i) -= weight
})
}
var result = BigInt(0)
val one = BigInt(1)
(0 until BIT_LEN).foreach(i => {
if (bitList(i) > 0) result |= (one << i)
})
(docid, result.toString(), Array(funStringToTimeStamp(line(3), "yyyy-MM-dd HH:mm:ss").toString))
}
rdd1
}
/**
* To speed up the computation, split the 64-bit fingerprint into 4 segments of 16 bits each
* @param rdd
* @return
*/
def splitHash(rdd: RDD[(String, String, Array[String])]): RDD[(String, scala.Iterable[(String, String, Array[String])])]={
val f = 64 //fingerprint length in bits
val k = 3 //Hamming-distance threshold for duplicates (hard-coded as 3 again in dedup())
val bucketNum = 4 //number of segments ("bands") per fingerprint
val offsets: List[Int] = (0 until bucketNum).map(i => f / bucketNum * i).toList
val rdd2 = rdd.flatMap(x =>{
val bandkeys = offsets.zipWithIndex.map{
case (offset, i) =>
val m = if(i == offsets.length - 1) BigInt(2).pow(f - offset) - 1 else BigInt(2).pow(offsets(i + 1) - offsets(i)) - 1
//take the i-th segment of the article's fingerprint
val c = (BigInt(x._2) >> offset) & m
//build the band key from the segment value and its index
List("hw", c.toString(), i.toString).mkString(":")
}
//each fingerprint is split into 4 segments; emit one (bandKey, article) pair per segment
bandkeys.map(bandkey => (bandkey, x))
}).groupByKey()
rdd2
}
/**
* Within each group, compare articles by Hamming distance and return the ids of the duplicates to drop
* @param rdd
*/
def dedup(rdd: RDD[(String, scala.Iterable[(String, String, Array[String])])]): List[String] ={
val rdd2 = rdd.values.filter(_.size > 1).map{
x =>
val midList = x.toList
val removeList = for {
step1 <- 0 until midList.size - 1
step2 <- step1 + 1 until midList.size
sh1 = midList(step1)
sh2 = midList(step2)
//if two fingerprints are within Hamming distance 3, the articles are duplicates:
//keep the one with the later timestamp (ties broken by docid) and mark the other for removal
removeSh = if (distance(sh1._2, sh2._2) <= 3) {
if (sh1._3(0) > sh2._3(0)) sh2._1
else if (sh1._3(0) == sh2._3(0) && sh1._1 > sh2._1) sh2._1
else sh1._1
} else ""
} yield removeSh
removeList.toList
}.filter(_.nonEmpty)
rdd2.flatMap(x => x).collect().toSet.filter(_.nonEmpty).toList
}
/**
* Compute the Hamming distance between two 64-bit fingerprints
* @param sh1
* @param sh2
* @return
*/
def distance(sh1: String, sh2: String): Int = {
var xor = (BigInt(sh1) ^ BigInt(sh2)) & ((BigInt(1) << 64) - 1)
var cnt = 0
while(xor != 0){
cnt += 1
xor &= xor-1
}
cnt
}
/**
* Convert a time string to a timestamp (milliseconds)
* @param time the time string
* @param timeFormatted the time format pattern
* @return
*/
def funStringToTimeStamp(time:String, timeFormatted:String) :Long = {
val fm = new SimpleDateFormat(timeFormatted)
val dt = fm.parse(time)
dt.getTime
}
/**
* Parse a JSON string into the requested fields
* @param record
* @param fieldArray
* @return
*/
def funAnalysisJson(record: String, fieldArray: Array[(String, String)]): Array[String] = {
val ss = new Array[String](fieldArray.length)
try {
val vJsonObject = new JSONObject(record)
for (i <- fieldArray.indices) {
try {
if (fieldArray(i)._2 == "filed")
ss(i) = vJsonObject.getString(fieldArray(i)._1).trim
else {
ss(i) = vJsonObject.getString(fieldArray(i)._1).replace("[", "").
replace("]", "").replace(""""""", "").split(",").mkString(",")
}
}
catch {
case e: Exception => {
ss(i) = ""
}
}
}
ss
}
catch {
case e: Exception => {
ss
}
}
}
/**
* Read data from Kafka as a DStream of JSON strings
* @param topics
* @param brokerList
* @param ssc
* @param numPartitions
* @return
*/
def loadDataAsJsonStrFromKafka(topics: String,
brokerList: String,
ssc: StreamingContext,
numPartitions: Int): DStream[String] = {
//topics to subscribe to
val topicsSet = topics.split(",").toSet
//Kafka connection parameters
val kafkaParams = immutable.Map[String, String]("bootstrap.servers" -> brokerList)
//read the messages as a direct stream
val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, topicsSet).repartition(numPartitions)
println("Kafka stream definition is ok!")
messages.map(_._2)
}
}
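As a quick sanity check of the threshold used in dedup() (articles whose fingerprints are within Hamming distance 3 of each other are treated as duplicates), the distance function can be exercised on its own; the two fingerprints below are made up:

val fp1 = BigInt("1011", 2) // made-up fingerprints, just to exercise distance()
val fp2 = BigInt("1001", 2)
Simhash.distance(fp1.toString, fp2.toString) // == 1, so one of the two articles would be dropped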