Before a news feed can be used for recommendation, the incoming articles need to be preprocessed, and deduplicating the incremental news is one of the important steps. There are many ways to deduplicate; given the practical requirements of our scenario, we chose to deduplicate the text with the simhash algorithm.

Our streaming job first pulls news from Kafka in one-minute batches, i.e. one processing round per minute. The incoming news is in JSON format, so each message is parsed into four fields: docid, title, summary, and date. For deduplication, the title and summary are concatenated into a single text, which is then fed into simhash.
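
For illustration, here is a minimal sketch of that parsing step, using the same org.json library as the listing below; the message content is invented:

import org.json.JSONObject

// Hypothetical incoming Kafka message; the field values are made up.
val record = """{"docid":"1001","title":"Sample title","summary":"Sample summary","date":"2019-05-01 08:00:00"}"""
val json = new JSONObject(record)
val docid = json.getString("docid")
// title + summary is the text that will be segmented and hashed.
val text = json.getString("title") + json.getString("summary")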

The simhash algorithm first extracts TF-IDF features from the text to obtain the bag of words and their weight scores. For word segmentation we use the open-source HanLP project; its advantage is that, besides the core dictionary, users can add dictionaries of their own. After segmentation, each word of a document is hashed and the hash is combined with the word's weight bit by bit: if a bit is 1 its contribution is 1 * weight, if it is 0 its contribution is -1 * weight. The per-bit contributions of all keywords are then summed, and every bit whose accumulated weight is greater than 0 is set to 1, otherwise to 0. Each document thus ends up with a 64-bit hash value, its fingerprint.

For deduplication within an incremental batch we could simply compute the distance between every pair of documents and use a threshold to decide whether they are duplicates. However, since we will later also need to deduplicate the increment against the existing stock, we rely on Redis as a third-party store and first write each incremental batch of news into the Redis database.

To reduce the amount of computation, we split each document's hash value into 4 parts and group documents by the value of each part. Documents that are very likely to be similar thus land in the same list (by the pigeonhole principle, two 64-bit fingerprints within Hamming distance 3 must agree exactly on at least one of the four 16-bit bands), and the pairwise distance computation is only done inside each list, which saves a great deal of work. Finally, the deduplicated news is written back to Kafka to feed the next streaming job, and is also persisted to HDFS and exposed as a Hive table.
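
Before the full job, here is a small self-contained Scala sketch of the bit-wise aggregation just described, using 8-bit hashes and invented weights so the arithmetic is easy to follow (the real job below uses 64-bit values derived from MD5 hashes):

object SimhashToy extends App {
    // Two hypothetical term hashes with their TF-IDF weights (values invented).
    val weighted = List((BigInt("10110010", 2), 3.0), (BigInt("10100110", 2), 1.5))
    val bitLen = 8
    val bits = Array.fill(bitLen)(0.0)
    // A set bit contributes +weight, an unset bit contributes -weight.
    for ((hash, weight) <- weighted; i <- 0 until bitLen)
        if (((hash >> i) & 1) == 1) bits(i) += weight else bits(i) -= weight
    // Bits with a positive accumulated weight become 1 in the fingerprint.
    val fingerprint = (0 until bitLen).foldLeft(BigInt(0)) { (acc, i) =>
        if (bits(i) > 0) acc.setBit(i) else acc
    }
    println(fingerprint.toString(2))
}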

package com.simhash

import java.text.SimpleDateFormat
import java.util.{Date, HashMap}

import com.hankcs.hanlp.HanLP.Config
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary
import com.hankcs.hanlp.tokenizer.NLPTokenizer
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.json.JSONObject
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization.write

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
import scala.collection.{immutable, mutable}
import scala.math.BigInt

object Simhash {

    //As Object field, global, static, no need to serialize
    implicit val formats = DefaultFormats

    /**
      * Read the stream data from Kafka,
      * deduplicate the content with the simhash algorithm,
      * then send the result on to Kafka and persist it to HDFS
      * @param args
      */
    def main(args: Array[String]): Unit = {
        val checkpointPath = args(0)
        val vTopics = args(1)
        val vBrokers = args(2)
        val vDocPartition = args(3).toInt
        val vSend2Topic = args(4)
        val tableName = args(5)
        val conAnlyDataPath = args(6)

        val sc = SparkContext.getOrCreate()
        //Function that creates a new StreamingContext
        def funCreateStreamingContext(): StreamingContext = {
            val newSsc = new StreamingContext(sc, Seconds(60))
            println("Creating new StreamingContext ")
            newSsc
        }
        val ssc = StreamingContext.getActiveOrCreate(checkpointPath, funCreateStreamingContext)
        //Read data from Kafka
        val vMessageStreamPer = loadDataAsJsonStrFromKafka(vTopics, vBrokers, ssc, vDocPartition)
        //Basic parsing of the stream
        val vMessageStreamBase = vMessageStreamPer.transform(message => {
            val rddsc = message.sparkContext

            //Parse each JSON record into an array of fields
            if (!message.isEmpty()) {
                val nonEmptyMessage = message.filter { x => !x.isEmpty }

                val stringRDD = nonEmptyMessage.map { record =>
                    //Note: if a channel field is included, it must be the last element
                    val fieldArray = Array(("docid", "field"),
                        ("title", "field"),
                        ("summary", "field"),
                        ("date", "field"))
                    funAnalysisJson(record, fieldArray)
                }
                //Keep only records with a non-empty docid
                val messageRdd = stringRDD.filter(x => x.head != null && x.head.nonEmpty)
                messageRdd
            }
            else {
                rddsc.emptyRDD[Array[String]]
            }
        })
        vMessageStreamBase.foreachRDD(rdd => {
            //Drop records with duplicate docids, keeping one per key
            val linesRdd = rdd.map(x => (x.head, x)).reduceByKey((x, y) => x)
            //Text feature extraction: compute each document's simhash fingerprint
            val simhashRdd = tfIdf(linesRdd)
            //Split each hash into 4 bands of 16 bits and group by band value,
            //so that potentially similar articles fall into the same group
            val simhashRdd2 = splitHash(simhashRdd)
            //Collect the docids of the duplicate articles
            val removeList = dedup(simhashRdd2)
            //Filter out the duplicates
            val incrementRdd = linesRdd.filter(x =>
                !removeList.contains(x._1)
            )
            //Send the deduplicated articles to Kafka
            send2Kafka(incrementRdd, vBrokers, vSend2Topic)
            //Persist them to HDFS as a partition of the Hive table
            write2Hive(incrementRdd, tableName, conAnlyDataPath)
        })
        ssc.start() // Start the computation
        ssc.awaitTermination()
        ssc.stop()
    }

    /**
      * Persist the data into the Hive table
      * @param rdd
      * @param tableName
      * @param conAnlyDataPath
      * @return
      */
    def write2Hive(rdd: RDD[(String, Array[String])], tableName: String, conAnlyDataPath: String): Unit ={
        val vdate = getrealTime("yyyyMMdd")
        val vhour = getrealTime("HH")
        val vminute = getrealTime("mm")
        val sqlContext = new HiveContext(rdd.sparkContext)
        import sqlContext.implicits._
        val hiveOutput = rdd.map { case (docid, doc) =>
            (doc(0), doc(1), doc(2), doc(3))
        }.toDF("docid", "title", "summary", "date")
        val sqlCode = "alter table " + tableName + " add if not exists partition (pt_d='" + vdate + "', pt_h='" + vhour + "', pt_min='" + vminute + "')"
        val simAnlyDataPathPartition = conAnlyDataPath + "/pt_d=" + vdate + "/pt_h=" + vhour + "/pt_min=" + vminute
        hiveOutput.write.mode("append").format("parquet").save(simAnlyDataPathPartition)
        sqlContext.sql("use biads")
        sqlContext.sql(sqlCode)
    }

    /**
      * Get the current time as a formatted string
      * @param pattern  date format pattern
      * @return
      */
    def getrealTime(pattern: String): String = {
        val timeTag = System.currentTimeMillis()
        val changeTime = new Date(timeTag)
        val dataFormat = new SimpleDateFormat(pattern)
        dataFormat.format(changeTime)
    }
    /**
      * Send the deduplicated articles to Kafka as JSON messages
      * @param rdd
      * @param brokers
      * @param send2Topic
      */
    def send2Kafka(rdd:  RDD[(String, Array[String])], brokers: String, send2Topic: String)={
        val resultOut = rdd.map { case (docid,  doc) =>
            Map("docid" -> doc(0),
                "title" -> doc(1),
                "summary" -> doc(2),
                "date" -> doc(3))
        }
        val props = new HashMap[String, Object]()
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
            "org.apache.kafka.common.serialization.StringSerializer")
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
            "org.apache.kafka.common.serialization.StringSerializer")
        //Send to Kafka in a distributed way: one producer per partition
        resultOut.foreachPartition(it => {
            val producer = new KafkaProducer[String, String](props)
            it.foreach(x => {
                // Serialize the record Map into a JSON string with json4s write()
                val str = write(x)
                // Wrap it into a Kafka message
                val message = new ProducerRecord[String, String](send2Topic, null, str)
                // Send the message to Kafka
                producer.send(message)
            })
            // Flush any buffered messages and release the producer
            producer.close()
        })
    }
    /**
      * Convert each document's text into a simhash fingerprint
      * @param rdd
      */
    def tfIdf(rdd: RDD[(String, Array[String])]): RDD[(String, String, Array[String])] = {
        val rdd1 = rdd.map {
            case (docid, line) =>
                //Concatenate title and summary as the text to be segmented
                val text = line(1) + line(2)
                //IO adapter so HanLP can load its dictionaries on the cluster (e.g. from HDFS)
                Config.IOAdapter = new HadoopFileIoAdapter()
                //Segment the text and remove stop words
                val tmpWord = NLPTokenizer.segment(text)
                CoreStopWordDictionary.apply(tmpWord)
                //Term frequency of each word within this document
                val tf = mutable.Map[String, Double]()
                //IDF-like weight derived from HanLP's corpus frequency: rarer words score higher
                val idfMap = mutable.Map[String, Double]()
                tmpWord.foreach(x => {
                    if (tf.contains(x.word))
                        tf(x.word) += 1
                    else
                        tf += x.word -> 1
                })
                tmpWord.foreach(x => {
                    if (!idfMap.contains(x.word))
                        idfMap += x.word -> 1 / (1 * math.log10(x.getFrequency + 1) + 1)
                })

                //BM25-style term weighting (k1 = 10, b = 0, so length normalization is disabled),
                //keeping the 200 highest-weighted terms
                val tfidf = tf.map {
                    case (word, freq) =>
                        val weight = if (idfMap.contains(word)) ((10 + 1) * freq) / (freq + 10 * (1 - 0 + 0 * text.length / 150.0)) * idfMap(word) else 1.0
                        word -> weight
                }.toList.sortWith(_._2 > _._2).take(200)
                //Hash each keyword with MD5 (hex digest parsed as a BigInt); only the low 64 bits are used below
                val tfidfHash = tfidf.map { case (word, weight) =>
                    (BigInt(java.security.MessageDigest.getInstance("MD5").
                        digest(word.getBytes("UTF-8")).map(0xFF & _).
                        map("%02x".format(_)).foldLeft("")(_ + _), 16), weight)
                }

                /**
                  * Compute the SimHash value:
                  * multiply each bit by the weight (a 1 bit contributes 1 * weight, a 0 bit contributes -1 * weight),
                  * then sum the contributions of all keywords per bit; every bit whose accumulated
                  * weight is > 0 is set to 1, otherwise to 0
                  */
                val BIT_LEN = 64
                val tmp = BigInt(1)
                val bitList = ListBuffer((0 until BIT_LEN).map(_ => 0.0) : _*)
                tfidfHash.foreach{
                    case (hash, weight) =>
                        (0 until BIT_LEN).foreach(i =>{
                            if(((tmp << i) & hash) != 0) bitList(i) += weight else bitList(i) -= weight
                        })
                }
                var result = BigInt(0)
                val const = BigInt(1)
                (0 until BIT_LEN).foreach(i =>{
                    if(bitList(i) > 0) result |= (const << i)
                })
                (docid, result.toString(), Array(funStringToTimeStamp(line(3), "yyyy-MM-dd HH:mm:ss").toString))
        }
        rdd1
    }

    /**
      * To improve performance, split the 64-bit hash into 4 bands of 16 bits each
      * @param rdd
      * @return
      */
    def splitHash(rdd: RDD[(String, String, Array[String])]): RDD[(String, scala.Iterable[(String, String, Array[String])])]={
        //Fingerprint length in bits
        val f = 64
        //Hamming distance threshold (the value 3 is hard-coded again in dedup; not used here)
        val k = 3
        //Number of bands the fingerprint is split into
        val bucketNum = 4
        val offsets : List[Int] = (0 until bucketNum).map(i => f / bucketNum * i).toList
        val rdd2 = rdd.flatMap(x =>{
            val bandkeys = offsets.zipWithIndex.map {
                case (offset, i) =>
                    //Bit mask covering the width of this band
                    val m = if (i == offsets.length - 1) BigInt(2).pow(f - offset) - 1 else BigInt(2).pow(offsets(i + 1) - offsets(i)) - 1
                    //Extract this band's value from the document's hash
                    val c = (BigInt(x._2) >> offset) & m
                    //Concatenate into a bucket key
                    List("hw", c.toString(), i.toString).mkString(":")
            }
            //Each article's hash yields 4 band keys; pair each key with the article itself
            bandkeys.map(bandkey => (bandkey, x))
        }).groupByKey()
        rdd2
    }
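
    // Example with a hypothetical fingerprint: for the 64-bit value 0x0123456789ABCDEF the four
    // 16-bit bands are 0xCDEF, 0x89AB, 0x4567 and 0x0123, which produce the bucket keys
    // "hw:52719:0", "hw:35243:1", "hw:17767:2" and "hw:291:3".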

    /**
      * Within each group, compare articles by Hamming distance and collect the docids to remove
      * @param rdd
      */
    def dedup(rdd:  RDD[(String, scala.Iterable[(String, String, Array[String])])]): List[String] ={
        val rdd2 = rdd.values.filter(_.size > 1).map{
            x =>
                val midList = x.toList
                //Compare every pair in the group; when two fingerprints are within Hamming distance 3,
                //mark the older article (ties broken by docid) for removal
                val removeList = for {
                    step1 <- 0 until midList.size - 1
                    step2 <- step1 + 1 until midList.size
                    sh1 = midList(step1)
                    sh2 = midList(step2)
                    removeSh = if (distance(sh1._2, sh2._2) <= 3) {
                        if (sh1._3(0) > sh2._3(0)) sh2._1
                        else if (sh1._3(0) == sh2._3(0) && sh1._1 > sh2._1) sh2._1
                        else sh1._1
                    } else ""
                } yield removeSh
                removeList.toList
        }.filter(_.nonEmpty)
        //Collect the docids, dropping the empty placeholders produced by non-duplicate pairs
        rdd2.flatMap(x => x).collect().toSet.filter(_.nonEmpty).toList
    }

    /**
      * Compute the Hamming distance between two fingerprints
      * @param sh1
      * @param sh2
      * @return
      */
    def distance(sh1: String, sh2: String): Int = {
        var xor = (BigInt(sh1) ^ BigInt(sh2)) & ((BigInt(1) << 64) - 1)
        var cnt = 0
        while(xor != 0){
            cnt += 1
            xor &= xor-1
        }
        cnt
    }
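
    // Quick worked example with hypothetical inputs: 12 = 1100 in binary and 9 = 1001 differ in
    // two bit positions, so distance("12", "9") returns 2.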
    /**
      * Convert a time string into a timestamp (milliseconds)
      * @param time the time string
      * @param timeFormatted the time format pattern
      * @return
      */
    def funStringToTimeStamp(time:String, timeFormatted:String) :Long = {
        val fm = new SimpleDateFormat(timeFormatted)
        val dt = fm.parse(time)
        dt.getTime
    }
    /**
      * Parse a JSON string and extract the requested fields
      * @param record
      * @param fieldArray
      * @return
      */
    def funAnalysisJson(record: String, fieldArray: Array[(String, String)]): Array[String] = {

        val ss = new Array[String](fieldArray.length)
        try {
            val vJsonObject = new JSONObject(record)

            for (i <- fieldArray.indices) {
                try {
                    //"field" entries are plain fields (trimmed); anything else is treated as an
                    //array-like field whose brackets and quotes are stripped
                    if (fieldArray(i)._2 == "field")
                        ss(i) = vJsonObject.getString(fieldArray(i)._1).trim
                    else {
                        ss(i) = vJsonObject.getString(fieldArray(i)._1).replace("[", "").
                            replace("]", "").replace("\"", "").split(",").mkString(",")
                    }
                    }
                }
                catch {
                    case e: Exception => {
                        ss(i) = ""
                    }
                }
            }
            ss
        }
        catch {
            case e: Exception => {
                ss
            }
        }
    }

    /**
      * 从kafka读取数据
      * @param topics
      * @param brokerList
      * @param ssc
      * @param numPartitions
      * @return
      */
    def loadDataAsJsonStrFromKafka(topics: String,
                                   brokerList: String,
                                   ssc: StreamingContext,
                                   numPartitions: Int): DStream[String] = {
        //Topic set
        val topicsSet = topics.split(",").toSet
        //Kafka parameters
        val kafkaParams = immutable.Map[String, String]("bootstrap.servers" -> brokerList)
        //Create the direct stream and read the messages
        val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc, kafkaParams, topicsSet).repartition(numPartitions)
        println("Kafka stream definition is ok!")
        messages.map(_._2)
    }
}