013-通过trident实现单词计数功能

原创

艾文编程 2023-03-10 21:58:00 博主文章分类：分布式数据计算技术 ©著作权

©著作权归作者所有：来自51CTO博客作者艾文编程的原创作品，请联系作者获取转载授权，否则将追究法律责任

使用 Trident 框架完成 WordCount 单词计数，其中用到了多个 Trident 操作，例如逐条处理（each）、分组（groupBy）和聚合（aggregate）等。

package storm.trident;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import kafka.cluster.Cluster;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.io.FileUtils;

import storm.trident.fluent.GroupedStream;
import storm.trident.operation.BaseAggregator;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.spout.IBatchSpout;
import storm.trident.tuple.TridentTuple;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.task.TopologyContext;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;

/**
* Word count implemented with Storm Trident.
*
* @author shenfl
*
*/
public class TridentWordCount {

     /**
      * Builds the word-count topology and submits it to a {@link LocalCluster}.
      *
      * <p>Pipeline: {@link DataSourceSpout} emits {@code line} tuples,
      * {@link SplitBolt} turns each line into {@code word} tuples, the stream is
      * grouped by {@code word}, {@link WordBatchAggregate} produces a
      * {@code batchMap} of counts, and {@link TotalAggregator} folds those maps
      * into running totals that it prints.
      *
      * <p>The local cluster is never shut down here, so the topology keeps
      * running after this method returns.
      *
      * @param args command-line arguments (unused)
      */
     public static void main(String[] args) {

          TridentTopology tridentTopology = new TridentTopology();
          Stream stream = tridentTopology.newStream("spout_id", new DataSourceSpout());

          // Split every line into words, then group the word tuples by the word itself.
          GroupedStream groupedStream = stream.each(new Fields("line"), new SplitBolt(), new Fields("word"))
                    .groupBy(new Fields("word"));
          // Count the occurrences within each group.
          Stream batchMap = groupedStream.aggregate(new Fields("word"), new WordBatchAggregate(), new Fields("batchMap"));
          // Accumulate the per-batch maps and print the totals. TotalAggregator never
          // emits, so the output field declared here is only a placeholder.
          batchMap.each(new Fields("batchMap"), new TotalAggregator(), new Fields(""));

          LocalCluster localCluster = new LocalCluster();
          // Register the topology under this class's name rather than the name of the
          // framework's TridentTopology class.
          localCluster.submitTopology(TridentWordCount.class.getSimpleName(), new Config(), tridentTopology.build());
     }

     /**
      * Accumulates per-batch word counts into running totals and prints them.
      *
      * <p>Every incoming tuple is expected to carry a {@code batchMap} field
      * holding a {@code Map<String, Integer>} (word to count). Its entries are
      * added to the totals kept in this instance, after which all totals are
      * written to standard output. Nothing is emitted downstream.
      *
      * @author shenfl
      */
     public static class TotalAggregator extends BaseFunction {
          // Running totals held by this instance: word -> occurrences seen so far.
          Map<String, Integer> hashMap = new HashMap<String, Integer>();

          /**
           * Merges the tuple's {@code batchMap} into the running totals, pauses for
           * one second, then prints a separator line followed by every total.
           *
           * @param tuple     tuple providing the {@code batchMap} field
           * @param collector unused; this function emits nothing
           */
          public void execute(TridentTuple tuple, TridentCollector collector) {

               Map<String, Integer> batchCounts = (Map<String, Integer>) tuple.getValueByField("batchMap");

               for (Map.Entry<String, Integer> batchEntry : batchCounts.entrySet()) {
                    String word = batchEntry.getKey();
                    Integer soFar = hashMap.get(word);
                    // A word not seen before starts from zero.
                    int base = (soFar == null) ? 0 : soFar;
                    hashMap.put(word, batchEntry.getValue() + base);
               }

               // Pause before printing so the console output stays readable.
               Utils.sleep(1000);
               System.out.println("===============");
               for (Entry<String, Integer> total : hashMap.entrySet()) {
                    System.out.println(total);
               }
          }
     }

     /**
      * Aggregator that counts how often each word occurs among the tuples it is
      * given. The result is a map whose key is the word and whose value is the
      * number of occurrences.
      *
      * @author shenfl
      */
     public static class WordBatchAggregate extends BaseAggregator<Map<String, Integer>> {

          /**
           * Creates the empty accumulator for a new aggregation.
           *
           * @param batchId   identifier supplied by the framework (unused)
           * @param collector unused
           * @return a new, empty word-to-count map
           */
          public Map<String, Integer> init(Object batchId, TridentCollector collector) {
               Map<String, Integer> counts = new HashMap<String, Integer>();
               return counts;
          }

          /**
           * Increments the count of the word carried by {@code tuple}.
           *
           * @param val       accumulator returned by {@link #init}
           * @param tuple     tuple whose {@code word} field holds a single word
           * @param collector unused
           */
          public void aggregate(Map<String, Integer> val, TridentTuple tuple, TridentCollector collector) {

               String word = tuple.getStringByField("word");

               Integer seen = val.get(word);
               // First occurrence counts as 1; otherwise add one to the stored count.
               int updated = (seen == null) ? 1 : seen + 1;
               val.put(word, updated);
          }

          /**
           * Emits the finished accumulator as a single-value tuple.
           *
           * @param val       the completed word-to-count map
           * @param collector collector the map is emitted to
           */
          public void complete(Map<String, Integer> val, TridentCollector collector) {
               collector.emit(new Values(val));
          }
     }

     /**
      * Splits a tab-separated line of text into words and emits one tuple per
      * word.
      *
      * @author shenfl
      */
     public static class SplitBolt extends BaseFunction {

          /**
           * Reads the line from the first field of {@code tuple}, splits it on tab
           * characters and emits every non-empty token as a one-value tuple.
           *
           * @param tuple     tuple whose first field is the line to split
           * @param collector collector the words are emitted to
           */
          public void execute(TridentTuple tuple, TridentCollector collector) {
               String line = tuple.getString(0);
               if (line == null) {
                    // Nothing to split.
                    return;
               }
               for (String word : line.split("\t")) {
                    // String.split keeps leading and inner empty tokens and returns a
                    // single empty string for a blank line; an empty string is not a
                    // word and must not be counted.
                    if (word.isEmpty()) {
                         continue;
                    }
                    collector.emit(new Values(word));
               }
          }
     }

     /**
      * Batch spout that reads the lines of every {@code .txt} file found
      * (recursively) under an input directory and emits each line as one tuple
      * with the single field {@code line}.
      *
      * <p>A file is renamed with a {@code .done<timestamp>} suffix once it has
      * been read, so it no longer matches the {@code txt} extension filter on
      * later scans. The tuples of each batch are cached by batch id until
      * {@link #ack(long)} is called, so a repeated {@code emitBatch} call for
      * the same id emits the same tuples again.
      *
      * @author shenfl
      */
     public static class DataSourceSpout implements IBatchSpout {
          private static final long serialVersionUID = 1L;

          /** Directory scanned when none is given to the constructor. */
          private static final String DEFAULT_INPUT_DIR = "d:/test";

          /** Directory that is scanned for {@code .txt} files. */
          private final String inputDir;

          private Map conf;
          private TopologyContext context;

          // Tuples already produced for a batch id, kept until that batch is acked.
          HashMap<Long, List<List<Object>>> batches = new HashMap<Long, List<List<Object>>>();

          // Not used anywhere in this class; retained so existing references keep compiling.
          int i = 0;

          /** Creates a spout that scans the default directory {@code d:/test}. */
          public DataSourceSpout() {
               this(DEFAULT_INPUT_DIR);
          }

          /**
           * Creates a spout that scans the given directory.
           *
           * @param inputDir path of the directory to scan for {@code .txt} files
           */
          public DataSourceSpout(String inputDir) {
               this.inputDir = inputDir;
          }

          /**
           * Stores the configuration and context handed in by the framework.
           *
           * @param conf    topology configuration
           * @param context topology context
           */
          public void open(Map conf, TopologyContext context) {
               this.conf = conf;
               this.context = context;
          }

          /**
           * Emits all tuples of the batch {@code batchId}. On the first call for an
           * id the input directory is scanned and the result cached; later calls
           * for the same id emit the cached tuples.
           *
           * @param batchId   identifier of the batch to emit
           * @param collector collector the tuples are emitted to
           */
          public void emitBatch(long batchId, TridentCollector collector) {

               List<List<Object>> batch = this.batches.get(batchId);
               if (batch == null) {
                    batch = readPendingFiles();
                    this.batches.put(batchId, batch);
               }
               // Emit the whole batch, one tuple per line.
               for (List<Object> lineTuple : batch) {
                    collector.emit(lineTuple);
               }
          }

          /**
           * Reads every pending {@code .txt} file below the input directory and
           * returns one tuple per line.
           *
           * <p>Each file is handled on its own: a failure on one file is reported
           * and the remaining files are still processed. The lines of a file are
           * added only after the file has been renamed successfully, so a file
           * that could not be renamed (and will therefore be found again by the
           * next scan) does not contribute its lines twice.
           *
           * @return the tuples read from all successfully processed files
           */
          private List<List<Object>> readPendingFiles() {
               List<List<Object>> tuples = new ArrayList<List<Object>>();
               Collection<File> pending = FileUtils.listFiles(new File(inputDir), new String[] { "txt" }, true);
               for (File file : pending) {
                    try {
                         List<String> lines = FileUtils.readLines(file);
                         // Rename first: only a file that is out of the way may be counted.
                         FileUtils.moveFile(file, new File(file + ".done" + System.currentTimeMillis()));
                         for (String line : lines) {
                              tuples.add(new Values(line));
                         }
                    } catch (IOException e) {
                         System.err.println("Skipping " + file + " for this batch: " + e.getMessage());
                         e.printStackTrace();
                    }
               }
               return tuples;
          }

          /**
           * Discards the cached tuples of an acknowledged batch.
           *
           * @param batchId identifier of the acknowledged batch
           */
          public void ack(long batchId) {
               this.batches.remove(batchId);
          }

          /** Nothing to release. */
          public void close() {
          }

          /**
           * Returns the component configuration, which limits this spout to a
           * maximum task parallelism of 1.
           *
           * @return the component configuration
           */
          public Map getComponentConfiguration() {
               Config componentConf = new Config();
               componentConf.setMaxTaskParallelism(1);
               return componentConf;
          }

          /**
           * Declares the single output field {@code line}; each emitted tuple
           * carries one line of text.
           *
           * @return the output fields of this spout
           */
          public Fields getOutputFields() {
               return new Fields("line");
          }
     }
}