使用 Trident 框架完成 WordCount 单词计数,用到了多个 Trident 操作,比如分组(groupBy)、聚合(aggregate)等。



package storm.trident;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import kafka.cluster.Cluster;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.io.FileUtils;

import storm.trident.fluent.GroupedStream;
import storm.trident.operation.BaseAggregator;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.spout.IBatchSpout;
import storm.trident.tuple.TridentTuple;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.task.TopologyContext;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;

/**
* Trident 实现单词计数
*
* @author shenfl
*
*/
public class TridentWordCount {

/**
 * Wires the word-count topology and submits it to an in-process
 * {@link LocalCluster}.
 *
 * Pipeline: DataSourceSpout ("line") -> SplitBolt ("word") -> group by
 * "word" -> WordBatchAggregate ("batchMap") -> TotalAggregator.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {

    TridentTopology topology = new TridentTopology();

    // Source stream: the spout emits tuples with a single "line" field.
    Stream lines = topology.newStream("spout_id", new DataSourceSpout());

    // Split every line into words, then group the stream by the "word" field.
    Stream words = lines.each(new Fields("line"), new SplitBolt(), new Fields("word"));
    GroupedStream wordsByKey = words.groupBy(new Fields("word"));

    // Aggregate the grouped words into a count map published as "batchMap".
    Stream batchCounts = wordsByKey.aggregate(new Fields("word"), new WordBatchAggregate(),
            new Fields("batchMap"));

    // Fold every batch map into the running totals and print them.
    // NOTE(review): this declares one output field named "" although
    // TotalAggregator never emits - confirm whether an empty Fields() was meant.
    batchCounts.each(new Fields("batchMap"), new TotalAggregator(), new Fields(""));

    LocalCluster cluster = new LocalCluster();
    // NOTE(review): the topology is named after TridentTopology rather than
    // this class - confirm that is intended.
    String topologyName = TridentTopology.class.getSimpleName();
    cluster.submitTopology(topologyName, new Config(), topology.build());
}

/**
 * Function that merges each incoming "batchMap" (word -> count) into a
 * running total held by this instance and prints the totals to standard
 * output. It never emits tuples.
 *
 * @author shenfl
 */
public static class TotalAggregator extends BaseFunction {
    // Running totals accumulated over every batch map seen by this instance.
    Map<String, Integer> hashMap = new HashMap<String, Integer>();

    /**
     * Adds the counts carried in the tuple's "batchMap" field to the running
     * totals, pauses for one second, then prints a separator line followed
     * by every accumulated entry.
     *
     * @param tuple     tuple whose "batchMap" field holds a word -> count map
     * @param collector collector supplied by Trident; not used here
     */
    public void execute(TridentTuple tuple, TridentCollector collector) {

        Map<String, Integer> batchCounts = (Map<String, Integer>) tuple.getValueByField("batchMap");

        for (Map.Entry<String, Integer> batchEntry : batchCounts.entrySet()) {
            String key = batchEntry.getKey();
            // A word not seen before starts from zero.
            Integer previous = hashMap.get(key);
            int base = (previous == null) ? 0 : previous;
            hashMap.put(key, batchEntry.getValue() + base);
        }

        // Slow the output down so the console stays readable.
        Utils.sleep(1000);
        System.out.println("===============");
        for (Entry<String, Integer> total : hashMap.entrySet()) {
            System.out.println(total);
        }
    }
}

/**
 * Aggregator that counts word occurrences into a map (key = word,
 * value = number of occurrences) and emits that map as a single value when
 * the aggregation completes.
 *
 * @author shenfl
 */
public static class WordBatchAggregate extends BaseAggregator<Map<String, Integer>> {

    /**
     * Creates the empty count map used as the aggregation state.
     *
     * @param batchId   identifier passed in by Trident; not used here
     * @param collector collector supplied by Trident; not used here
     * @return a new, empty word -> count map
     */
    public Map<String, Integer> init(Object batchId, TridentCollector collector) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        return counts;
    }

    /**
     * Increments the count of the word carried by the tuple.
     *
     * @param val       the count map being built
     * @param tuple     tuple whose "word" field holds one word
     * @param collector collector supplied by Trident; not used here
     */
    public void aggregate(Map<String, Integer> val, TridentTuple tuple, TridentCollector collector) {

        String key = tuple.getStringByField("word");

        // First occurrence counts as 1; otherwise bump the stored count.
        Integer seen = val.get(key);
        val.put(key, (seen == null) ? 1 : seen + 1);
    }

    /**
     * Emits the finished count map as a single-value tuple.
     *
     * @param val       the completed count map
     * @param collector collector that receives the map
     */
    public void complete(Map<String, Integer> val, TridentCollector collector) {
        Values result = new Values(val);
        collector.emit(result);
    }
}

/**
 * Function that splits a line of text on tab characters and emits every
 * resulting token as its own single-value tuple.
 *
 * @author shenfl
 */
public static class SplitBolt extends BaseFunction {

    /**
     * Reads the first field of the tuple as a string, splits it on "\t" and
     * emits one tuple per token.
     *
     * @param tuple     tuple whose first field is the line to split
     * @param collector collector that receives one tuple per token
     */
    public void execute(TridentTuple tuple, TridentCollector collector) {
        String[] tokens = tuple.getString(0).split("\t");
        for (int idx = 0; idx < tokens.length; idx++) {
            collector.emit(new Values(tokens[idx]));
        }
    }
}

/**
 * Batch spout that reads the lines of every "txt" file found (recursively)
 * under d:/test and emits each line as a one-field tuple ("line").
 *
 * The tuples built for a batch id are cached in {@link #batches} until that
 * id is acknowledged, so a repeated {@code emitBatch} call with the same id
 * re-emits the same tuples instead of reading the directory again.
 *
 * A file that has been read is renamed with a ".done" + timestamp suffix so
 * it no longer matches the "txt" extension filter on later scans.
 *
 * @author shenfl
 */
public static class DataSourceSpout implements IBatchSpout {
    // Stored by open(); not read anywhere else in this class.
    private Map conf;
    private TopologyContext context;

    private static final long serialVersionUID = 1L;

    // Tuples already built per batch id, kept until ack(batchId).
    HashMap<Long, List<List<Object>>> batches = new HashMap<Long, List<List<Object>>>();

    /**
     * Remembers the configuration and context handed over by the framework.
     *
     * @param conf    topology configuration
     * @param context topology context
     */
    public void open(Map conf, TopologyContext context) {
        this.conf = conf;
        this.context = context;
    }

    int i = 0;

    /**
     * Emits all tuples belonging to the given batch id. On the first call
     * for an id the batch is built from the input files and cached; later
     * calls with the same id re-emit the cached tuples.
     *
     * @param batchId   identifier of the batch to emit
     * @param collector collector that receives one tuple per line
     */
    public void emitBatch(long batchId, TridentCollector collector) {

        List<List<Object>> batch = this.batches.get(batchId);
        if (batch == null) {
            batch = loadBatch();
            this.batches.put(batchId, batch);
        }
        // Emit the whole batch, one tuple per line.
        for (List<Object> list : batch) {
            collector.emit(list);
        }
    }

    /**
     * Builds a new batch from every pending "txt" file under d:/test.
     *
     * Each file is handled independently: a failure on one file is reported
     * and the remaining files are still processed. A file's lines are added
     * to the batch only after the file has been renamed successfully;
     * otherwise the file would stay in place, be read again on the next
     * scan and its words would be counted twice.
     *
     * @return the tuples (one per line) of all files consumed by this scan
     */
    private List<List<Object>> loadBatch() {
        List<List<Object>> batch = new ArrayList<List<Object>>();
        // NOTE(review): the input directory is hard-coded - confirm it exists
        // on the machine running the topology.
        Collection<File> listFiles = FileUtils.listFiles(new File("d:/test"), new String[] { "txt" }, true);
        for (File file : listFiles) {
            try {
                // NOTE(review): readLines(File) does not name a charset here;
                // presumably the platform default is used - confirm.
                List<String> lines = FileUtils.readLines(file);
                // Mark the file as consumed before publishing its lines.
                FileUtils.moveFile(file, new File(file + ".done" + System.currentTimeMillis()));
                for (String line : lines) {
                    batch.add(new Values(line));
                }
            } catch (IOException e) {
                // Leave the file untouched so a later scan can retry it.
                System.err.println("Skipping " + file + " for this batch: " + e);
                e.printStackTrace();
            }
        }
        return batch;
    }

    /**
     * Drops the cached tuples of an acknowledged batch.
     *
     * @param batchId identifier of the acknowledged batch
     */
    public void ack(long batchId) {
        this.batches.remove(batchId);
    }

    /** No resources to release. */
    public void close() {
    }

    /**
     * Returns a component configuration with the maximum task parallelism
     * set to 1.
     *
     * @return the component configuration
     */
    public Map getComponentConfiguration() {
        Config conf = new Config();
        conf.setMaxTaskParallelism(1);
        return conf;
    }

    /**
     * Declares the single output field: every tuple carries one line of text
     * in the field "line".
     *
     * @return the output fields of this spout
     */
    public Fields getOutputFields() {
        return new Fields("line");
    }
}
}