Real-time data processing in the Hadoop ecosystem mainly relies on stream-processing frameworks such as Apache Storm and Apache Flink. The basic steps for real-time processing with each framework are:
Apache Storm:
1. Set up the environment (see the commands after this list)
2. Write the topology
3. Submit the topology (see the storm jar command after the example code)
4. Monitor and manage
5. Output the data
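For step 1, a single-node test cluster is enough to run the example below. A minimal sketch, assuming Storm is installed, the storm command is on the PATH, and a ZooKeeper instance is already running:

storm nimbus &
storm supervisor &
storm ui &

The Storm UI (port 8080 by default) then covers the monitoring and management step.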
Apache Flink:
1. Set up the environment (see the command after this list)
2. Write the program
3. Test locally
4. Deploy to the cluster (see the flink run command after the example code)
5. Monitor and manage
6. Output the data
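For Flink, step 1 can likewise begin with a local standalone cluster. A minimal sketch, assuming a Flink distribution has been unpacked and you are in its root directory:

./bin/start-cluster.sh

The Flink Web UI then listens on http://localhost:8081 and covers the monitoring step; ./bin/stop-cluster.sh shuts the cluster down again.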
A complete Storm word-count topology example (WordCountTopology.java):

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.HashMap;
import java.util.Map;

public class WordCountTopology {

    // Spout that endlessly cycles through a fixed set of sentences
    public static class RandomSentenceSpout extends BaseRichSpout {
        SpoutOutputCollector collector;
        String[] sentences = new String[]{
            "the cow jumped over the moon",
            "an apple a day keeps the doctor away",
            "four score and seven years ago",
            "snow white and the seven dwarfs",
            "i am at two with nature"
        };
        int index = 0;

        @Override
        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            this.collector = collector;
        }

        @Override
        public void nextTuple() {
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            String sentence = sentences[index];
            index = (index + 1) % sentences.length;
            collector.emit(new Values(sentence));
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("sentence"));
        }
    }

    // Bolt that splits each sentence into individual words
    public static class SplitSentence extends BaseBasicBolt {
        @Override
        public void execute(Tuple tuple, BasicOutputCollector collector) {
            String sentence = tuple.getString(0);
            for (String word : sentence.split(" ")) {
                collector.emit(new Values(word));
            }
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word"));
        }
    }

    // Bolt that keeps a running count per word in memory
    public static class WordCount extends BaseBasicBolt {
        Map<String, Integer> counts = new HashMap<>();

        @Override
        public void execute(Tuple tuple, BasicOutputCollector collector) {
            String word = tuple.getString(0);
            Integer count = counts.getOrDefault(word, 0) + 1;
            counts.put(word, count);
            collector.emit(new Values(word, count));
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word", "count"));
        }
    }

    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("spout", new RandomSentenceSpout(), 5);
        builder.setBolt("split", new SplitSentence(), 8).shuffleGrouping("spout");
        builder.setBolt("count", new WordCount(), 12).fieldsGrouping("split", new Fields("word"));

        Config conf = new Config();
        conf.setDebug(true);

        if (args != null && args.length > 0) {
            // Cluster mode: args[0] is used as the topology name
            conf.setNumWorkers(3);
            StormSubmitter.submitTopologyWithProgressBar(args[0], conf, builder.createTopology());
        } else {
            // Local mode: run in-process for ten seconds, then shut down
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology("word-count", conf, builder.createTopology());
            Thread.sleep(10000);
            cluster.killTopology("word-count");
            cluster.shutdown();
        }
    }
}
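Running main() with no arguments uses the in-process LocalCluster, which is convenient for local testing. To submit the same topology to a real cluster (step 3), package the project into a jar and pass the topology name as the first argument; a sketch, assuming the jar is named wordcount.jar and the class is in the default package:

storm jar wordcount.jar WordCountTopology word-count

Progress and worker logs can then be followed in the Storm UI.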
The same word count implemented with Flink's DataStream API (WordCount.java):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class WordCount {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // A bounded in-memory source, convenient for local testing
        DataStream<String> text = env.fromElements(
            "the cow jumped over the moon",
            "an apple a day keeps the doctor away",
            "four score and seven years ago",
            "snow white and the seven dwarfs",
            "i am at two with nature"
        );

        DataStream<Tuple2<String, Integer>> counts = text
            // Split each line into (word, 1) pairs
            .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                @Override
                public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
                    for (String word : value.toLowerCase().split("\\s+")) {
                        if (word.length() > 0) {
                            out.collect(new Tuple2<>(word, 1));
                        }
                    }
                }
            })
            .returns(Types.TUPLE(Types.STRING, Types.INT))
            // Group by the word and keep a running sum of the counts
            .keyBy(value -> value.f0)
            .sum(1);

        counts.print();
        env.execute("Word Count");
    }
}
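counts.print() writes the running counts to standard output (the TaskManager logs when running on a cluster); for the data-output step you would normally replace it with a real sink, such as a file or Kafka sink. To deploy the job to the cluster started earlier (step 4), package it into a jar and submit it with the Flink CLI; a sketch, assuming the jar is named flink-wordcount.jar:

./bin/flink run -c WordCount flink-wordcount.jar

The running job and its output can then be inspected in the Flink Web UI.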
With the steps and example code above, you can start using the real-time processing tools in the Hadoop ecosystem to process and analyze data streams.