Flink Processing Flow
1. Reading Data from a Collection
import bag.day01.SensorReading
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
// provides the implicit TypeInformation conversions
import org.apache.flink.api.scala._

object CollectionSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // set parallelism
    env.setParallelism(1)
    // read data from a collection
    val stream1 = env.fromCollection(List(
      SensorReading("sensor_1", 1547718199, 35.8),
      SensorReading("sensor_6", 1547718201, 15.4),
      SensorReading("sensor_7", 1547718202, 6.7),
      SensorReading("sensor_10", 1547718205, 38.1)
    ))
    stream1.print()
    env.execute()
  }
}
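For a handful of literal values there is also fromElements, which skips building the List explicitly. A minimal sketch (same env and SensorReading as above):

    // equivalent shorthand for small literal datasets
    val stream2 = env.fromElements(
      SensorReading("sensor_1", 1547718199, 35.8),
      SensorReading("sensor_6", 1547718201, 15.4)
    )
    stream2.print()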
2. Reading from a File
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._

object FileSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // read the file line by line as a DataStream[String]
    val read_text = env.readTextFile("C:\\Users\\Administrator\\Desktop\\文档\\01.txt")
    read_text.print()
    env.execute()
  }
}
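readTextFile yields one String per line, so the usual next step is parsing each line into the SensorReading case class. A minimal sketch, assuming the file holds comma-separated records such as sensor_1,1547718199,35.8:

    val sensorStream = read_text.map(line => {
      // fields: id, timestamp, temperature
      val fields = line.split(",")
      SensorReading(fields(0).trim, fields(1).trim.toLong, fields(2).trim.toDouble)
    })
    sensorStream.print()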
3. Monitoring a Socket
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamWordCount {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.createRemoteEnvironment("hadoop101", 7077)
    // read --host and --port from the command line
    val tool: ParameterTool = ParameterTool.fromArgs(args)
    val host = tool.get("host")
    val port = tool.getInt("port")
    env.setParallelism(4)
    // env.disableOperatorChaining() // break all operator chains
    val inputStream: DataStream[String] = env.socketTextStream(host, port)
    val dataInfo = inputStream
      .flatMap(_.split(" ")).slotSharingGroup("a") // isolate into slot-sharing group "a"
      .filter(_.nonEmpty)
      .map((_, 1)).setParallelism(3)
      .keyBy(0)
      .sum(1)
    // with sink parallelism 1, all results pass through a single task, so the printed
    // output does not interleave; ordering across different keys is still not guaranteed
    dataInfo.print().setParallelism(1)
    env.execute("word_count")
  }
}
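To try this out, start a plain text server on the target host first, e.g. nc -lk 7777, then launch the job with --host hadoop101 --port 7777; ParameterTool.fromArgs parses exactly these --key value pairs. Note that createRemoteEnvironment(host, port, jarFiles*) normally also needs the path to the jar containing the job as an extra argument so the cluster can load the user code; for local testing, getExecutionEnvironment is simpler.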
4. Consuming from Kafka
Add the Maven dependency:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.11_2.12</artifactId>
    <version>1.10.1</version>
</dependency>
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import java.util.Properties

case class SensorReading(id: String, timestamp: Long, temp: Double)

object TestRemote {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Kafka consumer configuration
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "lin:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    // start from the latest offset when the group has no committed offset
    properties.setProperty("auto.offset.reset", "latest")
    // subscribe to the "flink" topic, deserializing each record as a plain string
    val stream3 = env.addSource(new FlinkKafkaConsumer011[String]("flink", new SimpleStringSchema(), properties))
    stream3.print()
    env.execute()
  }
}
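To feed the job for a quick test, a console producer works, e.g. bin/kafka-console-producer.sh --broker-list lin:9092 --topic flink (assuming a 0.11-era broker reachable at lin:9092 and a topic named flink, matching the code above); every line typed there should appear in the Flink output. Note that record deserialization is handled by the SimpleStringSchema passed to the consumer, not by the key.deserializer/value.deserializer properties.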
5. Custom Source
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.api.scala._
import scala.util.Random

object MySource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val sensorStream = env.addSource(new MySourceFunction())
    // sensorStream.filter(x => x.temp > 100).print()
    // split the stream by temperature threshold
    val splitStream = sensorStream.split(data => {
      if (data.temp > 100) Seq("high")
      else Seq("low")
    })
    val low = splitStream.select("low").map(x => (x.id, x.temp))
    val high = splitStream.select("high")
    val warnings = high.map(x => (x.id, x.temp))
    // connect the two streams and map each side with its own function
    val connected = warnings.connect(low)
    val result = connected.map(
      warningData => (warningData._1, warningData._2, "warning"),
      lowData => (lowData._1, "healthy")
    )
    result.print()
    env.execute()
  }
}
class MySourceFunction extends SourceFunction[SensorReading] {
  // flag that keeps the generating loop alive; cancel() flips it
  var running = true

  override def cancel(): Unit = running = false

  override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
    // initialize a random-number generator
    val rand = new Random()
    // initial temperatures for 100 sensors, drawn from a Gaussian around 65
    var curTemp = 1.to(100).map(i => ("sensor_" + i, 65 + rand.nextGaussian() * 20))
    while (running) {
      // update each temperature with Gaussian noise (a random walk)
      curTemp = curTemp.map(t => (t._1, t._2 + rand.nextGaussian()))
      // get the current timestamp
      val curTime = System.currentTimeMillis()
      // emit one reading per sensor
      curTemp.foreach(t => ctx.collect(SensorReading(t._1, curTime, t._2)))
      Thread.sleep(100)
    }
  }
}
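split/select is deprecated in later Flink releases; side outputs via a ProcessFunction are the recommended replacement. A minimal sketch of the same high/low routing, reusing sensorStream from above (the tag name "high" is arbitrary):

    import org.apache.flink.streaming.api.functions.ProcessFunction
    import org.apache.flink.streaming.api.scala.OutputTag
    import org.apache.flink.util.Collector

    val highTag = new OutputTag[SensorReading]("high")
    val mainStream = sensorStream.process(new ProcessFunction[SensorReading, SensorReading] {
      override def processElement(value: SensorReading,
                                  ctx: ProcessFunction[SensorReading, SensorReading]#Context,
                                  out: Collector[SensorReading]): Unit = {
        if (value.temp > 100) ctx.output(highTag, value) // route high readings to the side output
        else out.collect(value)                          // everything else stays on the main stream
      }
    })
    val highStream = mainStream.getSideOutput(highTag)

Unlike split, a side output can carry a different type than the main stream, and an operator can have any number of them.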