Java example

This DataStream job consumes two Kafka topics of "key,value" records, joins them by key over a 30-second sliding event-time window that slides every 10 seconds, and writes the joined results to MySQL.

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Properties;

public class KafkaStreamJoin {

    public static void main(String[] args) throws Exception {
        // Create the execution environment (event time is the default in Flink 1.12)
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Kafka connection settings
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092");
        props.setProperty("group.id", "test");

        // Create a FlinkKafkaConsumer for each of the two source topics
        FlinkKafkaConsumer<String> kafkaConsumer1 =
                new FlinkKafkaConsumer<>("topic1", new SimpleStringSchema(), props);
        DataStream<String> stream1 = env.addSource(kafkaConsumer1);
        FlinkKafkaConsumer<String> kafkaConsumer2 =
                new FlinkKafkaConsumer<>("topic2", new SimpleStringSchema(), props);
        DataStream<String> stream2 = env.addSource(kafkaConsumer2);

        // Parse "key,value" records and extract timestamps for event-time windows
        // (the integer field doubles as the event timestamp in this example)
        DataStream<Tuple2<String, Integer>> keyedStream1 = stream1.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                String[] parts = value.split(",");
                return new Tuple2<>(parts[0], Integer.parseInt(parts[1]));
            }
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple2<String, Integer>>() {
            @Override
            public long extractAscendingTimestamp(Tuple2<String, Integer> element) {
                return element.f1;
            }
        }).keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });

        DataStream<Tuple2<String, Integer>> keyedStream2 = stream2.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                String[] parts = value.split(",");
                return new Tuple2<>(parts[0], Integer.parseInt(parts[1]));
            }
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple2<String, Integer>>() {
            @Override
            public long extractAscendingTimestamp(Tuple2<String, Integer> element) {
                return element.f1;
            }
        }).keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });

        // Join the two streams on key over a 30s sliding window that slides every 10s
        DataStream<Tuple2<String, Integer>> result = keyedStream1.join(keyedStream2)
                .where(new KeySelector<Tuple2<String, Integer>, String>() {
                    @Override
                    public String getKey(Tuple2<String, Integer> value) throws Exception {
                        return value.f0;
                    }
                })
                .equalTo(new KeySelector<Tuple2<String, Integer>, String>() {
                    @Override
                    public String getKey(Tuple2<String, Integer> value) throws Exception {
                        return value.f0;
                    }
                })
                .window(SlidingEventTimeWindows.of(Time.seconds(30), Time.seconds(10)))
                .apply(new JoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> join(Tuple2<String, Integer> left, Tuple2<String, Integer> right) {
                        // Emit the key together with the sum of both values for each matched pair
                        return new Tuple2<>(left.f0, left.f1 + right.f1);
                    }
                });

        // Write the results to MySQL (requires flink-connector-jdbc on the classpath)
        result.addSink(JdbcSink.sink(
                "INSERT INTO result (`key`, `count`) VALUES (?, ?)",
                (ps, value) -> {
                    ps.setString(1, value.f0);
                    ps.setInt(2, value.f1);
                },
                JdbcExecutionOptions.builder().withBatchIntervalMs(5000).build(),
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withUrl("jdbc:mysql://localhost:3306/test")
                        .withDriverName("com.mysql.cj.jdbc.Driver")
                        .withUsername("root")
                        .withPassword("password")
                        .build()));

        // Execute the job
        env.execute("KafkaStreamJoin");
    }
}
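For a quick local test, a minimal producer sketch like the following can feed the two topics. The topic names and the "key,value" format match the job above; everything else (class name, serializer settings, sample values) is illustrative:

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class TestDataProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092");
        props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // "key,value" records; the integer field doubles as the event timestamp
            producer.send(new ProducerRecord<>("topic1", "a,1000"));
            producer.send(new ProducerRecord<>("topic2", "a,2000"));
            producer.flush();
        }
    }
}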


Using the Flink Table API

The same pipeline expressed with the Table API / SQL: an event-time interval join between the two topics, a sliding (HOP) window aggregation, and a JDBC sink into MySQL.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import java.util.Properties;

import static org.apache.flink.table.api.Expressions.$;

public class KafkaStreamJoinTable {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance()
                .useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092");
        props.setProperty("group.id", "test");

        // Read both topics and parse "key,value,timestampMillis" records
        DataStream<Tuple3<String, Integer, Long>> stream1 = parse(
                env.addSource(new FlinkKafkaConsumer<>("stream1", new SimpleStringSchema(), props)));
        DataStream<Tuple3<String, Integer, Long>> stream2 = parse(
                env.addSource(new FlinkKafkaConsumer<>("stream2", new SimpleStringSchema(), props)));

        // Register the streams as tables with event-time (rowtime) attributes
        tableEnv.createTemporaryView("t1",
                tableEnv.fromDataStream(stream1, $("key1"), $("value1"), $("ts1").rowtime()));
        tableEnv.createTemporaryView("t2",
                tableEnv.fromDataStream(stream2, $("key2"), $("value2"), $("ts2").rowtime()));

        // Interval join (matches within ±5s) aggregated over a 30s sliding (HOP) window
        // that advances every 10s
        Table resultTable = tableEnv.sqlQuery(
                "SELECT key1, value1, value2, COUNT(*) AS cnt " +
                "FROM t1 JOIN t2 ON key1 = key2 " +
                "AND ts2 BETWEEN ts1 - INTERVAL '5' SECOND AND ts1 + INTERVAL '5' SECOND " +
                "GROUP BY key1, value1, value2, HOP(ts1, INTERVAL '10' SECOND, INTERVAL '30' SECOND)");

        // Declare the MySQL sink via the JDBC connector (flink-connector-jdbc);
        // "result_agg" is the MySQL table name chosen for this example
        tableEnv.executeSql(
                "CREATE TABLE mysql_sink (" +
                "  key1 STRING, value1 INT, value2 INT, cnt BIGINT" +
                ") WITH (" +
                "  'connector' = 'jdbc'," +
                "  'url' = 'jdbc:mysql://localhost:3306/test'," +
                "  'table-name' = 'result_agg'," +
                "  'driver' = 'com.mysql.cj.jdbc.Driver'," +
                "  'username' = 'root'," +
                "  'password' = '123456'" +
                ")");

        // executeInsert() submits the job itself, so no separate env.execute() is needed
        resultTable.executeInsert("mysql_sink");
    }

    // Parse a CSV line and use its third field as the event-time timestamp
    private static DataStream<Tuple3<String, Integer, Long>> parse(DataStream<String> in) {
        return in.map(new MapFunction<String, Tuple3<String, Integer, Long>>() {
            @Override
            public Tuple3<String, Integer, Long> map(String value) throws Exception {
                String[] parts = value.split(",");
                return new Tuple3<>(parts[0], Integer.parseInt(parts[1]), Long.parseLong(parts[2]));
            }
        }).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple3<String, Integer, Long>>() {
            @Override
            public long extractAscendingTimestamp(Tuple3<String, Integer, Long> element) {
                return element.f2;
            }
        });
    }
}
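To sanity-check the sink, a plain-JDBC query can be run against MySQL once the job has produced output. This sketch assumes the result_agg table, columns, and credentials from the DDL above:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class CheckResults {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:mysql://localhost:3306/test", "root", "123456");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT key1, value1, value2, cnt FROM result_agg")) {
            while (rs.next()) {
                // Print each aggregated row as "key,value1,value2,count"
                System.out.printf("%s,%d,%d,%d%n",
                        rs.getString("key1"), rs.getInt("value1"),
                        rs.getInt("value2"), rs.getLong("cnt"));
            }
        }
    }
}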

Maven dependencies for the Java examples

Group ID          Artifact ID                        Version
org.apache.flink  flink-core                         1.12.5
org.apache.flink  flink-streaming-java_2.12          1.12.5
org.apache.flink  flink-clients_2.12                 1.12.5
org.apache.flink  flink-table-api-java-bridge_2.12   1.12.5
org.apache.flink  flink-table-planner-blink_2.12     1.12.5
org.apache.flink  flink-connector-kafka_2.12         1.12.5
org.apache.flink  flink-connector-jdbc_2.12          1.12.5
org.apache.kafka  kafka-clients                      2.4.1
mysql             mysql-connector-java               8.0.23
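
For convenience, the same dependencies as a pom.xml fragment, in the format used by the Scala section below (versions exactly as listed in the table):

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>1.12.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.12</artifactId>
        <version>1.12.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.12</artifactId>
        <version>1.12.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_2.12</artifactId>
        <version>1.12.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_2.12</artifactId>
        <version>1.12.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.12</artifactId>
        <version>1.12.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-jdbc_2.12</artifactId>
        <version>1.12.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.4.1</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.23</version>
    </dependency>
</dependencies>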


Scala example

The same two-stream join in Scala: sensor readings from two topics are joined by id over a sliding window, averaged, filtered, and written to MySQL through JdbcSink.

import java.sql.PreparedStatement
import java.util.Properties

import org.apache.flink.api.common.functions.JoinFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.scala._
import org.apache.flink.connector.jdbc.{JdbcConnectionOptions, JdbcSink, JdbcStatementBuilder}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer

object FlinkKafkaJoinExample {

  case class SensorReading(id: String, timestamp: Long, temperature: Double)

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Read the two input streams from Kafka
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "localhost:9092")
    properties.setProperty("group.id", "test-group")
    properties.setProperty("auto.offset.reset", "earliest")

    val stream1: DataStream[SensorReading] = env.addSource(
        new FlinkKafkaConsumer[String]("topic1", new SimpleStringSchema(), properties)
      )
      .map(data => {
        val dataArray = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })
      .assignAscendingTimestamps(_.timestamp)

    val stream2: DataStream[SensorReading] = env.addSource(
        new FlinkKafkaConsumer[String]("topic2", new SimpleStringSchema(), properties)
      )
      .map(data => {
        val dataArray = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })
      .assignAscendingTimestamps(_.timestamp)

    // Join the two streams by sensor id over a sliding event-time window
    // (event-time windows, matching the timestamps assigned above)
    val joinedStream: DataStream[(String, Double, Double)] = stream1.join(stream2)
      .where(_.id)
      .equalTo(_.id)
      .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
      .apply(new JoinFunction[SensorReading, SensorReading, (String, Double, Double)] {
        override def join(first: SensorReading, second: SensorReading): (String, Double, Double) =
          (first.id, first.temperature, second.temperature)
      })

    // Apply operators on the joined stream: average the two readings, keep only hot sensors
    val resultStream: DataStream[(String, Double)] = joinedStream
      .map(data => (data._1, (data._2 + data._3) / 2))
      .filter(data => data._2 > 30)

    // Write the results to MySQL (requires flink-connector-jdbc)
    resultStream.addSink(JdbcSink.sink[(String, Double)](
      "INSERT INTO result_table (id, temperature) VALUES (?, ?)",
      new JdbcStatementBuilder[(String, Double)] {
        override def accept(ps: PreparedStatement, v: (String, Double)): Unit = {
          ps.setString(1, v._1)
          ps.setDouble(2, v._2)
        }
      },
      new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
        .withUrl("jdbc:mysql://localhost:3306/test")
        .withDriverName("com.mysql.cj.jdbc.Driver")
        .withUsername("root")
        .withPassword("password")
        .build()
    ))

    env.execute("Flink Kafka Join Example")
  }
}
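All three jobs assume their MySQL sink tables already exist. A one-off sketch along these lines can create them; the column types here are assumptions, so adjust them to the real schema:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateSinkTables {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:mysql://localhost:3306/test", "root", "password");
             Statement stmt = conn.createStatement()) {
            // Sink for the Java DataStream example
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS result (" +
                    "`key` VARCHAR(64), `count` INT)");
            // Sink for the Table API example
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS result_agg (" +
                    "key1 VARCHAR(64), value1 INT, value2 INT, cnt BIGINT)");
            // Sink for the Scala example
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS result_table (" +
                    "id VARCHAR(64), temperature DOUBLE)");
        }
    }
}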


Maven dependencies for the Scala example

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_2.11</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.11</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.11</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-jdbc_2.11</artifactId>
        <version>1.12.2</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.26</version>
    </dependency>
</dependencies>
