Kafka + Spark Streaming：将消费偏移量提交至 Redis（Kafka 1.0 版本）

Kafka 版本：1.0.0

Spark 与 Kafka 的集成依赖：spark-streaming-kafka-0-10_2.11（对应 Scala 2.11）
/**
 * Creates a Kafka direct stream for Spark Streaming and keeps the consumer offsets in Redis.
 *
 * Offsets live in a Redis hash whose key is `<topic>|<group.id>`. Each hash field is a
 * partition number and each value is the offset consumption should resume from.
 *
 * Originally created by imp on 2019/12/21.
 *
 * @param zkHost      ZooKeeper connection string, used to list the partitions of a topic
 * @param kafkaParams Kafka consumer configuration; must contain `group.id`
 */
class KafkaManagerByRedis(zkHost: String, kafkaParams: Map[String, Object]) extends Logging {

  // The two numeric arguments are the ZooKeeper session and connection timeouts (ms).
  private val (zkClient, zkConnection) = ZkUtils.createZkClientAndConnection(zkHost, 10000, 10000)
  // `false`: ZooKeeper security is not enabled for this client.
  private val zkUtils = new ZkUtils(zkClient, zkConnection, false)
  private val jedis = JedisUtil.getInstance().getJedis

  /** Consumer group id from `kafkaParams`; fails fast with a clear message when it is missing. */
  private def configuredGroupId: String =
    kafkaParams.get("group.id").map(_.toString).getOrElse(
      throw new IllegalArgumentException("kafkaParams must contain 'group.id'"))

  /**
   * Builds a direct stream subscribed to `topics`, starting from the offsets stored in Redis.
   *
   * Only the offsets of the FIRST topic are restored from Redis (as before); any further topic
   * starts from its committed offset or the configured `auto.offset.reset` policy.
   *
   * @param ssc    the streaming context the stream is attached to
   * @param topics topics to subscribe to; must not be empty
   * @return the input stream of consumer records
   * @throws IllegalArgumentException if `topics` is empty or `group.id` is not configured
   */
  def createDirectStream[K: ClassTag, V: ClassTag](ssc: StreamingContext, topics: Seq[String]): InputDStream[ConsumerRecord[K, V]] = {
    require(topics.nonEmpty, "createDirectStream requires at least one topic")
    if (topics.size > 1) {
      logger.warn(s"Offsets are restored from Redis for '${topics.head}' only; other topics use Kafka defaults")
    }
    val startingOffsets: Map[TopicPartition, Long] = readOffset(topics.head, configuredGroupId)
    KafkaUtils.createDirectStream[K, V](
      ssc,
      PreferConsistent,
      Subscribe[K, V](topics, kafkaParams, startingOffsets)
    )
  }

  /**
   * Resolves the starting offset of every partition of `topic`.
   *
   * For each partition listed in ZooKeeper the stored Redis offset is used when it exists and
   * lies inside Kafka's current [earliest, latest] range. A missing, unparsable or out-of-range
   * value falls back to the earliest available offset. Partitions for which Kafka reports no
   * earliest offset and Redis holds nothing are left out of the result.
   *
   * @param topic   topic whose offsets are read
   * @param groupId consumer group, part of the Redis key
   * @return starting offset per topic partition
   */
  private def readOffset(topic: String, groupId: String): Map[TopicPartition, Long] = {
    import scala.collection.JavaConverters._

    val redisKey = topic + "|" + groupId
    // One round trip for the whole hash instead of one per partition.
    val storedOffsets: Map[String, String] =
      Option(jedis.hgetAll(redisKey)).map(_.asScala.toMap).getOrElse(Map.empty[String, String])

    if (storedOffsets.nonEmpty) logger.info("groupId:" + groupId + "获取到redis的偏移量数据")
    else logger.info("程序第一次启动,redis还未存储，获取kafka的偏移量")

    // Partition list comes from ZooKeeper.
    val partitions: Seq[Int] =
      zkUtils.getPartitionsForTopics(Seq(topic)).getOrElse(topic, Seq.empty[Int])

    val earliestOffsets = getEarliestOffsets(kafkaParams, topic)
    val latestOffsets = getLatestOffsets(kafkaParams, List(topic))

    partitions.flatMap { partition =>
      val topicPartition = new TopicPartition(topic, partition)
      val stored: Option[Long] =
        storedOffsets.get(partition.toString).flatMap(raw => scala.util.Try(raw.trim.toLong).toOption)
      val earliest: Option[Long] = earliestOffsets.get(topicPartition)
      val latest: Option[Long] = latestOffsets.get(topicPartition)

      val resolved: Option[Long] = (stored, earliest) match {
        // Stored offset fell out of the retained range (e.g. log retention): restart from earliest.
        case (Some(offset), Some(low)) if offset < low || latest.exists(offset > _) =>
          logger.warn(s"Offset $offset of $topicPartition is outside [$low, ${latest.getOrElse("?")}]; resetting to $low")
          Some(low)
        case (Some(offset), _) => Some(offset)
        // Nothing usable in Redis for this partition: start from the beginning of the log.
        case (None, low) => low
      }

      if (resolved.isEmpty) {
        logger.warn(s"No starting offset could be determined for $topicPartition; leaving it to Kafka defaults")
      }
      resolved.map(offset => topicPartition -> offset)
    }.toMap
  }

  /**
   * Returns the earliest available offset of every partition of `topic`.
   *
   * @param kafkaParams consumer configuration used to reach the cluster
   * @param topic       topic to inspect
   * @return earliest offset per partition; empty when the topic has no known partitions
   */
  private def getEarliestOffsets(kafkaParams: Map[String, Object], topic: String): Map[TopicPartition, Long] =
    fetchBoundaryOffsets(kafkaParams, Seq(topic), "earliest") {
      (consumer, partitions) => consumer.beginningOffsets(partitions)
    }

  /**
   * Returns the end offset of every partition of the given topics.
   *
   * @param kafkaParams consumer configuration used to reach the cluster
   * @param topic       topics to inspect
   * @return end offset per partition; empty when no partitions are known
   */
  private def getLatestOffsets(kafkaParams: Map[String, Object], topic: Seq[String]): Map[TopicPartition, Long] =
    fetchBoundaryOffsets(kafkaParams, topic, "latest") {
      (consumer, partitions) => consumer.endOffsets(partitions)
    }

  /**
   * Opens a short-lived consumer, lists the partitions of `topics` and asks Kafka for one
   * boundary offset per partition through `lookup`.
   *
   * The consumer never subscribes or polls, so it does not join the consumer group, and it is
   * always closed, also when a call fails.
   *
   * @param params      consumer configuration
   * @param topics      topics whose partitions are inspected
   * @param resetPolicy value forced for `auto.offset.reset` on the temporary consumer
   * @param lookup      the offset query to run (beginning or end offsets)
   */
  private def fetchBoundaryOffsets(params: Map[String, Object], topics: Seq[String], resetPolicy: String)(
      lookup: (KafkaConsumer[String, Object], java.util.Collection[TopicPartition]) => java.util.Map[TopicPartition, java.lang.Long]
  ): Map[TopicPartition, Long] = {
    import scala.collection.JavaConverters._

    val consumerParams: Map[String, Object] = params + (ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> resetPolicy)
    val consumer = new KafkaConsumer[String, Object](consumerParams.asJava)
    try {
      val partitions: Seq[TopicPartition] = topics.flatMap { name =>
        // partitionsFor may yield null for an unknown topic, hence the Option wrapper.
        Option(consumer.partitionsFor(name)).map(_.asScala.toList).getOrElse(Nil)
          .map(info => new TopicPartition(info.topic(), info.partition()))
      }
      if (partitions.isEmpty) Map.empty[TopicPartition, Long]
      else lookup(consumer, partitions.asJava).asScala.map {
        case (topicPartition, offset) => topicPartition -> offset.longValue()
      }.toMap
    } finally {
      consumer.close()
    }
  }

  /**
   * Writes the offsets of one processed batch to Redis.
   *
   * Only offset ranges belonging to `topic` are written, because the Redis key is derived from
   * `topic`; ranges of other subscribed topics would otherwise overwrite its partition fields.
   *
   * @param rdd         batch RDD produced by the direct stream (must carry offset ranges)
   * @param storeOffset `true` stores the end of each range (batch processed), `false` stores
   *                    its start (batch must be replayed)
   * @param topic       topic whose offsets are persisted
   */
  def persistOffset[K, V](rdd: RDD[ConsumerRecord[K, V]], storeOffset: Boolean = true, topic: String): Unit = {
    val redisKey = topic + "|" + configuredGroupId
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    offsetRanges.filter(_.topic == topic).foreach { range =>
      val offsetToStore = if (storeOffset) range.untilOffset else range.fromOffset
      jedis.hset(redisKey, range.partition.toString, offsetToStore.toString)
      println(s"topic:${range.topic}分区:${range.partition}开始消费${range.fromOffset}消费到${range.untilOffset}共计${range.count()}")
    }
  }

  /**
   * Releases the ZooKeeper and Redis connections held by this manager.
   * Call it once the streaming job no longer needs to read or persist offsets.
   */
  def close(): Unit = {
    try zkUtils.close()
    finally jedis.close()
  }

}

/**
 * Manual smoke test: prints the earliest and the latest offset of every partition of a topic.
 */
object KafkaManagerByRedis {

  /**
   * @param args optional; `args(0)` is the ZooKeeper connection string (defaults to the empty
   *             string, as before)
   */
  def main(args: Array[String]): Unit = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.121.12:9092,192.168.121.12:9093",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "test1",
      // Must be exactly "earliest": the former value carried a trailing space, which Kafka's
      // configuration validation rejects.
      "auto.offset.reset" -> "earliest",
      // Disable auto-commit; otherwise offsets may be committed before the records are fully
      // processed, which corrupts the results.
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val zkServer = args.headOption.getOrElse("")
    val topic = "cheng_du_gps_topic"
    val manager = new KafkaManagerByRedis(zkServer, kafkaParams)

    manager.getEarliestOffsets(kafkaParams, topic).foreach {
      case (topicPartition, offset) =>
        println((topicPartition.topic(), topicPartition.partition(), offset))
    }

    manager.getLatestOffsets(kafkaParams, List(topic)).foreach {
      case (topicPartition, offset) =>
        println((topicPartition.topic(), topicPartition.partition(), offset))
    }
  }
}

kafka-spark偏移量提交至redis kafka1.0版本的更多相关文章

Redis 3.0版本启动时出现警告的解决办法
原文:http://m.blog.csdn.net/article/details?id=50864933 Redis 3.0.7版本启动时出现警告的解决办法发表于2016/3/12 12:52:4 ...
centos安装redis 5.0版本的集群
我在本地VM-Centos里安装5.0.5时安装遇到了些问题,参考了Blog:https://www.cnblogs.com/shawhe/p/9548620.html 顺利安装完成. 安装redis ...
Redis 3.0正式版发布，正式支持Redis集群
Redis是一个开源.基于C语言.基于内存亦可持久化的高性能NoSQL数据库,同时,它还提供了多种语言的API.近日,Redis 3.0在经过6个RC版本后,其正式版终于发布了.Redis 3.0的最 ...
【转载】Redis 4.0 自动内存碎片整理（Active Defrag）源码分析
click原文链接原文链接:https://blog.csdn.net/zouhuajianclever/article/details/90669409阅读本文前建议先阅读此篇博客: Redis源码 ...
阿里云发布 Redis 5.0 缓存服务：全新 Stream 数据类型带来不一样缓存体验
4月24日,阿里云正式宣布推出全新 Redis 5.0 版本云数据库缓存服务,据悉该服务完全兼容 4.0 及早期版本,继承了其一贯的安全,稳定,高效等特点并带来了全新的 Stream 数据结构及多项优 ...
Redis 源码简洁剖析 11 - 主 IO 线程及 Redis 6.0 多 IO 线程
Redis 到底是不是单线程的程序? 多 IO 线程的初始化 IO 线程运行函数 IOThreadMain 如何推迟客户端「读」操作? 如何推迟客户端「写」操作? 如何把待「读」客户端分配给 IO 线 ...
demo2 Kafka+Spark Streaming+Redis实时计算整合实践 foreachRDD输出到redis
基于Spark通用计算平台,可以很好地扩展各种计算类型的应用,尤其是Spark提供了内建的计算库支持,像Spark Streaming.Spark SQL.MLlib.GraphX,这些内建库都提供了 ...
Kafka：ZK+Kafka+Spark Streaming集群环境搭建（十三）kafka+spark streaming打包好的程序提交时提示虚拟内存不足（Container is running beyond virtual memory limits. Current usage: 119.5 MB of 1 GB physical memory used; 2.2 GB of 2.1 G）
异常问题:Container is running beyond virtual memory limits. Current usage: 119.5 MB of 1 GB physical mem ...
Kafka：ZK+Kafka+Spark Streaming集群环境搭建（九）安装kafka_2.11-1.1.0
如何搭建配置centos虚拟机请参考<Kafka:ZK+Kafka+Spark Streaming集群环境搭建(一)VMW安装四台CentOS,并实现本机与它们能交互,虚拟机内部实现可以上网.& ...

随机推荐

PyQt(Python+Qt)学习随笔：Qt Designer中部件的toolTip、toolTipDuration、statusTip、whatsThis属性
toolTip属性 toolTip属性设置部件的toolTip提示信息,toolTip提示信息在鼠标放到控件上会浮动出一个小框显示提示信息.默认情况下,仅显示活动窗口子部件的toolTip,可以通过在 ...
secret_key伪造session来进行越权
从swpuctf里面的一道ctf题目来讲解secret_key伪造session来进行越权. 以前没有遇到过这种题目,这次遇到了之后查了一些资料把它做了出来,记录一下知识点. 参考资料 http:// ...
如何写好PPT，什么样的PPT容易被人理解记住
PPT一般是用于讲解性的行为而存在,那如果写好PPT呢?如果写好,这个完全要取决于你所面向的目标读者,是用于学术行为呢?还是用于商业行为.面对不同的目标群体,有不同的策略.但是无论面向群体是谁我们都有 ...
Struts2 S2-061(CVE-2020-17530)漏洞复现
0x00 漏洞简介 Apache Struts2框架是一个用于开发Java EE网络应用程序的Web框架.Apache Struts于2020年12月08日披露 S2-061 Struts 远程代码执 ...
C++11新特性变参模板、完美转发（简述）
变参模板 (Variadic Template) - 使得 emplace 可以接受任意参数,这样就可以适用于任意对象的构建完美转发 - 使得接收下来的参数能够原样的传递给对象的构造函数,这带来另 ...
masterha_check_repl --conf=/etc/mha/app1.cnf检查错误
[mysql@node3 ~]$ masterha_check_repl --conf=/etc/mha/app1.cnf Tue Jul 7 22:43:26 2020 - [warning] Gl ...
dataframe，list，numpy之间的互相转换
dataframe,numpy,list之间的互相转换由于目前学校要做一些数据分析处理的作业有要用到dataframe,list,numpy之间的转化,所以在此总结一下这些用法. dataframe ...
IOS开发中实现UITableView按照首字母将集合进行检索分组
在开发公司项目中遇到了将图书目录进行按照首字母分组排序的问题 1.在项目添加解析汉字拼音的Pinyin.h文件 /* * pinyin.c */ #define HANZI_START 19968 # ...
自顶向下redis4.0（5）持久化
redis4.0的持久化目录 redis4.0的持久化简介正文 rdb持久化 save命令 bgsave命令 rdb定期保存数据进程结束保存数据 aof持久化数据缓冲区刷新数据到磁盘 ap ...
get \post 接口代码及断言编写
post 请求接口 import requests import json url_path = "http://www.baidu.com" data = {"user ...

kafka-spark偏移量提交至redis kafka1.0版本

kafka-spark偏移量提交至redis kafka1.0版本的更多相关文章

随机推荐

热门专题