[Spark Streaming Notes, Part 4] Spark Streaming + Kafka: managing consumer offsets
Environment
Virtual machine: VMware 10
Linux: CentOS-6.5-x86_64
SSH client: Xshell 4
FTP: Xftp 4
JDK 1.8
Scala 2.10.4 (requires JDK 1.8)
Spark 1.6
The offsets of the messages consumed from Kafka are stored in ZooKeeper, MySQL, or HBase and are managed actively by the application itself rather than left to Kafka's automatic commit.
The example below stores and manages the offsets in ZooKeeper:
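The classes that follow assume the Spark 1.6 direct API for Kafka 0.8, Apache Curator, and Jackson are on the classpath. As a rough guide (the exact versions are an assumption and must match your cluster), the Maven artifacts involved are org.apache.spark:spark-streaming_2.10, org.apache.spark:spark-streaming-kafka_2.10, org.apache.curator:curator-framework, and com.fasterxml.jackson.core:jackson-databind.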
package manageoffset;

import java.util.Map;

import kafka.common.TopicAndPartition;
import manageoffset.getOffset.GetTopicOffsetFromKafkaBroker;
import manageoffset.getOffset.GetTopicOffsetFromZookeeper;

import org.apache.log4j.Logger;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class UseZookeeperManageOffset {
    /**
     * log4j logger; UseZookeeperManageOffset.class identifies the class producing the log entries.
     */
    static final Logger logger = Logger.getLogger(UseZookeeperManageOffset.class);

    public static void main(String[] args) {
        // Load the log4j configuration file so that log output works.
        ProjectUtil.LoadLogConfig();
        logger.info("project is starting...");

        /**
         * Get from the Kafka cluster the latest (largest) produced offset of every partition of the topic.
         */
        Map<TopicAndPartition, Long> topicOffsets =
                GetTopicOffsetFromKafkaBroker.getTopicOffsets("node1:9092,node2:9092,node3:9092", "mytopic");

        /**
         * Get from ZooKeeper the offset the consumer group has reached in every partition of the topic.
         */
        Map<TopicAndPartition, Long> consumerOffsets =
                GetTopicOffsetFromZookeeper.getConsumerOffsets("node3:2181,node4:2181,node5:2181", "zhy", "mytopic");

        /**
         * Merge the two maps:
         * if ZooKeeper already holds offsets for this consumer group, they take precedence;
         * otherwise the group is consuming this topic for the first time, and the offsets stay
         * at the latest positions obtained from the brokers.
         */
        if (null != consumerOffsets && consumerOffsets.size() > 0) {
            topicOffsets.putAll(consumerOffsets);
        }

        /**
         * Uncommenting the loop below sets the offset of every partition in topicOffsets to 0,
         * i.e. consumption starts from the beginning of the topic.
         */
//        for (Map.Entry<TopicAndPartition, Long> item : topicOffsets.entrySet()) {
//            item.setValue(0L);
//        }

        /**
         * Build the Spark Streaming application and consume messages starting from these offsets.
         */
        JavaStreamingContext jsc = SparkStreamingDirect.getStreamingContext(topicOffsets, "zhy");
        jsc.start();
        jsc.awaitTermination();
        jsc.close();
    }
}
package manageoffset.getOffset;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import com.google.common.collect.ImmutableMap;

import kafka.api.PartitionOffsetRequestInfo;
import kafka.cluster.Broker;
import kafka.common.TopicAndPartition;
import kafka.javaapi.OffsetRequest;
import kafka.javaapi.OffsetResponse;
import kafka.javaapi.PartitionMetadata;
import kafka.javaapi.TopicMetadata;
import kafka.javaapi.TopicMetadataRequest;
import kafka.javaapi.TopicMetadataResponse;
import kafka.javaapi.consumer.SimpleConsumer;

/**
 * Kafka must be running before this test is executed.
 * @author root
 */
public class GetTopicOffsetFromKafkaBroker {

    public static void main(String[] args) {
        Map<TopicAndPartition, Long> topicOffsets =
                getTopicOffsets("node1:9092,node2:9092,node3:9092", "mytopic");
        Set<Entry<TopicAndPartition, Long>> entrySet = topicOffsets.entrySet();
        for (Entry<TopicAndPartition, Long> entry : entrySet) {
            TopicAndPartition topicAndPartition = entry.getKey();
            Long offset = entry.getValue();
            String topic = topicAndPartition.topic();
            int partition = topicAndPartition.partition();
            System.out.println("topic = " + topic + ", partition = " + partition + ", offset = " + offset);
        }
    }

    /**
     * Get from the Kafka cluster, for every partition of the topic, the offset up to which producers have written.
     * @param KafkaBrokerServer comma-separated host:port broker list
     * @param topic topic name
     * @return map from (topic, partition) to latest produced offset
     */
    public static Map<TopicAndPartition, Long> getTopicOffsets(String KafkaBrokerServer, String topic) {
        Map<TopicAndPartition, Long> retVals = new HashMap<TopicAndPartition, Long>();
        // Iterate over every broker.
        for (String broker : KafkaBrokerServer.split(",")) {
            // Query the broker with a SimpleConsumer.
            SimpleConsumer simpleConsumer = new SimpleConsumer(broker.split(":")[0],
                    Integer.valueOf(broker.split(":")[1]), 64 * 10000, 1024, "consumer");
            TopicMetadataRequest topicMetadataRequest = new TopicMetadataRequest(Arrays.asList(topic));
            TopicMetadataResponse topicMetadataResponse = simpleConsumer.send(topicMetadataRequest);
            // Walk the returned topic metadata.
            for (TopicMetadata metadata : topicMetadataResponse.topicsMetadata()) {
                // Walk the metadata of every partition.
                for (PartitionMetadata part : metadata.partitionsMetadata()) {
                    Broker leader = part.leader();
                    if (leader != null) {
                        TopicAndPartition topicAndPartition = new TopicAndPartition(topic, part.partitionId());
                        PartitionOffsetRequestInfo partitionOffsetRequestInfo =
                                new PartitionOffsetRequestInfo(kafka.api.OffsetRequest.LatestTime(), 10000);
                        OffsetRequest offsetRequest = new OffsetRequest(
                                ImmutableMap.of(topicAndPartition, partitionOffsetRequestInfo),
                                kafka.api.OffsetRequest.CurrentVersion(), simpleConsumer.clientId());
                        OffsetResponse offsetResponse = simpleConsumer.getOffsetsBefore(offsetRequest);
                        if (!offsetResponse.hasError()) {
                            long[] offsets = offsetResponse.offsets(topic, part.partitionId());
                            retVals.put(topicAndPartition, offsets[0]);
                        }
                    }
                }
            }
            simpleConsumer.close();
        }
        return retVals;
    }
}
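As an aside that is not part of the original project: on clusters with kafka-clients 0.10.1 or newer, the same "latest offset per partition" lookup can be done with the new consumer API instead of SimpleConsumer. A minimal sketch follows; the class name is made up for illustration, and the broker list and topic simply mirror the example above.

package manageoffset.getOffset;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;

public class GetLatestOffsetsWithNewConsumer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node1:9092,node2:9092,node3:9092");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(props)) {
            // Collect all partitions of the topic, then ask the brokers for their end offsets.
            List<TopicPartition> partitions = new ArrayList<TopicPartition>();
            for (PartitionInfo info : consumer.partitionsFor("mytopic")) {
                partitions.add(new TopicPartition(info.topic(), info.partition()));
            }
            for (Map.Entry<TopicPartition, Long> e : consumer.endOffsets(partitions).entrySet()) {
                System.out.println("topic = " + e.getKey().topic()
                        + ", partition = " + e.getKey().partition()
                        + ", offset = " + e.getValue());
            }
        }
    }
}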
package manageoffset.getOffset;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.RetryUntilElapsed;

import com.fasterxml.jackson.databind.ObjectMapper;

import kafka.common.TopicAndPartition;

public class GetTopicOffsetFromZookeeper {

    /**
     * Read from ZooKeeper the offset the consumer group has consumed to, for every partition of the topic.
     * @param zkServers comma-separated ZooKeeper host:port list
     * @param groupID consumer group id
     * @param topic topic name
     * @return map from (topic, partition) to consumed offset; empty if nothing has been recorded yet
     */
    public static Map<TopicAndPartition, Long> getConsumerOffsets(String zkServers, String groupID, String topic) {
        Map<TopicAndPartition, Long> retVals = new HashMap<TopicAndPartition, Long>();
        ObjectMapper objectMapper = new ObjectMapper();
        CuratorFramework curatorFramework = CuratorFrameworkFactory.builder()
                .connectString(zkServers)
                .connectionTimeoutMs(1000)
                .sessionTimeoutMs(10000)
                .retryPolicy(new RetryUntilElapsed(1000, 1000))
                .build();
        curatorFramework.start();
        try {
            String nodePath = "/consumers/" + groupID + "/offsets/" + topic;
            if (curatorFramework.checkExists().forPath(nodePath) != null) {
                List<String> partitions = curatorFramework.getChildren().forPath(nodePath);
                for (String partition : partitions) {
                    int partitionId = Integer.valueOf(partition);
                    Long offset = objectMapper.readValue(
                            curatorFramework.getData().forPath(nodePath + "/" + partition), Long.class);
                    TopicAndPartition topicAndPartition = new TopicAndPartition(topic, partitionId);
                    retVals.put(topicAndPartition, offset);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        curatorFramework.close();
        return retVals;
    }

    public static void main(String[] args) {
        Map<TopicAndPartition, Long> consumerOffsets =
                getConsumerOffsets("node3:2181,node4:2181,node5:2181", "zhy", "mytopic");
        Set<Entry<TopicAndPartition, Long>> entrySet = consumerOffsets.entrySet();
        for (Entry<TopicAndPartition, Long> entry : entrySet) {
            TopicAndPartition topicAndPartition = entry.getKey();
            String topic = topicAndPartition.topic();
            int partition = topicAndPartition.partition();
            Long offset = entry.getValue();
            System.out.println("topic = " + topic + ", partition = " + partition + ", offset = " + offset);
        }
    }
}
package manageoffset;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.RetryUntilElapsed;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import com.fasterxml.jackson.databind.ObjectMapper;

public class SparkStreamingDirect {

    /**
     * Build a streaming context that reads Kafka starting from the offsets passed in.
     * @param topicOffsets starting offset for every (topic, partition)
     * @param groupID consumer group id used to build the ZooKeeper offset path
     * @return the JavaStreamingContext, ready to be started
     */
    public static JavaStreamingContext getStreamingContext(Map<TopicAndPartition, Long> topicOffsets, final String groupID) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkStreamingOnKafkaDirect");
        conf.set("spark.streaming.kafka.maxRatePerPartition", "10");
        JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
        jsc.checkpoint("/checkpoint");

        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", "node1:9092,node2:9092,node3:9092");
        // Note: this group.id is only passed to the Kafka parameters; the ZooKeeper offset path uses the groupID argument.
        kafkaParams.put("group.id", "MyFirstConsumerGroup");

        for (Map.Entry<TopicAndPartition, Long> entry : topicOffsets.entrySet()) {
            System.out.println(entry.getKey().topic() + "\t" + entry.getKey().partition() + "\t" + entry.getValue());
        }

        // Direct stream that starts every partition at the offset supplied in topicOffsets.
        JavaInputDStream<String> message = KafkaUtils.createDirectStream(
                jsc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                String.class,
                kafkaParams,
                topicOffsets,
                new Function<MessageAndMetadata<String, String>, String>() {
                    private static final long serialVersionUID = 1L;

                    public String call(MessageAndMetadata<String, String> v1) throws Exception {
                        return v1.message();
                    }
                }
        );

        final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<>();

        // transform() runs on the driver when each batch is generated, so the offset
        // ranges of the current batch are captured here before any output operation executes.
        JavaDStream<String> lines = message.transform(new Function<JavaRDD<String>, JavaRDD<String>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public JavaRDD<String> call(JavaRDD<String> rdd) throws Exception {
                OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
                offsetRanges.set(offsets);
                return rdd;
            }
        });

        message.foreachRDD(new VoidFunction<JavaRDD<String>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(JavaRDD<String> t) throws Exception {
                ObjectMapper objectMapper = new ObjectMapper();
                CuratorFramework curatorFramework = CuratorFrameworkFactory.builder()
                        .connectString("node3:2181,node4:2181,node5:2181")
                        .connectionTimeoutMs(1000)
                        .sessionTimeoutMs(10000)
                        .retryPolicy(new RetryUntilElapsed(1000, 1000))
                        .build();
                curatorFramework.start();
                for (OffsetRange offsetRange : offsetRanges.get()) {
                    long fromOffset = offsetRange.fromOffset();   // first offset of this batch
                    long untilOffset = offsetRange.untilOffset(); // offset after the last record of this batch
                    final byte[] offsetBytes = objectMapper.writeValueAsBytes(offsetRange.untilOffset());
                    String nodePath = "/consumers/" + groupID + "/offsets/"
                            + offsetRange.topic() + "/" + offsetRange.partition();
                    System.out.println("nodePath = " + nodePath);
                    System.out.println("fromOffset = " + fromOffset + ", untilOffset = " + untilOffset);
                    // Record in ZooKeeper how far this topic partition has been consumed.
                    if (curatorFramework.checkExists().forPath(nodePath) != null) {
                        curatorFramework.setData().forPath(nodePath, offsetBytes);
                    } else {
                        curatorFramework.create().creatingParentsIfNeeded().forPath(nodePath, offsetBytes);
                    }
                }
                curatorFramework.close();
            }
        });

        lines.print();
        return jsc;
    }
}
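A design note on the class above: the offsets are written to ZooKeeper once per batch, after the batch has been generated. If the driver fails between reading a batch and updating ZooKeeper, that batch is read again on restart, so the pipeline gives at-least-once rather than exactly-once semantics, and downstream writes should be idempotent or deduplicated.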
package manageoffset;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

public class ProjectUtil {
    /**
     * log4j logger used by this project.
     */
    static final Logger logger = Logger.getLogger(UseZookeeperManageOffset.class);

    /**
     * Load the log4j.properties configuration. By default log4j reads it from the classpath (under src);
     * if the file lives somewhere else it has to be loaded explicitly, as done here.
     */
    public static void LoadLogConfig() {
        PropertyConfigurator.configure("d:/eclipse4.7WS/SparkStreaming_Kafka_Manage/resource/log4j.properties");
    }

    /**
     * Load config.properties.
     * The directory that contains config.properties must be marked as a resource (source) folder.
     * @return the loaded properties (empty if the file is missing)
     */
    public static Properties loadProperties() {
        Properties props = new Properties();
        InputStream inputStream = Thread.currentThread().getContextClassLoader().getResourceAsStream("config.properties");
        if (null != inputStream) {
            try {
                props.load(inputStream);
            } catch (IOException e) {
                logger.error("config.properties could not be read from the classpath", e);
            }
        }
        return props;
    }

    public static void main(String[] args) {
        Properties props = loadProperties();
        String value = props.getProperty("hello");
        System.out.println(value);
    }
}
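One possible improvement, sketched here as an assumption rather than as part of the original project: LoadLogConfig() hard-codes an Eclipse-workspace path, so logging only works on that one machine. A variant that looks log4j.properties up on the classpath instead (the class name below is illustrative):

package manageoffset;

import java.net.URL;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.PropertyConfigurator;

public class LogConfigFromClasspath {
    public static void loadLogConfig() {
        // Look for log4j.properties on the classpath (e.g. in the resource folder).
        URL config = Thread.currentThread().getContextClassLoader().getResource("log4j.properties");
        if (config != null) {
            PropertyConfigurator.configure(config);
        } else {
            // Fall back to a simple console appender so logging still works.
            BasicConfigurator.configure();
        }
    }
}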
1.所需的maven依赖 <dependency> <groupId>org.springframework.data</groupId> <artifact ...