The Complete Kafka Operations Guide: Tuning, Monitoring, and Fault Handling

broker.id=
listeners=SASL_PLAINTEXT://hosip:9092
#broker.id=
listeners=SASL_PLAINTEXT://:9092
zookeeper.connect=zkip1:2181,zkip2:2181,zkip3:2181/kafka
delete.topic.enable=true
# Timeout in ms for connecting to zookeeper
zookeeper.connection.timeout.ms=
zookeeper.session.timeout.ms=
controlled.shutdown.enable=true
# Important
unclean.leader.election.enable=true
auto.create.topics.enable=false
# Number of replica fetcher threads
num.replica.fetchers=
auto.leader.rebalance.enable=true
leader.imbalance.per.broker.percentage=
leader.imbalance.check.interval.seconds=
# Minimum bytes per replica fetch request (1 MB)
replica.fetch.min.bytes=1048576
# Maximum bytes per replica fetch request (20 MB)
replica.fetch.max.bytes=20971520
# How long to wait between replica fetches (ms)
replica.fetch.wait.max.ms=
# If this timeout is exceeded, the replica drops out of the ISR (ms)
replica.socket.timeout.ms=
#replica.fetch.wait.max.ms=
# Replica socket receive buffer size
replica.socket.receive.buffer.bytes=
num.network.threads=
num.io.threads=
# Flush data to disk after every 10,000 messages written by producers
log.flush.interval.messages=10000
# Flush data to disk every 1 second
log.flush.interval.ms=1000
socket.receive.buffer.bytes=
socket.send.buffer.bytes=
queued.max.requests=
sasl.enabled.mechanisms=PLAIN
sasl.mechanism.inter.broker.protocol=PLAIN
allow.everyone.if.no.acl.found=false
super.users=User:admin
authorizer.class.name = kafka.security.auth.SimpleAclAuthorizer
security.inter.broker.protocol=SASL_PLAINTEXT
nohup kafka-server-start.sh /usr/local/kafka/config/server.properties >/dev/null 2>&1 &
$KAFKA_HOME/bin/kafka-topics.sh --create --topic logstash-yarnnodelog --replication-factor --partitions --zookeeper zkip:2181/kafka
$KAFKA_HOME/bin/kafka-topics.sh --list --zookeeper zkip:2181
$KAFKA_HOME/bin/kafka-console-consumer.sh --zookeeper zkip:2181 --topic topic-test --from-beginning
kafka-console-consumer.sh --bootstrap-server brokerip:9092 --from-beginning --topic logstash --new-consumer --consumer.config=/opt/beh/core/kafka/config/consumer.properties
$KAFKA_HOME/bin/kafka-console-producer.sh --broker-list brokerip:9092 --topic topic-test
$KAFKA_HOME/bin/kafka-topics.sh --zookeeper zkip:2181 --delete --topic topic-test
$KAFKA_HOME/bin/kafka-topics.sh --describe --zookeeper zkip:2181/ --topic test20160807
zookeeper.connect=zkip1:2181,zkip2:2181,zkip3:2181
# Timeout in ms for connecting to zookeeper
zookeeper.connection.timeout.ms=
listeners=SASL_PLAINTEXT://:9092
security.inter.broker.protocol=SASL_PLAINTEXT
sasl.enabled.mechanisms=PLAIN
sasl.mechanism.inter.broker.protocol=PLAIN
auto.create.topics.enable=false
allow.everyone.if.no.acl.found=false
delete.topic.enable=true
super.users=User:admin
authorizer.class.name = kafka.security.auth.SimpleAclAuthorizer
vi kafka_server_jaas.conf
KafkaServer {
org.apache.kafka.common.security.plain.PlainLoginModule required
username="admin"
password="admin"
user_admin="admin"
user_hadoop="hadoop"
user_producer1="producer1_test"
user_consumer1="consumer1_test"
user_producer2="producer2_test"
user_consumer2="consumer2_test";
};
vi kafka_client_consumer_jaas.conf
KafkaClient {
org.apache.kafka.common.security.plain.PlainLoginModule required
username="consumer1"
password="consumer1_test";
};
vi kafka_client_producer_jaas.conf
KafkaClient {
org.apache.kafka.common.security.plain.PlainLoginModule required
username="producer1"
password="producer1_test";
};
Add the SASL settings to producer.properties and consumer.properties:
echo security.protocol=SASL_PLAINTEXT >> producer.properties
echo sasl.mechanism=PLAIN >> producer.properties
echo security.protocol=SASL_PLAINTEXT >> consumer.properties
echo sasl.mechanism=PLAIN >> consumer.properties
vi producer.properties
security.protocol=SASL_PLAINTEXT
sasl.mechanism=PLAIN
vi consumer.properties
security.protocol=SASL_PLAINTEXT
sasl.mechanism=PLAIN
export KAFKA_OPTS="-Djava.security.auth.login.config=/opt/beh/core/kafka/config/kafka_server_jaas.conf"
nohup kafka-server-start.sh /opt/beh/core/kafka/config/server.properties &
if [ "x$KAFKA_OPTS" ]; then
export KAFKA_OPTS="-Djava.security.auth.login.config=/opt/beh/core/kafka/config/kafka_client_jaas.conf"
fi
if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
export KAFKA_HEAP_OPTS="-Xmx512M"
fi
exec $(dirname $)/kafka-run-class.sh kafka.tools.ConsoleProducer "$@" vi kafka-console-consumer.sh if [ "x$KAFKA_OPTS" ]; then
export KAFKA_OPTS="-Djava.security.auth.login.config=/opt/beh/core/kafka/config/kafka_client_jaas.conf"
fi if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
export KAFKA_HEAP_OPTS="-Xmx512M"
fi exec $(dirname $)/kafka-run-class.sh kafka.tools.ConsoleConsumer "$@"
nohup kafka-server-start.sh /opt/beh/core/kafka/config/server.properties &
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:admin --operation ClusterAction --cluster --add (grants the permission to update cluster metadata)
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:admin --cluster --add
$KAFKA_HOME/bin/kafka-topics.sh --create --topic topic-test1 --replication-factor --partitions --zookeeper localhost:2181
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --add --allow-principal User:Bob --allow-principal User:Alice --allow-host xxx.xx.xx. --allow-host xxx.xx.xx. --operation Read --operation Write --topic Test-topic
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --topic=topic-test --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --topic=test1 --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --consumer --topic=topic-test --group=* --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:hadoop --consumer --topic=topic-test1 --group=test-consumer-group --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --producer --topic=topic-test1 --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=topic-test1 --group=* --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=topic-test1 --group=test-consumer-group --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:producer1 --topic=* --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:hadoop --consumer --topic=* --group=test-consumer-group --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:hadoop --consumer --topic=* --group=* --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --topic=* --operation Write --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=* --group=topic-test --add
kafka-acls.sh --authorizer-properties zookeeper.connect=localhost:2181 --allow-principal User:* --consumer --topic=* --group=* --add
bin/kafka-acls.sh --authorizer-properties zookeeper.connect=data-rt-dev02:2181/kafka_test10 --remove --allow-principal User:Bob --allow-principal User:Alice --allow-host xxx.xx.xx. --allow-host xxx.xx.xx. --operation Read --operation Write --topic test
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181 User:hadoop
kafka-acls.sh --list --authorizer-properties zookeeper.connect=localhost:2181 --topic=topic-test1
$KAFKA_HOME/bin/kafka-console-producer.sh --broker-list broker1:9092 --topic topic-test --producer.config=/opt/beh/core/kafka/config/producer.properties
kafka-console-consumer.sh --bootstrap-server broker1:9092 --from-beginning --topic topic-test --new-consumer --consumer.config=/opt/beh/core/kafka/config/consumer.properties
put("sasl.jaas.config", "org.apache.kafka.common.security.plain.PlainLoginModule required username=\"consumer1\" password=\"consumer1_test\";");
put("security.protocol", "SASL_PLAINTEXT");
put("sasl.mechanism", "PLAIN");
- Estimate the data volume: ask the development team to estimate, in advance, the total size of one full retention cycle of data for each topic.
- Compute the total disk capacity: for example, with 825 GB per disk, 20 disks per node and 10 nodes, the total capacity is 165,000 GB.
- Estimate the storage ratio: the full-cycle data volume of all topics as a percentage of total disk capacity; if it exceeds 60%, ask the development team to shorten the retention period.
- Count the total number of disks: 20 disks per node and 10 nodes gives 200 disks.
- Pre-partition sensibly: make the total number of partitions an integer multiple of the number of disks. For example, if all topics together hold 50,000 GB and there are 200 disks, the total partition count could be 200, 400 or 600; the exact number depends on the business. With 400 partitions, each partition holds roughly 125 GB, so a topic such as cbss001 with an estimated 210 GB of data would be split into two partitions. With Kafka's replica placement policy this keeps storage as evenly balanced as possible across the disks of every host (a worked version of this arithmetic follows this list).
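A worked version of the sizing arithmetic above, as a small Python sketch; the figures are the ones from the example and are placeholders for your own cluster:

# Capacity-planning arithmetic from the example above
import math
disk_size_gb = 825        # capacity of one disk
disks_per_node = 20
nodes = 10
total_storage_gb = disk_size_gb * disks_per_node * nodes    # 165000 GB
total_disks = disks_per_node * nodes                        # 200 disks
topic_total_gb = 50000    # estimated full-cycle volume of all topics
usage_ratio = topic_total_gb / float(total_storage_gb)      # ~0.30, under the 60% threshold
partitions = 400          # an integer multiple of the 200 disks
gb_per_partition = topic_total_gb / float(partitions)       # ~125 GB per partition
cbss001_partitions = int(math.ceil(210 / gb_per_partition)) # 210 GB topic -> 2 partitions
print(total_storage_gb, usage_ratio, gb_per_partition, cbss001_partitions)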
- A failed disk will bring the broker down; replace the disk promptly and restart the node.
- unclean.leader.election.enable: when this parameter is set to true on a topic it can cause duplicate message consumption; when it is false, connections to the broker's port 9092 may drop and the Kafka process can hang in a zombie state.
- Out-of-memory errors prevent a node's replicas from rejoining the ISR.
- Process and open-file limits can also cause broker errors; the tuning parameters are given in the optimization section later.
- If a follower replica cannot keep up with the leader, the sync times out and the replica drops out of the ISR.
- Consumer offset out of range: first restart the node; if the error persists, find the partition whose offset is out of range, delete a few messages, and check again, repeating until the error disappears (see the offset-inspection sketch after this list).
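For the offset-out-of-range case in the last item, it helps to compare the group's committed offset with the partition's valid offset range before deleting anything. A minimal kafka-python sketch follows; the topic, partition and broker address are placeholders, and the SASL settings must be added if authentication is enabled:

# Sketch: print the earliest and latest valid offsets of one partition
from kafka import KafkaConsumer, TopicPartition

tp = TopicPartition("logstash", 0)                            # partition to inspect
consumer = KafkaConsumer(bootstrap_servers="brokerip:9092")   # plus SASL options if needed
earliest = consumer.beginning_offsets([tp])[tp]
latest = consumer.end_offsets([tp])[tp]
print("valid offsets for %s-%d: %d to %d" % (tp.topic, tp.partition, earliest, latest))
# A committed consumer offset outside this range triggers the out-of-range error described above.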
vi topics-to-move.json
{"topics": [{"topic": "foo1"}, {"topic": "foo2"}], "version": }
bin/kafka-reassign-partitions.sh --zookeeper localhost:
--topics-to-move-json-file topics-to-move.json --broker-list "5,6" --generate Current partition replica assignment
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]},{"topic":"foo2","partition":,"replicas":[,]}
]
}
Proposed partition reassignment configuration
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]},{"topic":"foo1","partition":,"replicas":[,]},
{"topic":"foo2","partition":,"replicas":[,]},{"topic":"foo2","partition":,"replicas":[,]},
{"topic":"foo1","partition":,"replicas":[,]},{"topic":"foo2","partition":,"replicas":[,]}
]
}
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file expand-cluster-reassignment.json --execute
Current partition replica assignment
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}
] }
Save this to use as the --reassignment-json-file option during rollback Successfully started reassignment of partitions
{"version":,
"partitions":[
{"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}, {"topic":"foo1","partition":,"replicas":[,]}, {"topic":"foo2","partition":,"replicas":[,]}
]
}
Verify the reassignment with --verify:
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file custom-reassignment.json --verify
Status of partition reassignment:
Reassignment of partition [foo1,] completed successfully
Reassignment of partition [foo2,] completed successfully
log.retention.bytes (the size limit for a whole topic = number of partitions * log.retention.bytes)
log.retention.minutes
Data is deleted as soon as either log.retention.bytes or log.retention.minutes is reached.
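As a quick illustration of the formula above (all numbers are made up):

# Topic-level size ceiling implied by log.retention.bytes (illustrative values)
log_retention_bytes = 1 * 1024 ** 3          # 1 GB retained per partition
partitions = 400
topic_limit_gb = partitions * log_retention_bytes / float(1024 ** 3)
print("the topic can retain at most about %d GB" % topic_limit_gb)   # 400 GB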
kafka-configs.sh --zookeeper zkip1:2181 --describe --entity-type topics --entity-name CdrNormal
Configs for topics:CdrNormal are retention.ms=
#!/usr/bin/python
#_*_coding:utf-8_*_
import pycurl
import json
import StringIO
import time
import sys
import zookeeper

# Connect to ZooKeeper and list the registered broker ids
zk = zookeeper.init("zkip1:2181")
t = zookeeper.get_children(zk, "/brokers/ids")
d = 0
for i in t:
    d = d + 1
b = 16 - d   # number of dead brokers, assuming a 16-broker cluster
if d == 16:
    print "OK: all 16 brokers of the cb realtime kafka1 cluster are alive"
    sys.exit(0)
else:
    print "Critical: the cb realtime kafka1 cluster has", b, "dead broker(s)"
    sys.exit(2)
#!/usr/bin/python
#_*_coding:utf-8_*_
# Disk-usage check for the Kafka nodes.
# Usage: <script> <ssh-username> <ssh-password> <usage-percent-threshold>
import paramiko
import sys

hostname = ['IP1', 'IP2']
username = sys.argv[1]
password = sys.argv[2]
percent = sys.argv[3]
disk = {}
error = ""
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
for i in range(0, len(hostname)):
    ssh.connect(hostname[i], 22, username, password)
    # Collect every mount point whose usage exceeds the given percentage
    stdin, stdout, stderr = ssh.exec_command("df -TPh|awk '+$6>%s {print $7}'" % percent)
    path = stdout.readlines()
    disk[hostname[i]] = path
ssh.close()
if not disk:
    print("No cluster information was collected!")
    sys.exit()
else:
    for i in disk.keys():
        if not disk.get(i):
            continue
        else:
            error += "node " + i + ": "
            for j in range(0, len(disk[i])):
                if j == len(disk[i]) - 1:
                    error += disk[i][j].encode('utf-8') + ". "
                else:
                    error += disk[i][j].encode('utf-8') + ", "
if not error:
    print("cb_rt_kafka data collection cluster: disk usage is normal")
    sys.exit()
else:
    print("cb_rt_kafka data collection cluster: %s disk usage exceeds %s%%" % (error.replace("\n", ""), percent))
    sys.exit()