# Download Flume from http://flume.apache.org/download.html
#############################################
# Overview: Flume is a highly available, highly reliable, distributed system from Cloudera for
# collecting, aggregating, and moving large volumes of log data.
# The core idea: Flume pulls data from a data source (source) and delivers it to a specified
# destination (sink). To make delivery reliable, events are first buffered in a channel; only after
# the data has actually reached the sink is the buffered copy removed.
#############################################
# Upload the tarball to the Linux host, then:
tar zxvf apache-flume-1.8.0-bin.tar.gz
rm -rf apache-flume-1.8.0-bin.tar.gz
mv apache-flume-1.8.0-bin/ flume-1.8.0
cd flume-1.8.0/conf/
cp flume-env.sh.template flume-env.sh
vim flume-env.sh
# Point JAVA_HOME at the correct JDK
export JAVA_HOME=/usr/local/src/jdk1.8.0_161
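
# (Optional sanity check) if the install and JAVA_HOME are set up correctly, this prints the Flume version:
bin/flume-ng version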
########################################
# Receive data from a network port and sink it to the logger
########################################

# Collection config file: netcat-logger.conf

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port =

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
######################################## End of collection config file

# Start command
bin/flume-ng agent --conf conf/ --conf-file conf/netcat-logger.conf --name a1 -Dflume.root.logger=INFO,console
# The agent starts listening: Created serverSocket:sun.nio.ch.ServerSocketChannelImpl[/127.0.0.1:]

# Test from another terminal:
yum install -y telnet
telnet localhost    # On a successful connection you'll see: Connected to localhost. Escape character is '^]'.
hello, world.       # Type a line of text and check whether the listening terminal received it.
# On the listening side: (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java:)] Event: { headers:{} body: 6C 6C 6F 2C 6F 6C 2E 0D hello,world.. }
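
# (Alternative, a sketch) netcat can push a test line instead of telnet. The port below is an
# assumption -- the actual value was not preserved in these notes, so substitute whatever port
# source r1 is bound to (the stock Flume netcat example uses 44444):
echo "hello from nc" | nc localhost 44444    # 44444 is an assumed port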
##########################################
# Collect files from a directory into HDFS. Start HDFS first.
##################################

# spooldir-hdfs.cnf:

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
# Note: never place a file whose name has already been used into the monitored directory;
# a duplicate file name makes the source fail and stop.
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /root/logs
a1.sources.r1.fileHeader = true

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/events/%y-%m-%d/%H%M/
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue =
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval =
a1.sinks.k1.hdfs.rollSize =
a1.sinks.k1.hdfs.rollCount =
a1.sinks.k1.hdfs.batchSize =
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# File type of the generated files. The default is SequenceFile; DataStream writes plain text.
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
##################################

# Start command. Any files already present in /root/logs are collected into HDFS immediately.
bin/flume-ng agent -c conf/ -f conf/spooldir-hdfs.cnf -n a1 -Dflume.root.logger=INFO,console
# On success: (lifecycleSupervisor) [INFO - org.apache.flume.instrumentation.MonitoredCounterGroup.start(MonitoredCounterGroup.java:)] Component type: SOURCE, name: r1 started
# Create a file under /root/logs/ and the agent terminal prints: Writer callback called.
# On HDFS the data lands under /flume/events/ following the %y-%m-%d/%H%M/events- layout configured above.
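# (Optional check, a sketch) list what has landed on HDFS; the date/time subdirectories follow the pattern above:
hdfs dfs -ls -R /flume/events/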
# Note: with spooldir, never place a file whose name has already been used into the source directory /root/logs/; a duplicate name makes the service fail and stop working.
##########################################
### Incrementally collect a file whose content keeps growing into HDFS
##########################################

# tail-hdfs.cnf:

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/logs/test.log
a1.sources.r1.channels = c1

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/tailout/%y-%m-%d/%H%M/
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue =
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval =
a1.sinks.k1.hdfs.rollSize =
a1.sinks.k1.hdfs.rollCount =
a1.sinks.k1.hdfs.batchSize =
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# File type of the generated files. The default is SequenceFile; DataStream writes plain text.
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
##########################################

# Start command
bin/flume-ng agent -c conf -f conf/tail-hdfs.cnf -n a1 -Dflume.root.logger=INFO,console

# Simulate data being written continuously:
while true; do date >> /root/logs/test.log; sleep 1.5; done
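
# (Optional check, a sketch) the rolled files on HDFS should contain the appended date lines;
# exact file names depend on the roll settings above:
hdfs dfs -cat /flume/tailout/*/*/events-*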
##########################################
# Load balance
##########################################
# Three machines, two Flume tiers: the front machine collects and distributes events round-robin to the
# two machines behind it; those two receive the data and sink it to the target.
scp -r flume-1.8.0/ slave2:/usr/local/src/
scp -r flume-1.8.0/ slave3:/usr/local/src/

# slave1 sits at the front; slave2 and slave3 sit behind it.

################# First tier: slave1 config file exec-avro.cnf

# agent1 name
agent1.channels = c1
agent1.sources = r1
agent1.sinks = k1 k2

# set group
agent1.sinkgroups = g1

# set channel
agent1.channels.c1.type = memory
agent1.channels.c1.capacity =
agent1.channels.c1.transactionCapacity =

agent1.sources.r1.channels = c1
agent1.sources.r1.type = exec
agent1.sources.r1.command = tail -F /root/logs/.log

# set sink1
agent1.sinks.k1.channel = c1
agent1.sinks.k1.type = avro
agent1.sinks.k1.hostname = slave2
agent1.sinks.k1.port =

# set sink2
agent1.sinks.k2.channel = c1
agent1.sinks.k2.type = avro
agent1.sinks.k2.hostname = slave3
agent1.sinks.k2.port =

# set sink group
agent1.sinkgroups.g1.sinks = k1 k2

# set load balancing
agent1.sinkgroups.g1.processor.type = load_balance
agent1.sinkgroups.g1.processor.backoff = true
agent1.sinkgroups.g1.processor.selector = round_robin
agent1.sinkgroups.g1.processor.selector.maxTimeOut =
############# end ##############

################# Second tier: slave2 config file avro-logger.cnf

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave2
a1.sources.r1.port =

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave2 end ##############

################# Second tier: slave3 config file avro-logger.cnf -- the only change is slave3

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave3
a1.sources.r1.port =

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave3 end ##############

## Start the second tier (slave2 and slave3) first:
bin/flume-ng agent -c conf -f conf/avro-logger.cnf -n a1 -Dflume.root.logger=INFO,console
## Then start the first tier (slave1):
bin/flume-ng agent -c conf -f conf/exec-avro.cnf -n agent1 -Dflume.root.logger=INFO,console
# Once started, the second-tier terminals print something like: CONNECTED: /192.168.112.11:
# If the first tier is later stopped, the second tier prints something like: /192.168.112.11: disconnected.

# Simulate data being written. Only the second tier shows collection activity; the first tier prints nothing.
while true; do date >>/root/logs/.log;sleep ;done
#############################################
# Failover
# Only one back-end machine is active at any given time.
#############################################
# Again three machines, two Flume tiers: the front machine collects and sends to one of the machines
# behind it; the one with the highest priority receives the data.
# If that machine goes down, the other one automatically takes over.
scp -r flume-1.8.0/ slave2:/usr/local/src/
scp -r flume-1.8.0/ slave3:/usr/local/src/

# slave1 sits at the front; slave2 and slave3 sit behind it.

################# First tier: slave1 config file exec-avro.cnf

# agent1 name
agent1.channels = c1
agent1.sources = r1
agent1.sinks = k1 k2

# set group
agent1.sinkgroups = g1

# set channel
agent1.channels.c1.type = memory
agent1.channels.c1.capacity =
agent1.channels.c1.transactionCapacity =

agent1.sources.r1.channels = c1
agent1.sources.r1.type = exec
agent1.sources.r1.command = tail -F /root/logs/.log

# set sink1
agent1.sinks.k1.channel = c1
agent1.sinks.k1.type = avro
agent1.sinks.k1.hostname = slave2
agent1.sinks.k1.port =

# set sink2
agent1.sinks.k2.channel = c1
agent1.sinks.k2.type = avro
agent1.sinks.k2.hostname = slave3
agent1.sinks.k2.port =

# set sink group
agent1.sinkgroups.g1.sinks = k1 k2

# set failover
agent1.sinkgroups.g1.processor.type = failover
agent1.sinkgroups.g1.processor.priority.k1 =
agent1.sinkgroups.g1.processor.priority.k2 =
agent1.sinkgroups.g1.processor.maxpenalty =
############# end ##############

################# Second tier: slave2 config file avro-logger.cnf

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave2
a1.sources.r1.port =

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave2 end ##############

################# Second tier: slave3 config file avro-logger.cnf -- the only change is slave3

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave3
a1.sources.r1.port =

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave3 end ##############

## Start the second tier (slave3 and slave2) first:
bin/flume-ng agent -c conf -f conf/avro-logger.cnf -n a1 -Dflume.root.logger=INFO,console
## Then start the first tier (slave1):
bin/flume-ng agent -c conf -f conf/exec-avro.cnf -n agent1 -Dflume.root.logger=INFO,console
# Once started, the second-tier terminals print something like: CONNECTED: /192.168.112.11:
# If the first tier is later stopped, the second tier prints something like: /192.168.112.11: disconnected.

# Simulate data being written. Only slave2 in the second tier shows collection activity; the first tier prints nothing, and slave3 stands by.
while true; do date >>/root/logs/.log;sleep ;done
# Once slave2 is stopped, slave3 automatically takes over and keeps receiving.
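
# (A sketch of how to exercise the failover) stop the agent on slave2 and watch slave3 take over.
# Ctrl+C in the slave2 agent terminal works; alternatively, the Flume agent JVM usually shows up
# in jps as "Application" (an assumption about your environment), so on slave2:
jps | grep Application
kill <pid-printed-above>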

An additional exercise:

################################################################
# Case study:
# Two log servers, A and B, produce logs in real time, mainly access.log, nginx.log, and web.log.
# Requirement: collect the three kinds of logs from A and B onto machine C, then ship them to HDFS,
# with each category stored in its own directory on HDFS.
################################################################
### Map slave1, slave2, slave3 to A, B, C respectively.
### A & B config file exec_source_avro_sink.conf -- essentially the same on both machines; only the hostname differs.

# Name the components on this agent
a1.sources = r1 r2 r3
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/logs1/access.log
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = static
a1.sources.r1.interceptors.i1.key = type
a1.sources.r1.interceptors.i1.value = access

a1.sources.r2.type = exec
a1.sources.r2.command = tail -F /root/logs1/nginx.log
a1.sources.r2.interceptors = i2
a1.sources.r2.interceptors.i2.type = static
a1.sources.r2.interceptors.i2.key = type
a1.sources.r2.interceptors.i2.value = nginx

a1.sources.r3.type = exec
a1.sources.r3.command = tail -F /root/logs1/web.log
a1.sources.r3.interceptors = i3
a1.sources.r3.interceptors.i3.type = static
a1.sources.r3.interceptors.i3.key = type
a1.sources.r3.interceptors.i3.value = web

# Describe the sink: send to the next-tier host
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave3
a1.sinks.k1.port =

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sources.r2.channels = c1
a1.sources.r3.channels = c1
a1.sinks.k1.channel = c1
### end ###

### C config file: avro_source_hdfs_sink.conf

# Define the agent's source, channel, and sink names
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Define the source
a1.sources.r1.type = avro
a1.sources.r1.bind = slave3
a1.sources.r1.port =

# Add a timestamp interceptor
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder

# Define the channel
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Define the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://master:9000/source/logs/%{type}/%Y%m%d
a1.sinks.k1.hdfs.filePrefix = events
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.writeFormat = Text

# Timestamp handling
# a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Do not roll files by event count
a1.sinks.k1.hdfs.rollCount =
# Do not roll files by time
a1.sinks.k1.hdfs.rollInterval =
# Roll files by size
a1.sinks.k1.hdfs.rollSize =
# Number of events written to HDFS per batch
a1.sinks.k1.hdfs.batchSize =
# Number of threads Flume uses for HDFS operations (open, write, etc.)
a1.sinks.k1.hdfs.threadsPoolSize =
# Timeout for HDFS operations
a1.sinks.k1.hdfs.callTimeout =

# Wire the source, channel, and sink together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
### end ###
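
# How the pieces fit together: the static interceptors on A and B stamp every event with a header
# {type: access|nginx|web}; the HDFS sink on C expands %{type} (plus the timestamp added by the
# TimestampInterceptor) in hdfs.path, so each category lands in its own dated directory,
# e.g. /source/logs/access/<yyyymmdd>/.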
## Start the second tier first (machine C, i.e. slave3):
bin/flume-ng agent -c conf -f conf/avro_source_hdfs_sink.conf -name a1 -Dflume.root.logger=DEBUG,console

## Then start the first tier (slave1; repeat on slave2 for machine B):
bin/flume-ng agent -c conf -f conf/exec_source_avro_sink.conf -name a1 -Dflume.root.logger=DEBUG,console

# Once started, slave3 prints something like: CONNECTED: /192.168.112.11:

# Simulate data being written:
while true; do echo "access.. `date` " >>/root/logs1/access.log;sleep ;done
while true; do echo "nginx.. `date` " >>/root/logs1/nginx.log;sleep ;done
while true; do echo "web.. `date` " >>/root/logs1/web.log;sleep ;done # 查看hdfs上采集成功。

Today's exercises are done, and everything worked.
