Apache Flume Study Notes
# Download Flume from http://flume.apache.org/download.html
#############################################
# Overview: Flume is a highly available, highly reliable, distributed system from Cloudera for
# collecting, aggregating, and moving large volumes of log data.
# At its core, Flume picks data up from a source and delivers it to a specified destination (sink).
# To guarantee delivery, events are buffered in a channel before they reach the sink, and the
# buffered copy is deleted only after the data has actually arrived at the sink.
#############################################
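# Every example below follows the same wiring pattern described above. A minimal skeleton with
# placeholder names (<agent>, <src>, <ch>, <snk> are not values used in these notes):
# <agent>.sources  = <src>
# <agent>.channels = <ch>
# <agent>.sinks    = <snk>
# <agent>.sources.<src>.type = ...        # where events come from
# <agent>.channels.<ch>.type = ...        # buffer between source and sink
# <agent>.sinks.<snk>.type   = ...        # where events go
# <agent>.sources.<src>.channels = <ch>   # a source may feed one or more channels
# <agent>.sinks.<snk>.channel    = <ch>   # a sink drains exactly one channel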
# Upload the tarball to the Linux host, then:
tar zxvf apache-flume-1.8.0-bin.tar.gz
rm -rf apache-flume-1.8.0-bin.tar.gz
mv apache-flume-1.8.0-bin/ flume-1.8.0
cd flume-1.8.0/conf/
cp flume-env.sh.template flume-env.sh
vim flume-env.sh
# Point JAVA_HOME at the correct JDK path
export JAVA_HOME=/usr/local/src/jdk1.8.0_161
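# Optional sanity check of the install (assuming Flume was unpacked to /usr/local/src/flume-1.8.0,
# which is the layout the scp commands further below imply):
cd /usr/local/src/flume-1.8.0
bin/flume-ng version
# flume-ng version prints the Flume version and build information if the basic setup is working.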
########################################
# Receive data from a network port and sink it to a logger
########################################
# Agent config file: netcat-logger.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the sources
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port =

# Describe the sinks
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
######################################## end of agent config file

# Start command
bin/flume-ng agent --conf conf/ --conf-file conf/netcat-logger.conf --name a1 -Dflume.root.logger=INFO,console
# The agent now listens, logging something like: Created serverSocket:sun.nio.ch.ServerSocketChannelImpl[/127.0.0.1:]
# Test from another terminal:
yum install -y telnet
telnet localhost
# On success telnet prints: Connected to localhost. Escape character is '^]'.
hello, world.
# Send a line of text and check whether the listening terminal receives it.
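# If telnet is not available, nc does the same job. A sketch only: <port> stands for the value of
# a1.sources.r1.port, which is elided in the config above.
yum install -y nc
nc localhost <port>
# Type a line, press Enter, and watch the agent terminal for the logged event.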
# The listening side logs the event, for example:
# (SinkRunner-PollingRunner-DefaultSinkProcessor) [INFO - org.apache.flume.sink.LoggerSink.process(LoggerSink.java)] Event: { headers:{} body: ... hello, world. }

##########################################
# Collect files from a directory into HDFS. Start HDFS first.
##################################
# Agent config file: spooldir-hdfs.cnf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the sources
# Note: never place a file with a previously used name into the monitored directory;
# a duplicate name makes the source fail and stop.
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /root/logs
a1.sources.r1.fileHeader = true

# Describe the sinks
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/events/%y-%m-%d/%H%M/
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue =
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval =
a1.sinks.k1.hdfs.rollSize =
a1.sinks.k1.hdfs.rollCount =
a1.sinks.k1.hdfs.batchSize =
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# File type of the generated files; the default is SequenceFile, while DataStream writes plain text.
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
##################################

# Start command. If /root/logs already contains files, they are collected into HDFS immediately.
bin/flume-ng agent -c conf/ -f conf/spooldir-hdfs.cnf -n a1 -Dflume.root.logger=INFO,console
# On success a log line like this appears:
# (lifecycleSupervisor-...) [INFO - org.apache.flume.instrumentation.MonitoredCounterGroup.start(MonitoredCounterGroup.java)] Component type: SOURCE, name: r1 started
# Create a file under /root/logs/ and the agent console prints: Writer callback called.
# On HDFS a file is created under /flume/events/%y-%m-%d/%H%M/ with the events- prefix.
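# A quick way to verify the result from another terminal (exact file names depend on the roll
# settings and timestamps):
hdfs dfs -ls -R /flume/events
hdfs dfs -cat /flume/events/*/*/events-*
# In /root/logs the spooldir source marks each fully ingested file by appending the .COMPLETED suffix.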
# Note again: with spooldir, never drop a file with a previously used name into /root/logs/;
# a duplicate name makes the source fail and stop working.

##########################################
### Incrementally collect an appended file into HDFS
##########################################
# Agent config file: tail-hdfs.cnf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the sources
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/logs/test.log
a1.sources.r1.channels = c1

# Describe the sinks
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /flume/tailout/%y-%m-%d/%H%M/
a1.sinks.k1.hdfs.filePrefix = events-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue =
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval =
a1.sinks.k1.hdfs.rollSize =
a1.sinks.k1.hdfs.rollCount =
a1.sinks.k1.hdfs.batchSize =
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# File type of the generated files; the default is SequenceFile, while DataStream writes plain text.
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
##########################################

# Start command. Lines appended to /root/logs/test.log are collected into HDFS continuously.
bin/flume-ng agent -c conf -f conf/tail-hdfs.cnf -n a1 -Dflume.root.logger=INFO,console

# Simulate continuous writes:
while true; do date >>/root/logs/test.log;sleep 1.5;done
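# Note: the exec source with tail -F keeps no read position, so events can be lost or re-read when
# the agent restarts. Flume 1.7+ also ships a Taildir source that records its position in a JSON
# file; a sketch of just the source section (the positionFile path is an assumption, not taken from
# these notes; the channel and sink stay as configured above):
# a1.sources.r1.type = TAILDIR
# a1.sources.r1.positionFile = /root/flume/taildir_position.json
# a1.sources.r1.filegroups = f1
# a1.sources.r1.filegroups.f1 = /root/logs/test.log
# a1.sources.r1.channels = c1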
##########################################
# Load balancing
##########################################
# Use three machines in a two-tier Flume setup: the first agent collects data and distributes it
# round-robin to the two downstream agents, which receive the events and sink them to the target.
scp -r flume-1.8.0/ slave2:/usr/local/src/
scp -r flume-1.8.0/ slave3:/usr/local/src/
# slave1 sits at the front; slave2 and slave3 form the second tier.

################# First tier: slave1 config file exec-avro.cnf
# agent1 name
agent1.channels = c1
agent1.sources = r1
agent1.sinks = k1 k2

# set group
agent1.sinkgroups = g1

# set channel
agent1.channels.c1.type = memory
agent1.channels.c1.capacity =
agent1.channels.c1.transactionCapacity =

agent1.sources.r1.channels = c1
agent1.sources.r1.type = exec
agent1.sources.r1.command = tail -F /root/logs/.log

# set sink1
agent1.sinks.k1.channel = c1
agent1.sinks.k1.type = avro
agent1.sinks.k1.hostname = slave2
agent1.sinks.k1.port =

# set sink2
agent1.sinks.k2.channel = c1
agent1.sinks.k2.type = avro
agent1.sinks.k2.hostname = slave3
agent1.sinks.k2.port =

# set sink group
agent1.sinkgroups.g1.sinks = k1 k2

# set load balancing
agent1.sinkgroups.g1.processor.type = load_balance
agent1.sinkgroups.g1.processor.backoff = true
agent1.sinkgroups.g1.processor.selector = round_robin
agent1.sinkgroups.g1.processor.selector.maxTimeOut =
############# end ##############

################# Second tier: slave2 config file avro-logger.cnf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the sources
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave2
a1.sources.r1.port =

# Describe the sinks
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave2 end ##############

################# Second tier: slave3 config file avro-logger.cnf (the only change is slave3)
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the sources
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave3
a1.sources.r1.port =

# Describe the sinks
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave3 end ##############

## Start the second-tier agents on slave2 and slave3 first
bin/flume-ng agent -c conf -f conf/avro-logger.cnf -n a1 -Dflume.root.logger=INFO,console
## Then start the first-tier agent on slave1
bin/flume-ng agent -c conf -f conf/exec-avro.cnf -n agent1 -Dflume.root.logger=INFO,console
# After startup, the second-tier terminals print something like: CONNECTED: /192.168.112.11:
# If the first tier is later stopped, the second tier prints something like: /192.168.112.11: disconnected.

# Simulate data writes. Only the second tier shows collection activity; the first tier prints nothing.
while true; do date >>/root/logs/.log;sleep ;done
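# To see the round-robin distribution more clearly, a numbered variant of the write loop helps
# (illustration only: the file must match the one in agent1.sources.r1.command, whose name is
# elided above; /root/logs/test.log is used here as a stand-in):
i=0; while true; do echo "msg-$i $(date)" >> /root/logs/test.log; i=$((i+1)); sleep 1; done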
#############################################
# Failover
# Only one backend machine is active at any given time.
#############################################
# Again three machines in a two-tier Flume setup: the front agent collects data and sends it to one
# of the downstream agents; the one with the highest priority receives the data.
# If that machine goes down, the other one takes over automatically.
scp -r flume-1.8.0/ slave2:/usr/local/src/
scp -r flume-1.8.0/ slave3:/usr/local/src/
# slave1 sits at the front; slave2 and slave3 form the second tier.

################# First tier: slave1 config file exec-avro.cnf
# agent1 name
agent1.channels = c1
agent1.sources = r1
agent1.sinks = k1 k2

# set group
agent1.sinkgroups = g1

# set channel
agent1.channels.c1.type = memory
agent1.channels.c1.capacity =
agent1.channels.c1.transactionCapacity =

agent1.sources.r1.channels = c1
agent1.sources.r1.type = exec
agent1.sources.r1.command = tail -F /root/logs/.log

# set sink1
agent1.sinks.k1.channel = c1
agent1.sinks.k1.type = avro
agent1.sinks.k1.hostname = slave2
agent1.sinks.k1.port =

# set sink2
agent1.sinks.k2.channel = c1
agent1.sinks.k2.type = avro
agent1.sinks.k2.hostname = slave3
agent1.sinks.k2.port =

# set sink group
agent1.sinkgroups.g1.sinks = k1 k2

# set failover
agent1.sinkgroups.g1.processor.type = failover
agent1.sinkgroups.g1.processor.priority.k1 =
agent1.sinkgroups.g1.processor.priority.k2 =
agent1.sinkgroups.g1.processor.maxpenalty =
############# end ##############

################# Second tier: slave2 config file avro-logger.cnf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the sources
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave2
a1.sources.r1.port =

# Describe the sinks
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave2 end ##############

################# Second tier: slave3 config file avro-logger.cnf (the only change is slave3)
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the sources
a1.sources.r1.type = avro
a1.sources.r1.channels = c1
a1.sources.r1.bind = slave3
a1.sources.r1.port =

# Describe the sinks
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
############# slave3 end ##############

## Start the second-tier agents on slave3 and slave2 first
bin/flume-ng agent -c conf -f conf/avro-logger.cnf -n a1 -Dflume.root.logger=INFO,console
## Then start the first-tier agent on slave1
bin/flume-ng agent -c conf -f conf/exec-avro.cnf -n agent1 -Dflume.root.logger=INFO,console
# After startup, the second-tier terminals print something like: CONNECTED: /192.168.112.11:
# If the first tier is later stopped, the second tier prints something like: /192.168.112.11: disconnected.

# Simulate data writes. Only slave2 on the second tier shows collection activity; the first tier
# prints nothing and slave3 stays on standby.
while true; do date >>/root/logs/.log;sleep ;done
# Once slave2 is stopped, slave3 takes over automatically and keeps receiving.
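# One hedged way to simulate the failure on slave2: locate the agent's JVM and kill it (Ctrl+C in
# the agent terminal works just as well; the kill target below is a placeholder).
jps -ml | grep org.apache.flume.node.Application
kill <pid-printed-above>
# Events then appear on slave3's console; once slave2 is restarted and its penalty window expires,
# the higher-priority sink k1 becomes active again.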
One more exercise:
################################################################
# Case study:
# Two log servers, A and B, produce logs in real time, mainly access.log, nginx.log, and web.log.
# Requirement: collect the three kinds of logs from A and B onto machine C, then deliver them to HDFS,
# storing each category in its own HDFS directory.
################################################################
### Map slave1, slave2, slave3 to A, B, C respectively.
### A & B config file exec_source_avro_sink.conf (nearly identical on both; only the hostname differs)
# Name the components on this agent
a1.sources = r1 r2 r3
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/logs1/access.log
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = static
a1.sources.r1.interceptors.i1.key = type
a1.sources.r1.interceptors.i1.value = access

a1.sources.r2.type = exec
a1.sources.r2.command = tail -F /root/logs1/nginx.log
a1.sources.r2.interceptors = i2
a1.sources.r2.interceptors.i2.type = static
a1.sources.r2.interceptors.i2.key = type
a1.sources.r2.interceptors.i2.value = nginx

a1.sources.r3.type = exec
a1.sources.r3.command = tail -F /root/logs1/web.log
a1.sources.r3.interceptors = i3
a1.sources.r3.interceptors.i3.type = static
a1.sources.r3.interceptors.i3.key = type
a1.sources.r3.interceptors.i3.value = web

# Describe the sink: send to the next-tier host
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = slave3
a1.sinks.k1.port =

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sources.r2.channels = c1
a1.sources.r3.channels = c1
a1.sinks.k1.channel = c1
### end ###
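# How the routing works: the static interceptors on A and B stamp every event with a "type" header
# (access / nginx / web), and the HDFS sink on C expands %{type} in hdfs.path, so each category
# lands in its own directory. A hedged sketch of the resulting layout (the date directory depends
# on the event timestamp):
#   /source/logs/access/<yyyyMMdd>/events.<...>
#   /source/logs/nginx/<yyyyMMdd>/events.<...>
#   /source/logs/web/<yyyyMMdd>/events.<...>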
### C config file avro_source_hdfs_sink.conf
# Define the agent's source, channel, and sink names
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Define the source
a1.sources.r1.type = avro
a1.sources.r1.bind = slave3
a1.sources.r1.port =

# Add a timestamp interceptor
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = org.apache.flume.interceptor.TimestampInterceptor$Builder

# Define the channel
a1.channels.c1.type = memory
a1.channels.c1.capacity =
a1.channels.c1.transactionCapacity =

# Define the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://master:9000/source/logs/%{type}/%Y%m%d
a1.sinks.k1.hdfs.filePrefix = events
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.writeFormat = Text

# Timestamp handling
# a1.sinks.k1.hdfs.useLocalTimeStamp = true

# Do not roll files by event count
a1.sinks.k1.hdfs.rollCount =
# Do not roll files by time interval
a1.sinks.k1.hdfs.rollInterval =
# Roll files by size
a1.sinks.k1.hdfs.rollSize =
# Number of events written to HDFS per batch
a1.sinks.k1.hdfs.batchSize =
# Number of threads Flume uses for HDFS operations (create, write, etc.)
a1.sinks.k1.hdfs.threadsPoolSize =
# Timeout for HDFS operations
a1.sinks.k1.hdfs.callTimeout =

# Wire the source, channel, and sink together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
### end ###

## Start the second-tier collector on C (slave3) first
bin/flume-ng agent -c conf -f conf/avro_source_hdfs_sink.conf -n a1 -Dflume.root.logger=DEBUG,console
## Then start the first-tier agents on A (slave1) and B (slave2)
bin/flume-ng agent -c conf -f conf/exec_source_avro_sink.conf -n a1 -Dflume.root.logger=DEBUG,console
# After startup, slave3 prints something like: CONNECTED: /192.168.112.11:

# Simulate data writes:
while true; do echo "access.. `date` " >>/root/logs1/access.log;sleep ;done
while true; do echo "nginx.. `date` " >>/root/logs1/nginx.log;sleep ;done
while true; do echo "web.. `date` " >>/root/logs1/web.log;sleep ;done # 查看hdfs上采集成功。
Today's exercises are complete and everything worked.