This post describes how to read Kafka data into Flume with a custom Kafka source.

Code:

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *  
 * http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *******************************************************************************/
package org.apache.flume.source.kafka;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.message.MessageAndMetadata;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.conf.ConfigurationException;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A Flume Source that reads messages from Kafka. It has been used in a production
 * environment with good performance: a single source can read over 100k messages
 * per second from Kafka.<p>
 * <tt>zookeeper.connect: </tt> the ZooKeeper connect string used by Kafka.<p>
 * <tt>topic: </tt> the Kafka topic to read from.<p>
 * <tt>group.id: </tt> the consumer group id.<p>
 */
public class KafkaSource extends AbstractSource implements Configurable, PollableSource {
    private static final Logger log = LoggerFactory.getLogger(KafkaSource.class);
    private ConsumerConnector consumer;
    private ConsumerIterator<byte[], byte[]> it;
    private String topic;
    
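    /**
     * Pull at most one message from the Kafka stream, prepend the current timestamp
     * to the message body, and deliver it to the channel. Note that with the Kafka
     * 0.8 high-level consumer, hasNext() blocks until a message is available unless
     * consumer.timeout.ms is set.
     */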
    public Status process() throws EventDeliveryException {
        List<Event> eventList = new ArrayList<Event>();
        MessageAndMetadata<byte[],byte[]> message;
        Event event;
        Map<String, String> headers;
        String strMessage;
        try {
            if(it.hasNext()) {
                message = it.next();
                event = new SimpleEvent();
                headers = new HashMap<String, String>();
                headers.put("timestamp", String.valueOf(System.currentTimeMillis()));

                strMessage =  String.valueOf(System.currentTimeMillis()) + "|" + new String(message.message());
                log.debug("Message: {}", strMessage);

                event.setBody(strMessage.getBytes());
                //event.setBody(message.message());
                event.setHeaders(headers);
                eventList.add(event);
            }
            getChannelProcessor().processEventBatch(eventList);
            return Status.READY;
        } catch (Exception e) {
            log.error("KafkaSource EXCEPTION", e);
            return Status.BACKOFF;
        }
    }

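    /**
     * Read the topic name from the Flume context, build the Kafka consumer from the
     * remaining context parameters, and open an iterator over the topic's stream.
     */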
    public void configure(Context context) {
        topic = context.getString("topic");
        if(topic == null) {
            throw new ConfigurationException("Kafka topic must be specified.");
        }
        try {
            this.consumer = KafkaSourceUtil.getConsumer(context);
        } catch (IOException e) {
            log.error("IOException occurred, {}", e.getMessage());
            throw new ConfigurationException("Unable to create Kafka consumer", e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("InterruptedException occurred, {}", e.getMessage());
            throw new ConfigurationException("Interrupted while creating Kafka consumer", e);
        }
        Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
        topicCountMap.put(topic, Integer.valueOf(1));
        Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap = consumer.createMessageStreams(topicCountMap);
        if(consumerMap == null) {
            throw new ConfigurationException("consumerMap is null");
        }
        List<KafkaStream<byte[], byte[]>> topicList = consumerMap.get(topic);
        if(topicList == null || topicList.isEmpty()) {
            throw new ConfigurationException("topicList is null or empty");
        }
        KafkaStream<byte[], byte[]> stream =  topicList.get(0);
        it = stream.iterator();
    }

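    /** Shut down the Kafka consumer connector before stopping the source. */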
    @Override
    public synchronized void stop() {
        consumer.shutdown();
        super.stop();
    }

}

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *  
 * http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *******************************************************************************/
package org.apache.flume.source.kafka;

import java.io.IOException;
import java.util.Map;
import java.util.Properties;

import com.google.common.collect.ImmutableMap;
import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.javaapi.consumer.ConsumerConnector;

import org.apache.flume.Context;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class KafkaSourceUtil {
    private static final Logger log = LoggerFactory.getLogger(KafkaSourceUtil.class);

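    /**
     * Copy every source parameter except "type" and "channel" from the Flume context
     * into a Properties object used to configure the Kafka consumer.
     */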
    public static Properties getKafkaConfigProperties(Context context) {
        log.info("context={}",context.toString());
        Properties props = new Properties();
        ImmutableMap<String, String> contextMap = context.getParameters();
        for (Map.Entry<String,String> entry : contextMap.entrySet()) {
            String key = entry.getKey();
            if (!key.equals("type") && !key.equals("channel")) {
                props.setProperty(entry.getKey(), entry.getValue());
                log.info("key={},value={}", entry.getKey(), entry.getValue());
            }
        }
        return props;
    }
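
    /** Create a high-level Kafka ConsumerConnector from the Flume context parameters. */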
    public static ConsumerConnector getConsumer(Context context) throws IOException, InterruptedException {
        ConsumerConfig consumerConfig = new ConsumerConfig(getKafkaConfigProperties(context));
        ConsumerConnector consumer = Consumer.createJavaConsumerConnector(consumerConfig);
        return consumer;
    }
}
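
KafkaSourceUtil forwards every source parameter other than type and channel straight to the Kafka consumer, so the consumer settings live entirely in the Flume configuration file. The following standalone sketch illustrates that mapping; the demo class name and parameter values are illustrative, not part of the original code:

import java.util.Properties;

import org.apache.flume.Context;
import org.apache.flume.source.kafka.KafkaSourceUtil;

public class KafkaSourceUtilDemo {
    public static void main(String[] args) {
        // Simulate the parameters a Flume agent would pass to the source.
        Context context = new Context();
        context.put("type", "org.apache.flume.source.kafka.KafkaSource");
        context.put("zookeeper.connect", "node3:2181,node4:2181,node5:2181");
        context.put("topic", "kkt-test-topic");
        context.put("group.id", "test");

        // "type" (and "channel", if present) are dropped; everything else is
        // passed through unchanged as Kafka consumer properties.
        Properties props = KafkaSourceUtil.getKafkaConfigProperties(context);
        System.out.println(props);
        // expected: {zookeeper.connect=..., topic=kkt-test-topic, group.id=test}
    }
}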

Configuration file (/etc/flume/conf/flume-kafka-file.properties):

agent_log.sources = kafka0
agent_log.channels = ch0
agent_log.sinks = sink0

agent_log.sources.kafka0.channels = ch0
agent_log.sinks.sink0.channel = ch0

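# custom Kafka source: ZooKeeper connect string, topic and consumer group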
agent_log.sources.kafka0.type = org.apache.flume.source.kafka.KafkaSource
agent_log.sources.kafka0.zookeeper.connect = node3:2181,node4:2181,node5:2181
agent_log.sources.kafka0.topic = kkt-test-topic
agent_log.sources.kafka0.group.id= test

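# in-memory channel buffering up to 2048 events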
agent_log.channels.ch0.type = memory
agent_log.channels.ch0.capacity = 2048
agent_log.channels.ch0.transactionCapacity = 1000

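# file_roll sink writing events to local files, rolling every 300 seconds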
agent_log.sinks.sink0.type=file_roll
agent_log.sinks.sink0.sink.directory=/data/flumeng/data/test
agent_log.sinks.sink0.sink.rollInterval=300

Startup script:

sudo su  -l -s /bin/bash  flume  -c '/usr/lib/flume/bin/flume-ng agent --conf /etc/flume/conf --conf-file /etc/flume/conf/flume-kafka-file.properties -name agent_log -Dflume.root.logger=INFO,console '

Note: the code highlighted in red in the original post adds a timestamp to the original data; the relevant lines from process() are quoted below.
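
                headers.put("timestamp", String.valueOf(System.currentTimeMillis()));
                strMessage =  String.valueOf(System.currentTimeMillis()) + "|" + new String(message.message());
                event.setBody(strMessage.getBytes());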

Versions: flume-1.4.0.2.1.1.0 + kafka_2.8.0-0.8.0

Reference: https://github.com/baniuyao/flume-kafka

Libraries used for compilation:

flume-ng-configuration-1.4.0.2.1.1.0-385
flume-ng-core-1.4.0.2.1.1.0-385
flume-ng-sdk-1.4.0.2.1.1.0-385
flume-tools-1.4.0.2.1.1.0-385
guava-11.0.2
kafka_2.8.0-0.8.0
log4j-1.2.15
scala-compiler
scala-library
slf4j-api-1.6.1
slf4j-log4j12-1.6.1
zkclient-0.3
zookeeper-3.3.4
