MapReduce小文件处理之CombineFileInputFormat实现

在MapReduce使用过程中，一般会遇到输入文件特别小（几百KB、几十MB）的情况，而Hadoop默认会为每一个文件向YARN申请一个container来启动map，container的启动和关闭是很耗时的。

Hadoop提供了CombineFileInputFormat，它是一个抽象类，作用是将多个小文件合并到一个map中。我们只需实现三个类：

CompressedCombineFileInputFormat

CompressedCombineFileRecordReader

CompressedCombineFileWritable

maven

<dependency>

        <groupId>org.apache.hadoop</groupId>

        <artifactId>hadoop-client</artifactId>

        <version>2.5.0-cdh5.2.1</version>

</dependency>

CompressedCombineFileInputFormat.java

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.JobContext;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;

import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

import java.io.IOException;

/**
 * Input format that groups several input files into one
 * {@link CombineFileSplit}, keyed by {@link CompressedCombineFileWritable}
 * with the line text as the value.
 *
 * <p>Each file of a combined split is handed to its own
 * {@link CompressedCombineFileRecordReader}; individual files are never split.
 */
public class CompressedCombineFileInputFormat
        extends CombineFileInputFormat<CompressedCombineFileWritable, Text> {

    /** Creates the input format; simply delegates to the superclass constructor. */
    public CompressedCombineFileInputFormat() {
        super();
    }

    /**
     * Wraps the combined split in a {@link CombineFileRecordReader} that
     * instantiates one {@link CompressedCombineFileRecordReader} per file.
     *
     * @param split   the split to read; must be a {@link CombineFileSplit}
     * @param context the task attempt context
     * @return a reader over every file contained in {@code split}
     * @throws IOException if the wrapping reader cannot be created
     */
    @Override
    public RecordReader<CompressedCombineFileWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException {
        CombineFileSplit combinedSplit = (CombineFileSplit) split;
        return new CombineFileRecordReader<CompressedCombineFileWritable, Text>(
                combinedSplit, context, CompressedCombineFileRecordReader.class);
    }

    /**
     * Reports every file as non-splittable, so a file is always read whole.
     *
     * @param context the job context (unused)
     * @param file    the file being considered (unused)
     * @return always {@code false}
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}

CompressedCombineFileRecordReader.java

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IOUtils;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.compress.CompressionCodec;

import org.apache.hadoop.io.compress.CompressionCodecFactory;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

import org.apache.hadoop.util.LineReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

public class CompressedCombineFileRecordReader

        extends RecordReader<CompressedCombineFileWritable, Text> {

    private long startOffset;

    private long end;

    private long pos;

    private FileSystem fs;

    private Path path;

    private Path dPath;

    private CompressedCombineFileWritable key = new CompressedCombineFileWritable();

    private Text value;

    private long rlength;

    private FSDataInputStream fileIn;

    private LineReader reader;

    public CompressedCombineFileRecordReader(CombineFileSplit split,

                                             TaskAttemptContext context, Integer index) throws IOException {

        Configuration currentConf = context.getConfiguration();

        this.path = split.getPath(index);

        boolean isCompressed = findCodec(currentConf, path);

        if (isCompressed)

            codecWiseDecompress(context.getConfiguration());

        fs = this.path.getFileSystem(currentConf);

        this.startOffset = split.getOffset(index);

        if (isCompressed) {

            this.end = startOffset + rlength;

        } else {

            this.end = startOffset + split.getLength(index);

            dPath = path;

        }

        boolean skipFirstLine = false;

        fileIn = fs.open(dPath);

        if (isCompressed) fs.deleteOnExit(dPath);

        if (startOffset != 0) {

            skipFirstLine = true;

            --startOffset;

            fileIn.seek(startOffset);

        }

        reader = new LineReader(fileIn);

        if (skipFirstLine) {

            startOffset += reader.readLine(new Text(), 0,

                    (int) Math.min((long) Integer.MAX_VALUE, end - startOffset));

        }

        this.pos = startOffset;

    }

    public void initialize(InputSplit split, TaskAttemptContext context)

            throws IOException, InterruptedException {

    }

    public void close() throws IOException {

    }

    public float getProgress() throws IOException {

        if (startOffset == end) {

            return 0.0f;

        } else {

            return Math.min(1.0f, (pos - startOffset) / (float)

                    (end - startOffset));

        }

    }

    public boolean nextKeyValue() throws IOException {

        if (key.fileName == null) {

            key = new CompressedCombineFileWritable();

            key.fileName = dPath.getName();

        }

        key.offset = pos;

        if (value == null) {

            value = new Text();

        }

        int newSize = 0;

        if (pos < end) {

            newSize = reader.readLine(value);

            pos += newSize;

        }

        if (newSize == 0) {

            key = null;

            value = null;

            return false;

        } else {

            return true;

        }

    }

    public CompressedCombineFileWritable getCurrentKey()

            throws IOException, InterruptedException {

        return key;

    }

    public Text getCurrentValue() throws IOException, InterruptedException {

        return value;

    }

    private void codecWiseDecompress(Configuration conf) throws IOException {

        CompressionCodecFactory factory = new CompressionCodecFactory(conf);

        CompressionCodec codec = factory.getCodec(path);

        if (codec == null) {

            System.err.println("No Codec Found For " + path);

            System.exit(1);

        }

        String outputUri =

                CompressionCodecFactory.removeSuffix(path.toString(),

                        codec.getDefaultExtension());

        dPath = new Path(outputUri);

        InputStream in = null;

        OutputStream out = null;

        fs = this.path.getFileSystem(conf);

        try {

            in = codec.createInputStream(fs.open(path));

            out = fs.create(dPath);

            IOUtils.copyBytes(in, out, conf);

        } finally {

            IOUtils.closeStream(in);

            IOUtils.closeStream(out);

            rlength = fs.getFileStatus(dPath).getLen();

        }

    }

    private boolean findCodec(Configuration conf, Path p) {

        CompressionCodecFactory factory = new CompressionCodecFactory(conf);

        CompressionCodec codec = factory.getCodec(path);

        if (codec == null)

            return false;

        else

            return true;

    }

}

CompressedCombineFileWritable.java

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

public class CompressedCombineFileWritable implements WritableComparable {

    public long offset;

    public String fileName;

    public CompressedCombineFileWritable() {

        super();

    }

    public CompressedCombineFileWritable(long offset, String fileName) {

        super();

        this.offset = offset;

        this.fileName = fileName;

    }

    public void readFields(DataInput in) throws IOException {

        this.offset = in.readLong();

        this.fileName = Text.readString(in);

    }

    public void write(DataOutput out) throws IOException {

        out.writeLong(offset);

        Text.writeString(out, fileName);

    }

    public int compareTo(Object o) {

        CompressedCombineFileWritable that = (CompressedCombineFileWritable) o;

        int f = this.fileName.compareTo(that.fileName);

        if (f == 0) {

            return (int) Math.signum((double) (this.offset - that.offset));

        }

        return f;

    }

    @Override

    public boolean equals(Object obj) {

        if (obj instanceof CompressedCombineFileWritable)

            return this.compareTo(obj) == 0;

        return false;

    }

    @Override

    public int hashCode() {

        final int hashPrime = 47;

        int hash = 13;

        hash = hashPrime * hash + (this.fileName != null ? this.fileName.hashCode() : 0);

        hash = hashPrime * hash + (int) (this.offset ^ (this.offset >>> 16));

        return hash;

    }

    @Override

    public String toString() {

        return this.fileName + "-" + this.offset;

    }

}

MR测试类

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.compress.CompressionCodec;

import org.apache.hadoop.io.compress.GzipCodec;

import org.apache.hadoop.mapred.lib.CombineFileInputFormat;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.MRJobConfig;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

import java.util.StringTokenizer;

public class CFWordCount extends Configured implements Tool {

    /**

     * @param args

     * @throws Exception

     */

    public static void main(String[] args) throws Exception {

        System.exit(ToolRunner.run(new Configuration(), new CFWordCount(), args));

    }

    public int run(String[] args) throws Exception {

        Configuration conf = getConf();

        conf.setLong(CombineFileInputFormat.SPLIT_MAXSIZE, 128 * 1024 * 1024);

        conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);

        conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);

        Job job = new Job(conf);

        job.setJobName("CombineFile Demo");

        job.setJarByClass(CFWordCount.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        job.setInputFormatClass(CompressedCombineFileInputFormat.class);

        job.setMapperClass(TestMapper.class);

        job.setMapOutputKeyClass(Text.class);

        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(IntSumReducer.class);

        job.setNumReduceTasks(1);

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.submit();

        job.waitForCompletion(true);

        return 0;

    }

    public static class TestMapper extends Mapper<CompressedCombineFileWritable, Text, Text, IntWritable> {

        private Text txt = new Text();

        private IntWritable count = new IntWritable(1);

        public void map(CompressedCombineFileWritable key, Text val, Context context) throws IOException, InterruptedException {

            StringTokenizer st = new StringTokenizer(val.toString());

            while (st.hasMoreTokens()) {

                txt.set(st.nextToken());

                context.write(txt, count);

            }

        }

    }

}

注意：使用CombineFileInputFormat过程中发现，不管小文件积累到多大，甚至超过HDFS BlockSize后，仍然只有一个map split。查看Hadoop的源代码发现，使用CombineFileInputFormat时，如果没有显式指定CombineFileInputFormat.SPLIT_MAXSIZE，默认不会切分map split。解决方法如下：

conf.setLong(CombineFileInputFormat.SPLIT_MAXSIZE, 128 * 1024 * 1024);

MapReduce小文件处理之CombineFileInputFormat实现的更多相关文章

MapReduce小文件优化与分区
一.小文件优化 1.Mapper类 package com.css.combine; import java.io.IOException; import org.apache.hadoop.io.I ...
Hadoop MapReduce编程 API入门系列之小文件合并（二十九）
不多说,直接上代码. Hadoop 自身提供了几种机制来解决相关的问题,包括HAR,SequeueFile和CombineFileInputFormat. Hadoop 自身提供的几种小文件合并机制 ...
[大牛翻译系列]Hadoop（17）MapReduce 文件处理：小文件
5.1 小文件大数据这个概念似乎意味着处理GB级乃至更大的文件.实际上大数据可以是大量的小文件.比如说,日志文件通常增长到MB级时就会存档.这一节中将介绍在HDFS中有效地处理小文件的技术. 技术2 ...
mapreduce 关于小文件导致任务缓慢的问题
小文件导致任务执行缓慢的原因: 1.很容易想到的是map task 任务启动太多,而每个文件的实际输入量很小,所以导致了任务缓慢这个可以通过 CombineTextInputFormat,解决,主要 ...
[转载]mapreduce合并小文件成sequencefile
mapreduce合并小文件成sequencefile http://blog.csdn.net/xiao_jun_0820/article/details/42747537
第3节 mapreduce高级：5、6、通过inputformat实现小文件合并成为sequenceFile格式
1.1 需求无论hdfs还是mapreduce,对于小文件都有损效率,实践中,又难免面临处理大量小文件的场景,此时,就需要有相应解决方案 1.2 分析小文件的优化无非以下几种方式: 1. 在数据 ...
Hadoop对小文件的解决方式
小文件指的是那些size比HDFS的block size(默认64M)小的多的文件.不论什么一个文件,文件夹和block,在HDFS中都会被表示为一个object存储在namenode的内存中, 每一 ...
基于Hadoop Sequencefile的小文件解决方案
一.概述小文件是指文件size小于HDFS上block大小的文件.这样的文件会给hadoop的扩展性和性能带来严重问题.首先,在HDFS中,任何block,文件或者目录在内存中均以对象的形式存储,每 ...
Hadoop小文件存储方案
原文地址:https://www.cnblogs.com/ballwql/p/8944025.html HDFS总体架构在介绍文件存储方案之前,我觉得有必要先介绍下关于HDFS存储架构方面的一些知识 ...

随机推荐

Codeforces 946G Almost Increasing Array （树状数组优化DP）
题目链接 Educational Codeforces Round 39 Problem G 题意给定一个序列,求把他变成Almost Increasing Array需要改变的最小元素个数. ...
腾讯消消乐（状态压缩DP）
腾讯消消乐题意给出长度为 n 的序列,每次可以选择删除序列的一个连续区间,要求这一段区间内所有数最大公约数不小于 k ,删除后剩下的序列仍然构成连续序列. 定义 f(i) 为进行 i 次操作将整个 ...
背包【p1858】多人背包(次优解 or 第k优解)
题目描述--->p1858 多人背包分析: 很明显,这题是背包问题的一种变形. 求解次优解or第k优解. 表示刚开始有点懵,看题解也看不太懂. 又中途去补看了一下背包九讲然后感觉有些理解, ...
LINUX 下mysql导出数据、表结构
1.首先要确认mysqldump命令所在路径例如,我的在:/usr/bin/ 下 [root@sf105113 bin]# which mysqldump /usr/bin/mysqldump 2. ...
微服务实施Spring Boot/Spring Cloud中踩过的坑（转）
http://tietang.wang/2016/09/08/%E5%BE%AE%E6%9C%8D%E5%8A%A1/%E5%BE%AE%E6%9C%8D%E5%8A%A1%E5%AE%9E%E6%9 ...
tiny4412 串口驱动分析四 --- 修改默认的串口输出
作者:彭东林邮箱:pengdonglin137@163.com 开发板:tiny4412ADK+S700 4GB Flash 主机:Wind7 64位虚拟机:Vmware+Ubuntu12_04 ...
onWebView检查网页中文
问题:要检查网页中的一段文本: 开始我是这样写的: private final static String SPECIFIED_TEXT = "这个是一段中文"; onWebVie ...
mysql开发之join语句学习
内连接:inner join -- 全外链接:full outer 左外连接:left outer 右外连接:right outer 交叉连接:cross内连接,两个表中重复部分全外连接,两个表所有字 ...
ES里关于数组的拓展
一.静态方法在ES6以前,创建数组的方式主要有两种,一种是调用Array构造函数,另一种是用数组字面量语法,这两种方法均需列举数组中的元素,功能非常受限.如果想将一个类数组对象(具有数值型索引和le ...
【Hadoop】如何形象描述大数据生态？
作者:千岁大王链接:https://www.zhihu.com/question/27974418/answer/39845635来源:知乎著作权归作者所有,转载请联系作者获得授权. Google内部 ...

MapReduce小文件处理之CombineFileInputFormat实现

MapReduce小文件处理之CombineFileInputFormat实现的更多相关文章

随机推荐

热门专题