When running MapReduce jobs you often have to deal with very small input files (a few hundred KB to a few dozen MB each). By default, Hadoop creates at least one map task per input file, so a YARN container is requested, started, and torn down for every single small file, and container startup and shutdown are expensive.

Hadoop provides CombineFileInputFormat, an abstract class whose purpose is to pack multiple small files into a single map task. To use it here we only need to implement three classes:

CompressedCombineFileInputFormat

CompressedCombineFileRecordReader

CompressedCombineFileWritable

Maven dependency

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.5.0-cdh5.2.1</version>
</dependency>

CompressedCombineFileInputFormat.java

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

import java.io.IOException;

/**
 * InputFormat that packs many (possibly compressed) small files into one
 * CombineFileSplit and hands each file to a CompressedCombineFileRecordReader.
 */
public class CompressedCombineFileInputFormat
        extends CombineFileInputFormat<CompressedCombineFileWritable, Text> {

    public CompressedCombineFileInputFormat() {
        super();
    }

    @Override
    public RecordReader<CompressedCombineFileWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException {
        // CombineFileRecordReader creates one CompressedCombineFileRecordReader per file in the split.
        return new CombineFileRecordReader<CompressedCombineFileWritable, Text>(
                (CombineFileSplit) split, context, CompressedCombineFileRecordReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Each small (and possibly compressed) file is read whole; never split a single file.
        return false;
    }
}

CompressedCombineFileRecordReader.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

/**
 * Reads one file of a CombineFileSplit line by line. If the file is compressed,
 * it is first decompressed to a sibling file on the same FileSystem and the
 * decompressed copy is read instead.
 */
public class CompressedCombineFileRecordReader
        extends RecordReader<CompressedCombineFileWritable, Text> {

    private long startOffset;
    private long end;
    private long pos;
    private FileSystem fs;
    private Path path;                 // original (possibly compressed) file
    private Path dPath;                // file actually being read (decompressed copy if needed)
    private CompressedCombineFileWritable key = new CompressedCombineFileWritable();
    private Text value;
    private long rlength;              // length of the decompressed file
    private FSDataInputStream fileIn;
    private LineReader reader;

    public CompressedCombineFileRecordReader(CombineFileSplit split,
            TaskAttemptContext context, Integer index) throws IOException {
        Configuration currentConf = context.getConfiguration();
        this.path = split.getPath(index);
        boolean isCompressed = findCodec(currentConf, path);
        if (isCompressed) {
            codecWiseDecompress(context.getConfiguration());
        }

        fs = this.path.getFileSystem(currentConf);
        this.startOffset = split.getOffset(index);

        if (isCompressed) {
            this.end = startOffset + rlength;
        } else {
            this.end = startOffset + split.getLength(index);
            dPath = path;
        }

        boolean skipFirstLine = false;
        fileIn = fs.open(dPath);
        if (isCompressed) {
            // The decompressed copy is temporary; remove it when the task JVM exits.
            fs.deleteOnExit(dPath);
        }
        if (startOffset != 0) {
            // Not at the start of the file: the previous reader owns the line that
            // straddles the boundary, so skip the first (partial) line.
            skipFirstLine = true;
            --startOffset;
            fileIn.seek(startOffset);
        }
        reader = new LineReader(fileIn);
        if (skipFirstLine) {
            startOffset += reader.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, end - startOffset));
        }
        this.pos = startOffset;
    }

    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // All setup is done in the constructor, which CombineFileRecordReader calls per file.
    }

    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        }
    }

    public float getProgress() throws IOException {
        if (startOffset == end) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (pos - startOffset) / (float) (end - startOffset));
        }
    }

    public boolean nextKeyValue() throws IOException {
        if (key.fileName == null) {
            key = new CompressedCombineFileWritable();
            key.fileName = dPath.getName();
        }
        key.offset = pos;
        if (value == null) {
            value = new Text();
        }
        int newSize = 0;
        if (pos < end) {
            newSize = reader.readLine(value);
            pos += newSize;
        }
        if (newSize == 0) {
            key = null;
            value = null;
            return false;
        } else {
            return true;
        }
    }

    public CompressedCombineFileWritable getCurrentKey()
            throws IOException, InterruptedException {
        return key;
    }

    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /** Decompresses this.path next to the original file and records the decompressed length. */
    private void codecWiseDecompress(Configuration conf) throws IOException {
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);
        if (codec == null) {
            System.err.println("No Codec Found For " + path);
            System.exit(1);
        }
        String outputUri = CompressionCodecFactory.removeSuffix(path.toString(),
                codec.getDefaultExtension());
        dPath = new Path(outputUri);

        InputStream in = null;
        OutputStream out = null;
        fs = this.path.getFileSystem(conf);
        try {
            in = codec.createInputStream(fs.open(path));
            out = fs.create(dPath);
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
            rlength = fs.getFileStatus(dPath).getLen();
        }
    }

    /** Returns true if a compression codec is registered for the file's extension. */
    private boolean findCodec(Configuration conf, Path p) {
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(p);
        return codec != null;
    }
}

CompressedCombineFileWritable.java

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Map input key: the name of the small file a line came from plus the byte
 * offset of that line, replacing the usual LongWritable offset key.
 */
public class CompressedCombineFileWritable
        implements WritableComparable<CompressedCombineFileWritable> {

    public long offset;
    public String fileName;

    public CompressedCombineFileWritable() {
        super();
    }

    public CompressedCombineFileWritable(long offset, String fileName) {
        super();
        this.offset = offset;
        this.fileName = fileName;
    }

    public void readFields(DataInput in) throws IOException {
        this.offset = in.readLong();
        this.fileName = Text.readString(in);
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(offset);
        Text.writeString(out, fileName);
    }

    public int compareTo(CompressedCombineFileWritable that) {
        // Order by file name first, then by offset within the file.
        int f = this.fileName.compareTo(that.fileName);
        if (f == 0) {
            return (int) Math.signum((double) (this.offset - that.offset));
        }
        return f;
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof CompressedCombineFileWritable) {
            return this.compareTo((CompressedCombineFileWritable) obj) == 0;
        }
        return false;
    }

    @Override
    public int hashCode() {
        final int hashPrime = 47;
        int hash = 13;
        hash = hashPrime * hash + (this.fileName != null ? this.fileName.hashCode() : 0);
        hash = hashPrime * hash + (int) (this.offset ^ (this.offset >>> 32));
        return hash;
    }

    @Override
    public String toString() {
        return this.fileName + "-" + this.offset;
    }
}

MR test class: CFWordCount.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.StringTokenizer;

/** Word-count driver used to verify that small files are combined into few map tasks. */
public class CFWordCount extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new CFWordCount(), args));
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        // Cap each combined split at 128 MB; without this, everything ends up in a single split.
        conf.setLong(CombineFileInputFormat.SPLIT_MAXSIZE, 128 * 1024 * 1024);
        // Compress intermediate map output with gzip.
        conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);

        Job job = Job.getInstance(conf);
        job.setJobName("CombineFile Demo");
        job.setJarByClass(CFWordCount.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(CompressedCombineFileInputFormat.class);
        job.setMapperClass(TestMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(IntSumReducer.class);
        job.setNumReduceTasks(1);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static class TestMapper
            extends Mapper<CompressedCombineFileWritable, Text, Text, IntWritable> {
        private Text txt = new Text();
        private IntWritable count = new IntWritable(1);

        @Override
        public void map(CompressedCombineFileWritable key, Text val, Context context)
                throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(val.toString());
            while (st.hasMoreTokens()) {
                txt.set(st.nextToken());
                context.write(txt, count);
            }
        }
    }
}
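
With the classes above packaged into a jar, the driver is submitted in the usual way, for example `hadoop jar <your-jar> CFWordCount <input dir> <output dir>` (the jar name and paths are placeholders). With many small files under the input directory, the job counters should show far fewer map tasks than input files.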

Note: while using CombineFileInputFormat I found that no matter how much small-file data accumulated, even well past the HDFS block size, there was still only one map split. Reading the Hadoop source shows that when CombineFileInputFormat is used without explicitly setting CombineFileInputFormat.SPLIT_MAXSIZE, it does not break the input into multiple map splits at all. The fix is:

conf.setLong(CombineFileInputFormat.SPLIT_MAXSIZE, 128 * 1024 * 1024);
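
Equivalently, the limit can be applied through the new-API FileInputFormat helper instead of setting the property by hand. Below is a minimal sketch, assuming the standard org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMaxInputSplitSize helper; the SplitSizeConfig wrapper class is hypothetical and the 128 MB figure is just the value used above:

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeConfig {
    // Sets mapreduce.input.fileinputformat.split.maxsize on the job's configuration,
    // the same property that CombineFileInputFormat.SPLIT_MAXSIZE names.
    public static void applyMaxSplitSize(Job job) {
        FileInputFormat.setMaxInputSplitSize(job, 128L * 1024 * 1024);
    }
}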
