在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数

最近开始使用MapReduce，发现网上大部分示例都是对文本数据进行处理的，也就是说在读取输入数据时直接使用默认的TextInputFormat即可。对于文本数据处理，这个类能满足一部分应用场景。但是如果要处理以二进制形式存储结构化记录的文件，这个类就不再适用了。

本文以一个简单的应用场景为例：对按照二进制格式存储的整数做频数统计。当然，也可以在此基础上实现排序之类的其他应用。实现该应用的主要难点就是如何处理输入数据。参考《Hadoop权威指南（第三版）》得知，需要继承FileInputFormat这个类，并实现以下三个方法：

class MyInputFormat extends FileInputFormat<Type1, Type2> {

	/*
	 * Reports whether the given file may be divided into several splits:
	 * "true" means it may be split, "false" means it is not split.
	 * (Outline only: the method body is left empty here.)
	 */

	protected boolean isSplitable(Configuration conf, Path path) {

	}

	/*
	 * The MapReduce client calls this method to obtain all splits and then
	 * sends them to the MapReduce server side.
	 * Note that a split does not contain the actual data, only a description
	 * of it: the path of the file the split belongs to, the split's start
	 * offset within that file, the split's length, and the list of machines
	 * that hold the corresponding data. Implementing this method means
	 * filling in that information.
	 * (Outline only: the method body is left empty here.)
	 */

	public List<InputSplit> getSplits(Configuration conf) throws IOException {

	}

	/*
	 * A RecordReader produces the sequence of key/value pairs handed to the
	 * map function. It is given two arguments: a split and the job's
	 * configuration information (context).
	 * The Mapper's run function shows how the framework drives a map task:
	 *
	 * public void run(Context context) throws IOException, InterruptedException {
	 * 		setup(context);
	 * 		// Calls the RecordReader's nextKeyValue to produce the next pair.
	 * 		// Once the current split is exhausted nextKeyValue returns false
	 * 		// and run exits.
	 *		while (context.nextKeyValue()) {
	 *			map(context.getCurrentKey(), context.getCurrentValue(), context);
	 *		}
	 *		cleanup(context);
	 * }
	 *
	 * (Outline only: the method body is left empty here.)
	 */

	public RecordReader<LongWritable, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)

			throws IOException, InterruptedException {

	}

}

在RecordReader的子类中需要实现以下几个方法：

public class BinRecordReader extends RecordReader<LongWritable, IntWritable> {

	/*
	 * Closes the file stream.
	 * (Outline only: the method body is left empty here.)
	 */

	public void close() {}

	/*
	 * Returns the processing progress.
	 */

	public float getProgress() {}

	/*
	 * Returns the current key.
	 */

	public LongWritable getCurrentKey() throws IOException,

	InterruptedException {}

	/*
	 * Returns the current value.
	 */

	public IntWritable getCurrentValue() throws IOException,InterruptedException {}

	/*
	 * Performs initialisation: opens the file stream and sets the start
	 * position, length and so on from the split information.
	 */

	public void initialize(InputSplit inputSplit, TaskAttemptContext context)

			throws IOException, InterruptedException {}

	/*
	 * Produces the next key/value pair.
	 */

	public boolean nextKeyValue() throws IOException, InterruptedException {

	}

}

下面是三个文件的代码，首先是BinInputFormat.java的代码：

package org.apache.hadoop.examples;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileStatus;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.fs.BlockLocation;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.examples.BinRecordReader;

class BinInputFormat extends FileInputFormat<LongWritable, IntWritable> {

	private static final double SPLIT_SLOP=1.1;

	/*

	 * 查询推断当前文件能否够分块？"true"为能够分块，"false"表示不进行分块

	 */

	protected boolean isSplitable(Configuration conf, Path path) {

		return true;

	}

	/*

	 * MapReduce的client调用此方法得到全部的分块，然后将分块发送给MapReduce服务端。

	 * 注意，分块中不包括实际的信息，而仅仅是对实际信息的分块信息。详细的说，每一个分块中

	 * 包括当前分块相应的文件路径，当前分块在该文件里起始位置，当前分块的长度以及相应的

	 * 实际数据所在的机器列表。在实现这个函数时，将这些信息填上就可以。

	 * */

	public List<InputSplit> getSplits(Configuration conf) throws IOException {

		List<InputSplit> splits = new ArrayList<InputSplit>();

		long minSplitSize = conf.getLong("mapred.min.split.size",1);

		long maxSplitSize = conf.getLong("mapred.max.split.size", 1);

		long blockSize = conf.getLong("dfs.block.size",1);

		long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));

		FileSystem fs = FileSystem.get(conf);

		String path = conf.get(INPUT_DIR);

		FileStatus[] files = fs.listStatus(new Path(path));

		for (int fileIndex = 0; fileIndex < files.length; fileIndex++) {

			FileStatus file = files[fileIndex];

			System.out.println("input file: " + file.getPath().toString());

			long length = file.getLen();

			FileSystem fsin = file.getPath().getFileSystem(conf);

		    BlockLocation[] blkLocations = fsin.getFileBlockLocations(file, 0, length);

		    if ((length != 0) && isSplitable(conf, file.getPath())) {

		        long bytesRemaining = length;

		        while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {

		          int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);

		          splits.add(new FileSplit(file.getPath(), length-bytesRemaining, splitSize,

		                                   blkLocations[blkIndex].getHosts()));

		          bytesRemaining -= splitSize;

		        }

		        if (bytesRemaining != 0) {

		          splits.add(new FileSplit(file.getPath(), length-bytesRemaining, bytesRemaining,

		                     blkLocations[blkLocations.length-1].getHosts()));

		        }

		      } else if (length != 0) {

		        splits.add(new FileSplit(file.getPath(), 0, length, blkLocations[0].getHosts()));

		      } else {

		        //Create empty hosts array for zero length files

		        splits.add(new FileSplit(file.getPath(), 0, length, new String[0]));

		      }

		}

		return splits;

	}

	/*

	 * 类RecordReader是用来创建传给map函数的Key-Value序列，传给此类的參数有两个：一个分块(split)和作业的配置信息(context).

	 * 在Mapper的run函数中能够看到MapReduce框架运行Map的逻辑：

	 * public void run(Context context) throws IOException, InterruptedException {

	 * 		setup(context);

	 * 		调用RecordReader方法的nextKeyValue，生成新的键值对。假设当前分块(Split)中已经处理完成了，则nextKeyValue会返回false.退出run函数

	 *		while (context.nextKeyValue()) {

	 *			map(context.getCurrentKey(), context.getCurrentValue(), context);

	 *		}

	 *		cleanup(context);

	 * }

	 **/

	public RecordReader<LongWritable, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)

			throws IOException, InterruptedException {

		// TODO Auto-generated method stub

		BinRecordReader reader = new BinRecordReader();

		reader.initialize(split,context);

		return reader;

	}

}

下面为BinRecordReader.java的代码：

package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.mapreduce.RecordReader;

/**

 * Return a single record (filename, "") where the filename is taken from

 * the file split.

 */

public class BinRecordReader extends RecordReader<LongWritable, IntWritable> {

	private FSDataInputStream inputStream = null;

	private long start,end,pos;

	private Configuration conf = null;

	private FileSplit fileSplit = null;

	private LongWritable key = new LongWritable();

	private IntWritable value = new IntWritable();

	private boolean processed = false;

	public BinRecordReader() throws IOException {

	}

	/*关闭文件流

	 * */

	public void close() {

		try {

			if(inputStream != null)

				inputStream.close();

		} catch (IOException e) {

			// TODO Auto-generated catch block

			e.printStackTrace();

		}

	}

	/*

	 * 获取处理进度

	 **/

	public float getProgress() {

		return ((processed == true)? 1.0f : 0.0f);

	}

	/*

	 * 获取当前的Key

	 * */

	public LongWritable getCurrentKey() throws IOException,

	InterruptedException {

		// TODO Auto-generated method stub

		return key;

	}

	/* 获取当前的Value

	 * */

	public IntWritable getCurrentValue() throws IOException,InterruptedException {

		// TODO Auto-generated method stub

		return value;

	}

	/*

	 * 进行初始化工作，打开文件流，依据分块信息设置起始位置和长度等等

	 * */

	public void initialize(InputSplit inputSplit, TaskAttemptContext context)

			throws IOException, InterruptedException {

		// TODO Auto-generated method stub

		fileSplit = (FileSplit)inputSplit;

		conf = context.getConfiguration();

		this.start = fileSplit.getStart();

		this.end = this.start + fileSplit.getLength();

		try{

			Path path = fileSplit.getPath();

			FileSystem fs = path.getFileSystem(conf);

			this.inputStream = fs.open(path);

			inputStream.seek(this.start);

			this.pos = this.start;

		}	catch(IOException e)	{

			e.printStackTrace();

		}

	}

	/*生成下一个键值对

	 **/

	public boolean nextKeyValue() throws IOException, InterruptedException {

		// TODO Auto-generated method stub

		if(this.pos < this.end) {

			key.set(this.pos);

			value.set(Integer.reverseBytes(inputStream.readInt()));

			this.pos = inputStream.getPos();

			return true;

		} else {

			processed = true;

			return false;

		}

	}

}

下面是主文件IntCount.java的代码（文件名需要与其中的public类IntCount保持一致）：

package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.examples.BinInputFormat;

public class IntCount {

	public static class TokenizerMapper extends Mapper<LongWritable, IntWritable, Text, IntWritable>{

		private final static IntWritable one = new IntWritable(1);

		private Text intNum = new Text();                             

		public void map(LongWritable key, IntWritable value, Context context

				) throws IOException, InterruptedException {

			intNum.set(Integer.toString(value.get()));

			context.write(intNum, one);

		}

	}

	public static class IntSumReducer

	extends Reducer<Text,IntWritable,Text,IntWritable> {

		private IntWritable result = new IntWritable();              

		public void reduce(Text key, Iterable<IntWritable> values,

				Context context

				) throws IOException, InterruptedException {

			int sum = 0;

			for (IntWritable val : values) {

				sum += val.get();                                         

			}

			result.set(sum);

			context.write(key, result);

		}

	}

	public static void main(String[] args) throws Exception {

		System.out.println("testing1");

		Configuration conf = new Configuration();

		String[] newArgs = new String[]{"hdfs://localhost:9000/read","hdfs://localhost:9000/data/wc_output21"};

		String[] otherArgs = new GenericOptionsParser(conf, newArgs).getRemainingArgs();

		if (otherArgs.length != 2) {

			System.err.println("Usage: wordcount <in> <out>");

			System.exit(2);

		}

		Job job = new Job(conf, "IntCount");

		job.setJarByClass(IntCount.class);

		job.setMapperClass(TokenizerMapper.class);

		job.setCombinerClass(IntSumReducer.class);

		job.setReducerClass(IntSumReducer.class);

		//设置自己定义的输入类

		job.setInputFormatClass(BinInputFormat.class);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}

}

接着我们用一段C语言程序生成以二进制格式存储的文件，C语言代码如下：

#include<stdio.h>

/*
 * Writes the integers 0..9 ten times over (100 values in total) to the file
 * "tmpfile" as raw ints, i.e. in the size and byte order of the machine that
 * runs this program.
 *
 * Returns 0 on success and 1 if the file cannot be opened, written or closed.
 */
int main(){
	FILE * fp = fopen("tmpfile","wb");
	int i,j;

	/* Without this check a failed fopen would be passed on to fwrite/fclose. */
	if (fp == NULL) {
		perror("tmpfile");
		return 1;
	}

	for(i=0;i<10;i++) {
		for(j=0;j<10;j++) {
			if (fwrite(&j,sizeof(int),1,fp) != 1) {
				perror("fwrite");
				fclose(fp);
				return 1;
			}
		}
	}

	/* Buffered data is flushed on close, so a write error can surface here. */
	if (fclose(fp) != 0) {
		perror("fclose");
		return 1;
	}
	return 0;
}

将生成的文件复制到HDFS的/read/目录下，接着启动IntCount这个MapReduce程序，然后查看运行结果：

在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数的更多相关文章

实现Square类，让其继承自Rectangle类，并在Square类增添新属性和方法，在2的基础上，在Square类中重写Rectangle类中的初始化和打印方法
实现Square类,让其继承自Rectangle类,并在Square类增添新属性和方法,在2的基础上,在Square类中重写Rectangle类中的初始化和打印方法 #import <Found ...
hadoop中典型Writable类详解
本文地址:http://www.cnblogs.com/archimedes/p/hadoop-writable.html,转载请注明源地址. Hadoop将很多Writable类归入org.apac ...
Hadoop Mapreduce 中的FileInputFormat类的文件切分算法和host选择算法
文件切分算法文件切分算法主要用于确定InputSplit的个数以及每个InputSplit对应的数据段. FileInputFormat以文件为单位切分成InputSplit.对于每个文件,由以下三 ...
hadoop中Text类与 java中String类的区别
hadoop 中的Text类与java中的String类感觉上用法是相似的,但两者在编码格式和访问方式上还是有些差别的,要说明这个问题,首先得了解几个概念: 字符集: 是一个系统支持的所有抽象字符的 ...
Hadoop中序列化与Writable接口
学习笔记,整理自<Hadoop权威指南第3版> 一.序列化序列化:序列化是将内存中的结构化数据转化为能在网络上传输或磁盘中进行永久保存的二进制流的过程:反序列化:序列化的逆 ...
java之线程（线程的创建方式、java中的Thread类、线程的同步、线程的生命周期、线程之间的通信）
CPU:10核主频100MHz 1核主频 3GHz 那么哪一个CPU比较好呢? CPU核不是越多越好吗?并不一定.主频用于衡量GPU处理速度的快慢,举个例子10头牛运送货物快还是1架飞机运 ...
Hadoop中Writable类之四
1.定制Writable类型 Hadoop中有一套Writable实现,例如:IntWritable.Text等,但是,有时候可能并不能满足自己的需求,这个时候,就需要自己定制Writable类型. ...
WordCount作业提交到FileInputFormat类中split切分算法和host选择算法过程源码分析
参考 FileInputFormat类中split切分算法和host选择算法介绍以及 Hadoop2.6.0的FileInputFormat的任务切分原理分析(即如何控制FileInputForm ...
【转载】 C++多继承中重写不同基类中相同原型的虚函数
本篇随笔为转载,原文地址:C++多继承中重写不同基类中相同原型的虚函数. 在C++多继承体系当中,在派生类中可以重写不同基类中的虚函数.下面就是一个例子: class CBaseA { public: ...

随机推荐

java基础74 XML解析中的SAX解析相关知识点（网页知识）
1.SAX解析工具 SAX解析工具:是Sun公司提供的,内置JDK中.org.xml.sax.* 点击查看: DOM解析相关知识:以及DOM和SAX解析的原理(区别) 2.SAX解析的 ...
AdvStringGrid 标题头
标题头内容: 字体: 标题头高度: 头的对齐方式:
SQL行列转换的另一种方法
create table tb(姓名 varchar(10) , 课程 varchar(10) , 分数 int)insert into tb values('张三' , '语文' , 74)inse ...
编程六月定律 | 外刊IT评论网
编程六月定律上周,我被迫对一个很老的项目做一些修改.麻烦是,当开始着手时,我真的记不清这个项目究竟有多老了. 这实际上是我使用Codeigniter实现的第一个MVC项目.打开项目文件后,很多东西都 ...
Python中super的应用
约定单继承多继承 super 是个类多继承中 super 的工作方式参考资料约定在开始之前我们来约定一下本文所使用的 Python 版本.默认用的是 Python 3,也就是说:本文所定义 ...
MP3 Fuzz学习
这篇文章主要是学习一波MP3格式fuzz的知识.目录如下 0x0.MP3格式的构成 0x0.MP3格式的构成 MP3是一种通俗叫法,学名叫MPEG1 Layer-3.MP3是三段式的结构,依次由ID3 ...
第一篇CodeIgniter框架的下载及安装
初次学习Php,网上搜了很多php框架,最后选择了CodeIgniter. 安装环境:php5+mysql6.5+iis7 我的电脑是用来办公写文档用的,win7系统,不想换系统,所以就安装了win7 ...
js与jquery的动态加载脚本文件
jquery动态加载 jQuery.getScript(url,[callback]) js动态加载 function loadJs(name) { document.write('<scrip ...
Keras中RNN不定长输入的处理--padding and masking
在使用RNN based model处理序列的应用中,如果使用并行运算batch sample,我们几乎一定会遇到变长序列的问题. 通常解决变长的方法主要是将过长的序列截断,将过短序列用0补齐到一个固 ...
java 基础类库之 SQLFun
package com.exjor.webdemo; import java.sql.Timestamp; import java.util.Date; public class SQLFun { / ...

在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数

在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数的更多相关文章

随机推荐

热门专题