在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数

最近开始使用MapReduce，发现网上大部分示例都是对文本数据进行处理的，也就是说在读取输入数据时直接使用默认的TextInputFormat即可。对于文本数据，这个类能够满足一部分应用场景；但如果要处理以二进制形式存储的结构化记录文件，这些类就不再适用了。

本文以一个简单的应用场景为例：对按照二进制格式存储的整数做频数统计。当然，也可以在此基础上实现排序之类的其他应用。实现该应用的主要难点是如何处理输入数据。参考《Hadoop权威指南（第3版）》可知，需要继承FileInputFormat这个类，并实现以下三个方法：

/*
 * Skeleton of a custom input format: the three methods a FileInputFormat
 * subclass is expected to provide. Bodies are intentionally left empty here.
 * Type1/Type2 appear to be placeholders for the key and value types.
 */
class MyInputFormat extends FileInputFormat<Type1, Type2> {

	/*
	 * Reports whether the given file may be divided into splits:
	 * "true" means it may be split, "false" means it is processed as a whole.
	 */

	protected boolean isSplitable(Configuration conf, Path path) {

	}

	/*
	 * The MapReduce client calls this method to obtain all splits and then
	 * sends them to the MapReduce server side.
	 * A split does not contain the actual data, only a description of where the
	 * data lives: the file path, the start offset of the split within that
	 * file, the length of the split, and the list of hosts that store the
	 * underlying data. Implementing this method means filling in those fields.
	 *
	 * NOTE(review): Hadoop's new-API FileInputFormat takes a JobContext in
	 * getSplits/isSplitable; confirm that these Configuration-based signatures
	 * are the ones the framework actually invokes.
	 */

	public List<InputSplit> getSplits(Configuration conf) throws IOException {

	}

	/*
	 * A RecordReader produces the sequence of key-value pairs passed to map().
	 * It is given two arguments: a split and the task context, which carries
	 * the job configuration.
	 * The Mapper's run() method shows how the framework drives a map task:
	 *
	 * public void run(Context context) throws IOException, InterruptedException {
	 * 		setup(context);
	 * 		// Each iteration asks the RecordReader (via nextKeyValue) for a new
	 * 		// pair; once the current split is exhausted nextKeyValue returns
	 * 		// false and run() finishes.
	 *		while (context.nextKeyValue()) {
	 *			map(context.getCurrentKey(), context.getCurrentValue(), context);
	 *		}
	 *		cleanup(context);
	 * }
	 */

	public RecordReader<LongWritable, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)

			throws IOException, InterruptedException {

	}

}

在RecordReader的子类中需要实现以下几个方法：

/*
 * Skeleton of the record reader: lists the methods a RecordReader subclass
 * has to provide. Bodies are intentionally left empty here.
 */
public class BinRecordReader extends RecordReader<LongWritable, IntWritable> {

	/*
	 * Closes the file stream.
	 */

	public void close() {}

	/*
	 * Returns the processing progress.
	 */

	public float getProgress() {}

	/*
	 * Returns the current key.
	 */

	public LongWritable getCurrentKey() throws IOException,

	InterruptedException {}

	/*
	 * Returns the current value.
	 */

	public IntWritable getCurrentValue() throws IOException,InterruptedException {}

	/*
	 * Performs initialization: opens the file stream and sets the start
	 * position, length, etc. according to the split information.
	 */

	public void initialize(InputSplit inputSplit, TaskAttemptContext context)

			throws IOException, InterruptedException {}

	/*
	 * Produces the next key-value pair.
	 */

	public boolean nextKeyValue() throws IOException, InterruptedException {

	}

}

以下是三个文件的代码，首先是BinInputFormat.java的代码：

package org.apache.hadoop.examples;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileStatus;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.fs.BlockLocation;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.examples.BinRecordReader;

/**
 * Input format for files that contain a flat sequence of fixed-width (4-byte)
 * binary integers. Keys are {@link LongWritable} byte offsets and values are
 * {@link IntWritable} integers, as produced by {@link BinRecordReader}.
 *
 * <p>NOTE(review): the new-API {@code FileInputFormat} declares
 * {@code isSplitable(JobContext, Path)} and {@code getSplits(JobContext)}. The
 * {@code Configuration}-based methods below have different parameter types, so
 * they appear to be overloads that the framework will not invoke on its own.
 * Confirm against the Hadoop version in use and add {@code JobContext}
 * overrides if this split logic is meant to take effect.
 */
class BinInputFormat extends FileInputFormat<LongWritable, IntWritable> {

	/** A trailing chunk up to 10% larger than the split size is kept as one split. */
	private static final double SPLIT_SLOP=1.1;

	/** Width in bytes of one stored record; the reader consumes records with readInt(). */
	private static final long RECORD_SIZE = 4;

	/**
	 * Reports whether a file may be divided into several splits.
	 *
	 * @param conf job configuration (unused)
	 * @param path file under consideration (unused)
	 * @return always {@code true}: records are fixed-width, so any record-aligned
	 *         offset is a valid split boundary
	 */
	protected boolean isSplitable(Configuration conf, Path path) {
		return true;
	}

	/**
	 * Builds the list of splits for every entry directly under the configured
	 * input directory. A split carries no data, only the file path, the start
	 * offset, the length and the hosts that store the corresponding block.
	 *
	 * <p>Split boundaries are always multiples of {@link #RECORD_SIZE} so that no
	 * integer is cut in half between two map tasks.
	 *
	 * @param conf job configuration; reads the input directory and the
	 *             min/max split size and block size settings
	 * @return one or more {@link FileSplit}s per listed file
	 * @throws IOException if no input directory is configured, the directory
	 *                     cannot be listed, or the file system fails
	 */
	public List<InputSplit> getSplits(Configuration conf) throws IOException {
		String inputDir = conf.get(INPUT_DIR);
		if (inputDir == null) {
			throw new IOException("No input paths specified in job");
		}

		long minSplitSize = conf.getLong("mapred.min.split.size", 1);
		// An unset maximum must not constrain the split size. The previous default
		// of 1 collapsed every split to a single byte.
		long maxSplitSize = conf.getLong("mapred.max.split.size", Long.MAX_VALUE);

		// Resolve the file system from the path itself so that an input directory
		// outside the default file system is handled as well.
		Path inputPath = new Path(inputDir);
		FileSystem fs = inputPath.getFileSystem(conf);
		FileStatus[] files = fs.listStatus(inputPath);
		if (files == null) {
			throw new IOException("Input path does not exist: " + inputPath);
		}

		List<InputSplit> splits = new ArrayList<InputSplit>();
		for (FileStatus file : files) {
			Path filePath = file.getPath();
			System.out.println("input file: " + filePath.toString());

			long length = file.getLen();
			if (length == 0) {
				//Create empty hosts array for zero length files
				splits.add(new FileSplit(filePath, 0, length, new String[0]));
				continue;
			}

			FileSystem fsin = filePath.getFileSystem(conf);
			BlockLocation[] blkLocations = fsin.getFileBlockLocations(file, 0, length);

			if (!isSplitable(conf, filePath)) {
				splits.add(new FileSplit(filePath, 0, length, blkLocations[0].getHosts()));
				continue;
			}

			// Fall back to the file's own block size when none is configured.
			long blockSize = conf.getLong("dfs.block.size", file.getBlockSize());
			long splitSize = recordAlignedSplitSize(minSplitSize, maxSplitSize, blockSize);

			long bytesRemaining = length;
			while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
				long offset = length - bytesRemaining;
				int blkIndex = getBlockIndex(blkLocations, offset);
				splits.add(new FileSplit(filePath, offset, splitSize,
						blkLocations[blkIndex].getHosts()));
				bytesRemaining -= splitSize;
			}

			if (bytesRemaining != 0) {
				splits.add(new FileSplit(filePath, length - bytesRemaining, bytesRemaining,
						blkLocations[blkLocations.length - 1].getHosts()));
			}
		}
		return splits;
	}

	/**
	 * Computes max(minSize, min(maxSize, blockSize)) and rounds it down to a
	 * multiple of the record width, never returning less than one record.
	 */
	private static long recordAlignedSplitSize(long minSize, long maxSize, long blockSize) {
		long size = Math.max(minSize, Math.min(maxSize, blockSize));
		long aligned = size - (size % RECORD_SIZE);
		return Math.max(RECORD_SIZE, aligned);
	}

	/**
	 * Creates the reader that turns a split into the key-value pairs passed to
	 * map(). The reader is returned uninitialized: the framework calls
	 * {@code initialize(split, context)} itself before using it, and
	 * initializing here as well would open the file a second time.
	 *
	 * @param split   the split to be read (passed to the reader by the framework)
	 * @param context task context (passed to the reader by the framework)
	 * @return a new, not yet initialized {@link BinRecordReader}
	 */
	public RecordReader<LongWritable, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
			throws IOException, InterruptedException {
		return new BinRecordReader();
	}

}

下面为BinRecordReader.java的代码：

package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.mapreduce.RecordReader;

/**

 * Return a single record (filename, "") where the filename is taken from

 * the file split.

 */

public class BinRecordReader extends RecordReader<LongWritable, IntWritable> {

	private FSDataInputStream inputStream = null;

	private long start,end,pos;

	private Configuration conf = null;

	private FileSplit fileSplit = null;

	private LongWritable key = new LongWritable();

	private IntWritable value = new IntWritable();

	private boolean processed = false;

	public BinRecordReader() throws IOException {

	}

	/*关闭文件流

	 * */

	public void close() {

		try {

			if(inputStream != null)

				inputStream.close();

		} catch (IOException e) {

			// TODO Auto-generated catch block

			e.printStackTrace();

		}

	}

	/*

	 * 获取处理进度

	 **/

	public float getProgress() {

		return ((processed == true)? 1.0f : 0.0f);

	}

	/*

	 * 获取当前的Key

	 * */

	public LongWritable getCurrentKey() throws IOException,

	InterruptedException {

		// TODO Auto-generated method stub

		return key;

	}

	/* 获取当前的Value

	 * */

	public IntWritable getCurrentValue() throws IOException,InterruptedException {

		// TODO Auto-generated method stub

		return value;

	}

	/*

	 * 进行初始化工作，打开文件流，依据分块信息设置起始位置和长度等等

	 * */

	public void initialize(InputSplit inputSplit, TaskAttemptContext context)

			throws IOException, InterruptedException {

		// TODO Auto-generated method stub

		fileSplit = (FileSplit)inputSplit;

		conf = context.getConfiguration();

		this.start = fileSplit.getStart();

		this.end = this.start + fileSplit.getLength();

		try{

			Path path = fileSplit.getPath();

			FileSystem fs = path.getFileSystem(conf);

			this.inputStream = fs.open(path);

			inputStream.seek(this.start);

			this.pos = this.start;

		}	catch(IOException e)	{

			e.printStackTrace();

		}

	}

	/*生成下一个键值对

	 **/

	public boolean nextKeyValue() throws IOException, InterruptedException {

		// TODO Auto-generated method stub

		if(this.pos < this.end) {

			key.set(this.pos);

			value.set(Integer.reverseBytes(inputStream.readInt()));

			this.pos = inputStream.getPos();

			return true;

		} else {

			processed = true;

			return false;

		}

	}

}

下面是主文件IntCount.java的代码（公共类名为IntCount，因此文件名须与之一致）：

package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.examples.BinInputFormat;

public class IntCount {

	public static class TokenizerMapper extends Mapper<LongWritable, IntWritable, Text, IntWritable>{

		private final static IntWritable one = new IntWritable(1);

		private Text intNum = new Text();                             

		public void map(LongWritable key, IntWritable value, Context context

				) throws IOException, InterruptedException {

			intNum.set(Integer.toString(value.get()));

			context.write(intNum, one);

		}

	}

	public static class IntSumReducer

	extends Reducer<Text,IntWritable,Text,IntWritable> {

		private IntWritable result = new IntWritable();              

		public void reduce(Text key, Iterable<IntWritable> values,

				Context context

				) throws IOException, InterruptedException {

			int sum = 0;

			for (IntWritable val : values) {

				sum += val.get();                                         

			}

			result.set(sum);

			context.write(key, result);

		}

	}

	public static void main(String[] args) throws Exception {

		System.out.println("testing1");

		Configuration conf = new Configuration();

		String[] newArgs = new String[]{"hdfs://localhost:9000/read","hdfs://localhost:9000/data/wc_output21"};

		String[] otherArgs = new GenericOptionsParser(conf, newArgs).getRemainingArgs();

		if (otherArgs.length != 2) {

			System.err.println("Usage: wordcount <in> <out>");

			System.exit(2);

		}

		Job job = new Job(conf, "IntCount");

		job.setJarByClass(IntCount.class);

		job.setMapperClass(TokenizerMapper.class);

		job.setCombinerClass(IntSumReducer.class);

		job.setReducerClass(IntSumReducer.class);

		//设置自己定义的输入类

		job.setInputFormatClass(BinInputFormat.class);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}

}

接着我们用一段C语言程序生成二进制格式存储的文件，C语言代码如下：

#include<stdio.h>

/*
 * Writes the integers 0..9 ten times in a row (100 values in total) to the
 * file "tmpfile" in the current directory. Each value is written as the raw
 * in-memory representation of an int: sizeof(int) bytes in the machine's
 * native byte order.
 *
 * Returns 0 on success, 1 if the file cannot be opened, written or closed.
 */
int main(){
	FILE * fp = fopen("tmpfile","wb");
	int i,j;

	/* fopen returns NULL on failure; writing to it would be undefined. */
	if (fp == NULL) {
		perror("tmpfile");
		return 1;
	}

	for(i=0;i<10;i++) {
		for(j=0;j<10;j++) {
			if (fwrite(&j,sizeof(int),1,fp) != 1) {
				perror("fwrite");
				fclose(fp);
				return 1;
			}
		}
	}

	/* Buffered data is flushed on close, so a failure here means lost output. */
	if (fclose(fp) != 0) {
		perror("fclose");
		return 1;
	}

	return 0;
}

将生成的文件复制到HDFS的/read/目录下，接着启动IntCount这个MapReduce程序，查看执行结果：由于上面的C程序把0到9这十个整数各写入了10次，输出中每个整数对应的计数都应为10。

在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数的更多相关文章

实现Square类，让其继承自Rectangle类，并在Square类增添新属性和方法，在2的基础上，在Square类中重写Rectangle类中的初始化和打印方法
实现Square类,让其继承自Rectangle类,并在Square类增添新属性和方法,在2的基础上,在Square类中重写Rectangle类中的初始化和打印方法 #import <Found ...
hadoop中典型Writable类详解
本文地址:http://www.cnblogs.com/archimedes/p/hadoop-writable.html,转载请注明源地址. Hadoop将很多Writable类归入org.apac ...
Hadoop Mapreduce 中的FileInputFormat类的文件切分算法和host选择算法
文件切分算法文件切分算法主要用于确定InputSplit的个数以及每个InputSplit对应的数据段. FileInputFormat以文件为单位切分成InputSplit.对于每个文件,由以下三 ...
hadoop中Text类与 java中String类的区别
hadoop 中的Text类与java中的String类感觉上用法是相似的,但两者在编码格式和访问方式上还是有些差别的,要说明这个问题,首先得了解几个概念: 字符集: 是一个系统支持的所有抽象字符的 ...
Hadoop中序列化与Writable接口
学习笔记,整理自<Hadoop权威指南第3版> 一.序列化序列化:序列化是将内存中的结构化数据转化为能在网络上传输或磁盘中进行永久保存的二进制流的过程:反序列化:序列化的逆 ...
java之线程（线程的创建方式、java中的Thread类、线程的同步、线程的生命周期、线程之间的通信）
CPU:10核主频100MHz 1核主频 3GHz 那么哪一个CPU比较好呢? CPU核不是越多越好吗?并不一定.主频用于衡量GPU处理速度的快慢,举个例子10头牛运送货物快还是1架飞机运 ...
Hadoop中Writable类之四
1.定制Writable类型 Hadoop中有一套Writable实现,例如:IntWritable.Text等,但是,有时候可能并不能满足自己的需求,这个时候,就需要自己定制Writable类型. ...
WordCount作业提交到FileInputFormat类中split切分算法和host选择算法过程源码分析
参考 FileInputFormat类中split切分算法和host选择算法介绍以及 Hadoop2.6.0的FileInputFormat的任务切分原理分析(即如何控制FileInputForm ...
【转载】 C++多继承中重写不同基类中相同原型的虚函数
本篇随笔为转载,原文地址:C++多继承中重写不同基类中相同原型的虚函数. 在C++多继承体系当中,在派生类中可以重写不同基类中的虚函数.下面就是一个例子: class CBaseA { public: ...

随机推荐

耗时任务DefaultEventExecutorGroup 定时任务
一. 耗时任务 static final EventExecutorGroup group = new DefaultEventExecutorGroup(16); // Tell the pipel ...
type Iterator does not take parameters
在ubuntu编译java程序时报错:type Iterator does not take parameters 源码如下: package object; import java.util.*; ...
如何在VS2013创建WebService并在IIS中发布
第一步:打开VS2013,选择文件->新建->项目. 第二步:选择[ASP.net 空web应用程序],将其命名为自己想的工程名称. 第三步:然后右键点击工程,添加->web服务.然 ...
GreenPlum学习笔记：新手入门命令
1.命令行登录数据库 psql -h 192.168.111.111 -U username -d dbname 其中,username为数据库用户名,dbname为数据库名,执行后提示输入密码.(可 ...
Struts 2 Tutorial
Apache Struts 2 is an elegant, extensible framework for creating enterprise-ready Java web applicati ...
Chrome-Adobe Flash 无法正常使用
https://support.google.com/chrome/answer/6258784 该网站因是是google.com,被强了,所以一般打不开. 故将google官方说明记录以下: 如果 ...
php读取xml中cdata部分方法
本例使用php的simplexml:XML(eventtrackdata.xml'): <eventdata> <event> <date>2012.05.11&l ...
【BZOJ】1566: [NOI2009]管道取珠
题解假如我们非常熟练的看出来,平方和转有序对统计的套路的话,应该就不难了我们只需要统计(wayA,wayB)生成的序列一样的有序对个数就行可以用一个\(n^3\)的dp解决 \(dp[i][j] ...
8-8 Ddfense Line uva1471 优先级队列
题意:给你一串长度为n的序列你的任务是删除一个连续的子序列使得剩下的序列中有一个长度最大的连续递增子序列例如将 5 3 4 9 2 8 6 7 1 中的9 2 8 删除得到5 3 ...
Wannafly挑战赛9 B - 数一数
链接:https://www.nowcoder.com/acm/contest/71/B来源:牛客网题目描述设s,t为两个字符串,定义f(s,t) = t的子串中,与s相等的串的个数.如f(&qu ...

在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数

在Hadoop中重写FileInputFormat类以处理二进制格式存储的整数的更多相关文章

随机推荐

热门专题