利用 CombineFileInputFormat 把 Netflix 数据集导入到 HBase 里

package com.mr.test;

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;

import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

/**
 * {@link CombineFileInputFormat} whose combined splits are read through
 * {@link CombineSmallfileRecordReader}, one instance per file in the split.
 */
public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, BytesWritable> {

	/**
	 * Builds a {@link CombineFileRecordReader} over the given combined split and initializes it.
	 *
	 * @param split   the input split; must be a {@link CombineFileSplit}
	 * @param context the task attempt context passed on to the reader
	 * @return the initialized combined record reader
	 * @throws IOException      if the reader cannot be created or initialized
	 * @throws RuntimeException if the thread is interrupted during initialization
	 */
	@Override
	public RecordReader<LongWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
		CombineFileSplit combineFileSplit = (CombineFileSplit) split;
		CombineFileRecordReader<LongWritable, BytesWritable> recordReader = new CombineFileRecordReader<LongWritable, BytesWritable>(combineFileSplit, context, CombineSmallfileRecordReader.class);
		try {
			// NOTE(review): the MapReduce framework normally calls initialize() on the
			// returned reader as well - confirm whether this explicit call is still needed.
			recordReader.initialize(combineFileSplit, context);
		} catch (InterruptedException e) {
			// Restore the interrupt flag and actually propagate the failure; the
			// exception used to be constructed but never thrown.
			Thread.currentThread().interrupt();
			throw new RuntimeException("Error to initialize CombineSmallfileRecordReader.", e);
		}
		return recordReader;
	}
}

package com.mr.test;

import java.io.IOException;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.BytesWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class CombineSmallfileRecordReader extends RecordReader<LongWritable, BytesWritable> {

	private CombineFileSplit combineFileSplit;

	private LineRecordReader lineRecordReader = new LineRecordReader();

	private Path[] paths;

	private int totalLength;

	private int currentIndex;

	private float currentProgress = 0;

	private LongWritable currentKey;

	private BytesWritable currentValue = new BytesWritable();

	public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {

		super();

		this.combineFileSplit = combineFileSplit;

		this.currentIndex = index; // 当前要处理的小文件Block在CombineFileSplit中的索引

	}

	@Override

	public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {

		this.combineFileSplit = (CombineFileSplit) split;

		// 处理CombineFileSplit中的一个小文件Block，由于使用LineRecordReader，须要构造一个FileSplit对象，然后才可以读取数据

		FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());

		lineRecordReader.initialize(fileSplit, context);

		this.paths = combineFileSplit.getPaths();

		totalLength = paths.length;

		context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());

	}

	@Override

	public LongWritable getCurrentKey() throws IOException, InterruptedException {

		currentKey = lineRecordReader.getCurrentKey();

		return currentKey;

	}

<strong><span style="color:#ff0000;">	@Override

	public BytesWritable getCurrentValue() throws IOException, InterruptedException {

		System.out.println("lineRecordReader:"+lineRecordReader.getCurrentValue().toString());

		byte[] content = lineRecordReader.getCurrentValue().toString().getBytes();

		System.out.println("content:"+new String(content));

		currentValue = new BytesWritable();

		currentValue.set(content, 0, content.length);

		System.out.println("currentValue:"+new String(currentValue.getBytes()));

		return currentValue;

	}</span></strong>

	public static void main(String args[]){

		BytesWritable cv = new BytesWritable();

		String str1 = "1234567";

		String str2 = "123450";

		cv.set(str1.getBytes(), 0, str1.getBytes().length);

		System.out.println(new String(cv.getBytes()));

		cv.setCapacity(0);

		cv.set(str2.getBytes(), 0, str2.getBytes().length);

		System.out.println(new String(cv.getBytes()));

	}

	@Override

	public boolean nextKeyValue() throws IOException, InterruptedException {

		if (currentIndex >= 0 && currentIndex < totalLength) {

			return lineRecordReader.nextKeyValue();

		} else {

			return false;

		}

	}

	@Override

	public float getProgress() throws IOException {

		if (currentIndex >= 0 && currentIndex < totalLength) {

			currentProgress = (float) currentIndex / totalLength;

			return currentProgress;

		}

		return currentProgress;

	}

	@Override

	public void close() throws IOException {

		lineRecordReader.close();

	}

}

package com.mr.test;

import java.io.IOException;

import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.hbase.HBaseConfiguration;

import org.apache.hadoop.hbase.HColumnDescriptor;

import org.apache.hadoop.hbase.HTableDescriptor;

import org.apache.hadoop.hbase.client.HBaseAdmin;

import org.apache.hadoop.hbase.client.Put;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;

import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;

import org.apache.hadoop.hbase.mapreduce.TableReducer;

import org.apache.hadoop.hbase.util.Bytes;

import org.apache.hadoop.io.BytesWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

public class BulkImportData {

	public static class TokenizerMapper extends

			Mapper<Object, BytesWritable, Text, Text> {

		public Text _key = new Text();

		public Text _value = new Text();

		public void map(Object key, BytesWritable value, Context context)

				throws IOException, InterruptedException {

			_value.set(value.getBytes());

			String tmp = _value.toString().trim();

			System.out.println(tmp);

			tmp = tmp.replace("\\x00", "");

			_value.set(tmp);

			String filename = context.getConfiguration().get("map.input.file.name");

			String[] splits = _value.toString().split(",");

			if(splits.length==3){

				filename = filename.replace("mv_", "");

				filename = filename.replace(".txt", "");

				_key.set(splits[0]+"_"+filename);

				context.write(_key, _value);

			}

		}

	}

	public static class IntSumReducer extends

			TableReducer<Text, Text, ImmutableBytesWritable> {

		public void reduce(Text key, Iterable<Text> values,

				Context context) throws IOException, InterruptedException {

			Iterator<Text> itr = values.iterator();

			while(itr.hasNext()){

				Text t = itr.next();

				String[] strs = t.toString().split(",");

				if(strs.length!=3)continue;

				Put put = new Put(key.getBytes());

				put.add(Bytes.toBytes("content"), Bytes.toBytes("score"), Bytes.toBytes(strs[1].trim()));

				put.add(Bytes.toBytes("content"), Bytes.toBytes("date"), Bytes.toBytes(strs[2].trim()));

				context.write(new ImmutableBytesWritable(key.getBytes()), put);

			}

		}

	}

	public static void main(String[] args) throws Exception {

		String tablename = "ntf_data";

		Configuration conf = HBaseConfiguration.create();

		HBaseAdmin admin = new HBaseAdmin(conf);

		if (admin.tableExists(tablename)) {

			admin.disableTable(tablename);

			admin.deleteTable(tablename);

		}

		HTableDescriptor htd = new HTableDescriptor(tablename);

		HColumnDescriptor hcd = new HColumnDescriptor("content");

		htd.addFamily(hcd);

		admin.createTable(htd);

		String[] otherArgs = new GenericOptionsParser(conf, args)

				.getRemainingArgs();

		if (otherArgs.length != 1) {

			System.err

					.println("Usage: wordcount <in> <out>" + otherArgs.length);

			System.exit(2);

		}

		Job job = new Job(conf, "h");

		job.setMapperClass(TokenizerMapper.class);

		job.setJarByClass(BulkImportData.class);

		job.setInputFormatClass(CombineSmallfileInputFormat.class);

		job.setNumReduceTasks(5);

		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

		TableMapReduceUtil.initTableReducerJob(tablename, IntSumReducer.class,

				job);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(Text.class);

		System.exit(job.waitForCompletion(true) ?

0 : 1);

	}

}

利用CombineFileInputFormat把netflix data set 导入到Hbase里的更多相关文章

利用HaoheDI从数据库抽取数据导入到hbase中
下载apache-phoenix-4.14.0-HBase-1.4-bin.tar.gz 将其中的 phoenix-4.14.0-HBase-1.4-client.jar phoenix-core-4 ...
利用mapreduce将数据从hdfs导入到hbase遇到的问题
现象: 15/08/12 10:19:30 INFO mapreduce.Job: Job job_1439396788627_0005 failed with state FAILED due to ...
利用TOAD实现把EXCEL数据导入oracle数据库
利用TOAD实现把EXCEL数据导入oracle数据库工具: Toad11.7z(百度搜索,直接下载) 1.将Excel文件中某些字段导入到Oracle数据库的对应表连接想要导入的数据库 ,然 ...
shell编程系列24--shell操作数据库实战之利用shell脚本将文本数据导入到mysql中
shell编程系列24--shell操作数据库实战之利用shell脚本将文本数据导入到mysql中利用shell脚本将文本数据导入到mysql中需求1:处理文本中的数据,将文本中的数据插入到mys ...
利用反射实现通用的excel导入导出
如果一个项目中存在多种信息的导入导出,为了简化代码,就需要用反射实现通用的excel导入导出实例代码如下: 1.创建一个 Book类,并编写set和get方法 package com.bean; p ...
利用CocoaPods，在项目中导入AFNetworking类库
场景1:利用CocoaPods,在项目中导入AFNetworking类库 AFNetworking类库在GitHub地址是:https://github.com/AFNetworking/AFNetw ...
利用PHPExcel 实现excel数据的导入导出（源码实现）
利用PHPExcel 实现excel数据的导入导出(源码实现) 在开发过程中,经常会遇到导入导出的需求,利用phpexcel类实现起来也是比较容易的,下面,我们一步一步实现提前将phpexcel类下 ...
mysql中使用load data infile导入数据的用法
有时需要将大量数据批量写入数据库,直接使用程序语言和Sql写入往往很耗时间,其中有一种方案就是使用mysql load data infile导入文件的形式导入数据,这样可大大缩短数据导入时间. LO ...
Mysql load data infile 导入数据出现：Data truncated for column
[1]Mysql load data infile 导入数据出现:Data truncated for column .... 可能原因分析: (1)数据库表对应字段类型长度不够或修改为其他数据类型( ...

随机推荐

Eclipse 远程连接访问 Hadoop 集群日志信息没有输出的问题
Eclipse插件Run on Hadoop没有用到hadoop集群节点的问题参考来源 http://f.dataguru.cn/thread-250980-1-1.html http://f.dat ...
css常用代码含义
1.font:12px Arial, Helvetica, sans-serif: 使用了缩写,完整的代码应该是:font-size:12px; font-family:Tahoma:说明字体为12像 ...
javascript--枚举算法实现
<!doctype html> <html lang="en"> <head> <meta charset="UTF-8&quo ...
On iPad, UIImagePickerController must be presented via UIPopoverController
本文转载至:http://blog.csdn.net/k12104/article/details/8537695 On iPad, UIImagePickerController must be p ...
C++ 在继承中使用virtual
使用virtual:如果方法是通过引用类型或指针而不是对象调用的,它将确定使用哪一种方法.如果没有使用关键字virtual,程序将根据引用类型或指针类型选择方法:如果使用了virtual,程序将根据引用或 ...
Spring学习笔记--注入Bean属性
这里通过一个MoonlightPoet类来演示了注入Bean属性property的效果. package com.moonlit.myspring; import java.util.List; im ...
AJAX同步设置以及请求代码
全局设置ajax同步更正一点:这个的同步,针对的是ajax请求的返回,而不是ajax-success返回后所有进行处理后才进行下一步.所以,window.location.href转跳这个在执行的时 ...
【HTML】改变鼠标样式图片css
你需要一张图 .ico 的格式如果一开始你要解决的是怎么去用png 格式图片转成 ICO格式先做一张32*32的PNG格式图片然后打开http://www.easyicon.net/co ...
java基础---->多线程之Daemon（五）
在java线程中有两种线程,一种是用户线程,另一种是守护线程.守护线程是一种特殊的线程,当进程中不存在非守护线程了,则守护线程自动销毁.今天我们通过实例来学习一下java中关于守护线程的知识.我是个平 ...
开源的PaaS方案：在OpenStack上部署CloudFoundry （二）部署OpenStack
硬件要求安装OpenStack 1 安装CentOS 65系统并清空iptables防火墙规则 2 安装系统需要的工具包包括Openstack依赖的和CloudFoundry依赖的 3 安装EPEL ...

利用CombineFileInputFormat把netflix data set 导入到Hbase里

利用CombineFileInputFormat把netflix data set 导入到Hbase里的更多相关文章

随机推荐

热门专题