利用CombineFileInputFormat把netflix data set 导入到Hbase里
版权声明:本文为博主原创文章。未经博主同意不得转载。 https://blog.csdn.net/xiewenbo/article/details/25637931
package com.mr.test;
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, BytesWritable> {
@Override
public RecordReader<LongWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
CombineFileSplit combineFileSplit = (CombineFileSplit) split;
CombineFileRecordReader<LongWritable, BytesWritable> recordReader = new CombineFileRecordReader<LongWritable, BytesWritable>(combineFileSplit, context, CombineSmallfileRecordReader.class);
try {
recordReader.initialize(combineFileSplit, context);
} catch (InterruptedException e) {
new RuntimeException("Error to initialize CombineSmallfileRecordReader.");
}
return recordReader;
}
}
package com.mr.test;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
public class CombineSmallfileRecordReader extends RecordReader<LongWritable, BytesWritable> {
private CombineFileSplit combineFileSplit;
private LineRecordReader lineRecordReader = new LineRecordReader();
private Path[] paths;
private int totalLength;
private int currentIndex;
private float currentProgress = 0;
private LongWritable currentKey;
private BytesWritable currentValue = new BytesWritable();
public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
super();
this.combineFileSplit = combineFileSplit;
this.currentIndex = index; // 当前要处理的小文件Block在CombineFileSplit中的索引
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
this.combineFileSplit = (CombineFileSplit) split;
// 处理CombineFileSplit中的一个小文件Block,由于使用LineRecordReader,须要构造一个FileSplit对象,然后才可以读取数据
FileSplit fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
lineRecordReader.initialize(fileSplit, context);
this.paths = combineFileSplit.getPaths();
totalLength = paths.length;
context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
}
@Override
public LongWritable getCurrentKey() throws IOException, InterruptedException {
currentKey = lineRecordReader.getCurrentKey();
return currentKey;
}
<strong><span style="color:#ff0000;"> @Override
public BytesWritable getCurrentValue() throws IOException, InterruptedException {
System.out.println("lineRecordReader:"+lineRecordReader.getCurrentValue().toString());
byte[] content = lineRecordReader.getCurrentValue().toString().getBytes();
System.out.println("content:"+new String(content));
currentValue = new BytesWritable();
currentValue.set(content, 0, content.length);
System.out.println("currentValue:"+new String(currentValue.getBytes()));
return currentValue;
}</span></strong>
public static void main(String args[]){
BytesWritable cv = new BytesWritable();
String str1 = "1234567";
String str2 = "123450";
cv.set(str1.getBytes(), 0, str1.getBytes().length);
System.out.println(new String(cv.getBytes()));
cv.setCapacity(0);
cv.set(str2.getBytes(), 0, str2.getBytes().length);
System.out.println(new String(cv.getBytes()));
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (currentIndex >= 0 && currentIndex < totalLength) {
return lineRecordReader.nextKeyValue();
} else {
return false;
}
}
@Override
public float getProgress() throws IOException {
if (currentIndex >= 0 && currentIndex < totalLength) {
currentProgress = (float) currentIndex / totalLength;
return currentProgress;
}
return currentProgress;
}
@Override
public void close() throws IOException {
lineRecordReader.close();
}
}
package com.mr.test;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class BulkImportData {
public static class TokenizerMapper extends
Mapper<Object, BytesWritable, Text, Text> {
public Text _key = new Text();
public Text _value = new Text();
public void map(Object key, BytesWritable value, Context context)
throws IOException, InterruptedException {
_value.set(value.getBytes());
String tmp = _value.toString().trim();
System.out.println(tmp);
tmp = tmp.replace("\\x00", "");
_value.set(tmp);
String filename = context.getConfiguration().get("map.input.file.name");
String[] splits = _value.toString().split(",");
if(splits.length==3){
filename = filename.replace("mv_", "");
filename = filename.replace(".txt", "");
_key.set(splits[0]+"_"+filename);
context.write(_key, _value);
}
}
}
public static class IntSumReducer extends
TableReducer<Text, Text, ImmutableBytesWritable> {
public void reduce(Text key, Iterable<Text> values,
Context context) throws IOException, InterruptedException {
Iterator<Text> itr = values.iterator();
while(itr.hasNext()){
Text t = itr.next();
String[] strs = t.toString().split(",");
if(strs.length!=3)continue;
Put put = new Put(key.getBytes());
put.add(Bytes.toBytes("content"), Bytes.toBytes("score"), Bytes.toBytes(strs[1].trim()));
put.add(Bytes.toBytes("content"), Bytes.toBytes("date"), Bytes.toBytes(strs[2].trim()));
context.write(new ImmutableBytesWritable(key.getBytes()), put);
}
}
}
public static void main(String[] args) throws Exception {
String tablename = "ntf_data";
Configuration conf = HBaseConfiguration.create();
HBaseAdmin admin = new HBaseAdmin(conf);
if (admin.tableExists(tablename)) {
admin.disableTable(tablename);
admin.deleteTable(tablename);
}
HTableDescriptor htd = new HTableDescriptor(tablename);
HColumnDescriptor hcd = new HColumnDescriptor("content");
htd.addFamily(hcd);
admin.createTable(htd);
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 1) {
System.err
.println("Usage: wordcount <in> <out>" + otherArgs.length);
System.exit(2);
}
Job job = new Job(conf, "h");
job.setMapperClass(TokenizerMapper.class);
job.setJarByClass(BulkImportData.class);
job.setInputFormatClass(CombineSmallfileInputFormat.class);
job.setNumReduceTasks(5);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
TableMapReduceUtil.initTableReducerJob(tablename, IntSumReducer.class,
job);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ?
0 : 1);
}
}
利用CombineFileInputFormat把netflix data set 导入到Hbase里的更多相关文章
- 利用HaoheDI从数据库抽取数据导入到hbase中
下载apache-phoenix-4.14.0-HBase-1.4-bin.tar.gz 将其中的 phoenix-4.14.0-HBase-1.4-client.jar phoenix-core-4 ...
- 利用mapreduce将数据从hdfs导入到hbase遇到的问题
现象: 15/08/12 10:19:30 INFO mapreduce.Job: Job job_1439396788627_0005 failed with state FAILED due to ...
- 利用TOAD实现把EXCEL数据导入oracle数据库
利用TOAD实现把EXCEL数据导入oracle数据库 工具: Toad11.7z(百度搜索,直接下载) 1.将Excel文件中某些字段导入到Oracle数据库的对应表 连接想要导入的数据库 ,然 ...
- shell编程系列24--shell操作数据库实战之利用shell脚本将文本数据导入到mysql中
shell编程系列24--shell操作数据库实战之利用shell脚本将文本数据导入到mysql中 利用shell脚本将文本数据导入到mysql中 需求1:处理文本中的数据,将文本中的数据插入到mys ...
- 利用反射实现通用的excel导入导出
如果一个项目中存在多种信息的导入导出,为了简化代码,就需要用反射实现通用的excel导入导出 实例代码如下: 1.创建一个 Book类,并编写set和get方法 package com.bean; p ...
- 利用CocoaPods,在项目中导入AFNetworking类库
场景1:利用CocoaPods,在项目中导入AFNetworking类库 AFNetworking类库在GitHub地址是:https://github.com/AFNetworking/AFNetw ...
- 利用PHPExcel 实现excel数据的导入导出(源码实现)
利用PHPExcel 实现excel数据的导入导出(源码实现) 在开发过程中,经常会遇到导入导出的需求,利用phpexcel类实现起来也是比较容易的,下面,我们一步一步实现 提前将phpexcel类下 ...
- mysql中使用load data infile导入数据的用法
有时需要将大量数据批量写入数据库,直接使用程序语言和Sql写入往往很耗时间,其中有一种方案就是使用mysql load data infile导入文件的形式导入数据,这样可大大缩短数据导入时间. LO ...
- Mysql load data infile 导入数据出现:Data truncated for column
[1]Mysql load data infile 导入数据出现:Data truncated for column .... 可能原因分析: (1)数据库表对应字段类型长度不够或修改为其他数据类型( ...
随机推荐
- CSS之少用继承,多用组合
下面是一段普通的代码: css: .box{ border:1px solid #ccc; font-size:12px; background:#f1f1f1; padding:10px; } ht ...
- VC++ 6.0开发套件(自己收藏!)
安装镜像ISO VC++ 6.0_SP6_Win7企业版(中英文集成).iso MSDN安装镜像ISO MSDN_Oct_200 ...
- NHibernate初学六之关联多对多关系
1:创建三张表T_Course.T_Student.T_Middle:其中一个学生可以对应多个课程,一个课程也可以对应多个学生,用T_Middle存放它们的关系内容: CREATE TABLE [db ...
- C#资源释放及Dispose、Close和析构方法
https://www.cnblogs.com/luminji/archive/2011/01/05/1926468.html C#资源释放及Dispose.Close和析构方法 备注:此文的部分 ...
- UE4射线的碰撞与绘制
http://blog.csdn.net/qq992817263/article/details/51800657 //起点 终点 FHitResult RayGetHitResult(FVector ...
- LVS+keeplived+nginx+tomcat高可用、高性能jsp集群
原创作品,允许转载,转载时请务必以超链接形式标明文章 原始出处 .作者信息和本声明.否则将追究法律责任.http://kerry.blog.51cto.com/172631/557749 #!/bin ...
- C++的virtual详解
类的多态特性是支持面向对象的语言最主要的特性,有过非面向对象语言开发经历的人,通常对这一章节的内容会觉得不习惯,因为很多人错误的认为,支持类的封装的语言就是支持面向对象的,其实不然,Visual BA ...
- NSFileManager和NSFileHandle(附:获取文件大小 )
本文转载至:http://www.cnblogs.com/pengyingh/articles/2350345.html 天牛 感谢原创作者的硕果 //file 文件操作 NSFileManager ...
- iPhone较为基础的代码片段
Iphone代码片段导航 1.给UITableViewController添加ToolBar. self.navigationController.toolbarHidden = NO; //默认是隐 ...
- php查询操作实现投票功能
这篇文章主要为大家详细介绍了php查询操作实现投票功能的具体代码,具有一定的参考价值,感兴趣的小伙伴们可以参考一下 本文实例为大家分享了php查询操作实现投票功能的代码,供大家参考,具体内容如下 ...