【Hadoop】How to implement an inverted index algorithm with Hadoop MapReduce?
1. Concept and Approach
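The index is built with two chained MapReduce jobs. Job one (InverseIndexOne) splits each line into words, reads the source file name from the input split, and for every word emits the key "word-fileName" with the value 1; its reducer sums the 1s, yielding the frequency of each word in each file. Job two (InverseIndexTwo) parses job one's "word-fileName<TAB>count" output lines, re-keys the records by the word alone, and its reducer concatenates the "fileName-count" entries into a comma-separated posting list.

For illustration (the file names and contents here are made up, not from the original post), suppose a.txt contains "hello world hello" and b.txt contains "hello hadoop". Job one would output (tab-separated, the default TextOutputFormat key/value separator):

hadoop-b.txt    1
hello-a.txt     2
hello-b.txt     1
world-a.txt     1

and job two would turn that into the inverted index:

hadoop    b.txt-1
hello     a.txt-2,b.txt-1
world     a.txt-1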
2. Code Examples
InverseIndexOne
package com.ares.hadoop.mr.inverseindex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class InverseIndexOne extends Configured implements Tool {
    private static final Logger LOGGER = Logger.getLogger(InverseIndexOne.class);

    enum Counter {
        LINESKIP
    }

    public static class InverseIndexOneMapper
            extends Mapper<LongWritable, Text, Text, LongWritable> {
        private String line;
        private final static char separatorA = ' ';
        private final static char separatorB = '-';
        private String fileName;

        private Text text = new Text();
        private final static LongWritable ONE = new LongWritable(1L);

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            try {
                line = value.toString();
                String[] fields = StringUtils.split(line, separatorA);

                // Take the source file name from the input split so that the
                // map output key becomes "word-fileName".
                FileSplit fileSplit = (FileSplit) context.getInputSplit();
                fileName = fileSplit.getPath().getName();

                for (int i = 0; i < fields.length; i++) {
                    text.set(fields[i] + separatorB + fileName);
                    context.write(text, ONE);
                }
            } catch (Exception e) {
                LOGGER.error(e);
                System.out.println(e);
                // Skip malformed lines, but count how many were skipped.
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }
        }
    }

    public static class InverseIndexOneReducer
            extends Reducer<Text, LongWritable, Text, LongWritable> {
        private LongWritable result = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Sum the 1s emitted for each "word-fileName" key: the result is
            // the frequency of that word in that file.
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            result.set(count);
            context.write(key, result);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String errMsg = "InverseIndexOne: TEST STARTED...";
        LOGGER.debug(errMsg);
        System.out.println(errMsg);

        Configuration conf = new Configuration();
        // For Eclipse JVM debugging:
        //conf.set("mapreduce.job.jar", "flowsum.jar");

        Job job = Job.getInstance(conf);

        // JOB NAME
        job.setJobName("InverseIndexOne");

        // JOB MAPPER & REDUCER
        job.setJarByClass(InverseIndexOne.class);
        job.setMapperClass(InverseIndexOneMapper.class);
        job.setReducerClass(InverseIndexOneReducer.class);

        // REDUCE OUTPUT TYPES
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // MAP OUTPUT TYPES
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // JOB INPUT & OUTPUT PATH
        FileInputFormat.setInputPaths(job, args[0]);
        Path output = new Path(args[1]);
        // Optionally remove a pre-existing output directory:
        // FileSystem fs = FileSystem.get(conf);
        // if (fs.exists(output)) {
        //     fs.delete(output, true);
        // }
        FileOutputFormat.setOutputPath(job, output);

        if (job.waitForCompletion(true)) {
            errMsg = "InverseIndexOne: TEST SUCCEEDED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return 0;
        } else {
            errMsg = "InverseIndexOne: TEST FAILED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return -1;
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            String errMsg = "InverseIndexOne: ARGUMENTS ERROR";
            LOGGER.error(errMsg);
            System.out.println(errMsg);
            System.exit(-1);
        }
        int result = ToolRunner.run(new Configuration(), new InverseIndexOne(), args);
        System.exit(result);
    }
}
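A quick sketch of how job one might be launched (the jar name and HDFS paths below are placeholders, not from the original post):

hadoop jar inverseindex.jar com.ares.hadoop.mr.inverseindex.InverseIndexOne /input/docs /tmp/inverseindex/step1

Because the reducer just sums longs, the same class could also be registered as a combiner via job.setCombinerClass(InverseIndexOneReducer.class) to cut shuffle traffic; the code above does not do this.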
InverseIndexTwo
package com.ares.hadoop.mr.inverseindex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class InverseIndexTwo extends Configured implements Tool {
    private static final Logger LOGGER = Logger.getLogger(InverseIndexTwo.class);

    enum Counter {
        LINESKIP
    }

    public static class InverseIndexTwoMapper
            extends Mapper<LongWritable, Text, Text, Text> {
        private String line;
        private final static char separatorA = '\t';
        private final static char separatorB = '-';

        private Text textKey = new Text();
        private Text textValue = new Text();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            try {
                // Each input line is job one's output: "word-fileName\tcount".
                line = value.toString();
                String[] fields = StringUtils.split(line, separatorA);
                String[] wordAndFileName = StringUtils.split(fields[0], separatorB);
                long count = Long.parseLong(fields[1]);
                String word = wordAndFileName[0];
                String fileName = wordAndFileName[1];

                // Re-key by the word alone; the value carries "fileName-count".
                textKey.set(word);
                textValue.set(fileName + separatorB + count);
                context.write(textKey, textValue);
            } catch (Exception e) {
                LOGGER.error(e);
                System.out.println(e);
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }
        }
    }

    public static class InverseIndexTwoReducer
            extends Reducer<Text, Text, Text, Text> {
        private Text textValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Join all "fileName-count" entries for a word into one
            // comma-separated posting list. Starting with an empty separator
            // avoids having to detect the last element of the iterator
            // (see the reference at the end of this post).
            StringBuilder index = new StringBuilder();
            String separator = "";
            for (Text text : values) {
                index.append(separator).append(text.toString());
                separator = ",";
            }
            textValue.set(index.toString());
            context.write(key, textValue);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        String errMsg = "InverseIndexTwo: TEST STARTED...";
        LOGGER.debug(errMsg);
        System.out.println(errMsg);

        Configuration conf = new Configuration();
        // For Eclipse JVM debugging:
        //conf.set("mapreduce.job.jar", "flowsum.jar");

        Job job = Job.getInstance(conf);

        // JOB NAME
        job.setJobName("InverseIndexTwo");

        // JOB MAPPER & REDUCER
        job.setJarByClass(InverseIndexTwo.class);
        job.setMapperClass(InverseIndexTwoMapper.class);
        job.setReducerClass(InverseIndexTwoReducer.class);

        // REDUCE OUTPUT TYPES
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // MAP OUTPUT TYPES
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // JOB INPUT & OUTPUT PATH
        FileInputFormat.setInputPaths(job, args[0]);
        Path output = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, output);

        if (job.waitForCompletion(true)) {
            errMsg = "InverseIndexTwo: TEST SUCCEEDED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return 0;
        } else {
            errMsg = "InverseIndexTwo: TEST FAILED...";
            LOGGER.debug(errMsg);
            System.out.println(errMsg);
            return -1;
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            String errMsg = "InverseIndexTwo: ARGUMENTS ERROR";
            LOGGER.error(errMsg);
            System.out.println(errMsg);
            System.exit(-1);
        }
        int result = ToolRunner.run(new Configuration(), new InverseIndexTwo(), args);
        System.exit(result);
    }
}
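The two jobs have to run in sequence, with job one's output directory fed to job two as input. Below is a minimal driver sketch that chains them; the class name InverseIndexDriver and the three-argument convention are assumptions for illustration, not part of the original code.

package com.ares.hadoop.mr.inverseindex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class InverseIndexDriver {
    public static void main(String[] args) throws Exception {
        // args[0]: raw input, args[1]: intermediate directory, args[2]: final index
        int rc = ToolRunner.run(new Configuration(), new InverseIndexOne(),
                new String[] { args[0], args[1] });
        if (rc == 0) {
            // Job two consumes job one's output directory.
            rc = ToolRunner.run(new Configuration(), new InverseIndexTwo(),
                    new String[] { args[1], args[2] });
        }
        System.exit(rc);
    }
}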
References:
How to check if processing the last item in an Iterator? — http://stackoverflow.com/questions/9633991/how-to-check-if-processing-the-last-item-in-an-iterator