1. Inverted index implementation

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex {

    public static class InvertedIndexMap extends Mapper<Object, Text, Text, Text> {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the FileSplit this <key, value> pair belongs to
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreElements()) {
                // The key is composed of (word:URI)
                keyInfo.set(stk.nextToken() + ":" + split.getPath().toString());
                // The value is the term-frequency contribution
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
        private Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Sum the term frequency for this (word:URI) key
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Reset the value to (URI:frequency)
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // Reset the key to the word alone
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }

    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build the document (posting) list for this word
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndexMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
        FileOutputFormat.setOutputPath(job, new Path("./out/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
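
To make the three stages easier to follow, here is a rough trace of the intermediate records for the word "hadoop" across two hypothetical input files doc1.txt and doc2.txt (the file names are illustrative only, not part of the job above):

map output      : <"hadoop:hdfs://.../doc1.txt", "1">, <"hadoop:hdfs://.../doc1.txt", "1">, <"hadoop:hdfs://.../doc2.txt", "1">
combiner output : <"hadoop", "hdfs://.../doc1.txt:2">, <"hadoop", "hdfs://.../doc2.txt:1">
reduce output   : <"hadoop", "hdfs://.../doc1.txt:2;hdfs://.../doc2.txt:1;">

Note that the combiner rewrites the key from (word:URI) to (word), while partitioning is computed on the original map output key. This is only safe because the job keeps the default single reduce task; with several reducers the posting list of one word could be split across output files.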

2. Word count

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    // Map/reduce key and value types must be Hadoop Writables; plain String is not
    // a valid key or value type, so Text is used throughout.
    public static class WordMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String w : words) {
                word.set(w);
                context.write(word, one);
            }
        }
    }

    public static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private static IntWritable ans = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable count : values) {
                sum += count.get();
            }
            ans.set(sum);
            context.write(key, ans);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordMapper.class);
        job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
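
The mapper above splits only on single spaces, so case and punctuation differences ("Hadoop" vs. "hadoop,") are counted as different words. A minimal variant I would sketch (not part of the original post; it additionally needs java.util.StringTokenizer on the import list) normalizes each token before emitting it:

    // Hypothetical drop-in replacement for WordMapper: tokenizes on whitespace and
    // normalizes tokens so that "Hadoop" and "hadoop," count as the same word.
    public static class NormalizingWordMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                // Lower-case and strip everything except ASCII letters and digits; skip empty tokens.
                String token = itr.nextToken().toLowerCase().replaceAll("[^a-z0-9]", "");
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, one);
                }
            }
        }
    }

Swapping it in only takes job.setMapperClass(NormalizingWordMapper.class); the combiner and reducer stay unchanged.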

3. Finding the most frequently accessed table

import java.io.IOException;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Analyze a large log file with Hadoop. Each line records:
 * TableName, Time, User, TimeSpan (time cost).
 * Task: write a MapReduce program that finds which table is accessed most
 * frequently during the peak period (e.g. 9-10 o'clock), which user accessed
 * that table the most during that period, and that user's total time cost on it.
 * @author drguo
 * t003 6:00 u002 180
 * t003 7:00 u002 180
 * t003 7:08 u002 180
 * t003 7:25 u002 180
 * t002 8:00 u002 180
 * t001 8:00 u001 240
 * t001 9:00 u002 300
 * t001 9:11 u001 240
 * t003 9:26 u001 180
 * t001 9:39 u001 300
 *
 * Step 1: find the table with the most accesses between 9 and 10.
 */
//club.drguo.xx.mapreduce.tablecount.TableCount
public class TableCount {

    public static class TableCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strings = StringUtils.split(line, " ");
            String tabName = strings[0];
            String time = strings[1];
            String[] times = time.split(":");
            int hour = Integer.parseInt(times[0]);
            k.set(tabName);
            // Only count accesses that fall in the 9-10 o'clock window
            if (hour == 9) {
                context.write(k, new LongWritable(1));
                System.out.println("-----------------------------------------------" + k);
            }
        }
    }

    public static class TableCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private TreeMap<Text, Long> map = new TreeMap<Text, Long>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Copy the key: the framework reuses the same Text object across calls,
            // so "Text tabName = key;" would leave every map entry pointing at the last key.
            Text tabName = new Text(key.toString());
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            System.out.println(tabName + "--------------------------" + count);
            map.put(tabName, count);
        }

        @Override
        protected void cleanup(Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Emit only the table with the highest access count
            Text tableName = null;
            long maxCount = 0L;
            for (Text key : map.keySet()) {
                System.out.println("key=" + key + "-----------------value=" + map.get(key));
                if (map.get(key) > maxCount) {
                    maxCount = map.get(key);
                    tableName = key;
                }
            }
            if (tableName != null) {
                context.write(tableName, new LongWritable(maxCount));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "tablejob");
        job.setJarByClass(TableCount.class);
        job.setMapperClass(TableCountMapper.class);
        job.setReducerClass(TableCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/log");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/tablecount"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
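
The job above only covers step 1 of the problem statement. One way to handle step 2 — finding the heaviest user of the winning table and that user's total TimeSpan — is a second job that receives the table name through the Configuration. The sketch below is my own assumption of how that could look (the class name TopUserForTable and the target.table property are made up for illustration), not code from the original post:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class TopUserForTable {

    public static class TopUserMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private String targetTable;

        @Override
        protected void setup(Context context) {
            // Set before submission, e.g. configuration.set("target.table", "t001")
            targetTable = context.getConfiguration().get("target.table");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = StringUtils.split(value.toString(), " ");
            String tabName = fields[0];
            int hour = Integer.parseInt(fields[1].split(":")[0]);
            String user = fields[2];
            long timeSpan = Long.parseLong(fields[3]);
            // Keep only the 9-10 o'clock accesses to the table found by TableCount
            if (hour == 9 && tabName.equals(targetTable)) {
                context.write(new Text(user), new LongWritable(timeSpan));
            }
        }
    }

    public static class TopUserReducer extends Reducer<Text, LongWritable, Text, Text> {
        // user -> {access count, total time span}
        private final Map<String, long[]> stats = new HashMap<String, long[]>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0;
            long totalTime = 0;
            for (LongWritable v : values) {
                count++;
                totalTime += v.get();
            }
            stats.put(key.toString(), new long[] { count, totalTime });
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the single user with the highest access count and that user's total time cost
            String topUser = null;
            long maxCount = 0;
            long topTime = 0;
            for (Map.Entry<String, long[]> e : stats.entrySet()) {
                if (e.getValue()[0] > maxCount) {
                    maxCount = e.getValue()[0];
                    topUser = e.getKey();
                    topTime = e.getValue()[1];
                }
            }
            if (topUser != null) {
                context.write(new Text(topUser), new Text("count=" + maxCount + ", totalTimeSpan=" + topTime));
            }
        }
    }
}

Its driver would mirror TableCount's main method, except that it also calls configuration.set("target.table", "t001"), sets the map output value class to LongWritable, and sets the final output value class to Text. With the sample log above it would emit u001 with count=2 and totalTimeSpan=540.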
