MapReduce的几个实现

1.倒排索引的实现

 import java.io.IOException;

 import java.util.StringTokenizer;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.mapreduce.Mapper;

 import org.apache.hadoop.mapreduce.Reducer;

 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.input.FileSplit;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 public class InvertedIndex {

     public static class InvertedIndexMap extends Mapper<Object,Text,Text,Text>{

         private Text valueInfo = new Text();

         private Text keyInfo = new Text();

         private FileSplit split;

         public void map(Object key, Text value,Context context)

                 throws IOException, InterruptedException {

             //获取<key value>对所属的FileSplit对象

             split = (FileSplit) context.getInputSplit();

             StringTokenizer stk = new StringTokenizer(value.toString());

             while (stk.hasMoreElements()) {

                 //key值由（单词：URI）组成

                 keyInfo.set(stk.nextToken()+":"+split.getPath().toString());

                 //词频

                 valueInfo.set("1");

                 context.write(keyInfo, valueInfo);

             }

         }

     } 

     public static class InvertedIndexCombiner extends Reducer<Text,Text,Text,Text>{

         Text info = new Text();

         public void reduce(Text key, Iterable<Text> values,Context contex)

                 throws IOException, InterruptedException {

             int sum = 0;

             for (Text value : values) {

                 sum += Integer.parseInt(value.toString());

             }

             int splitIndex = key.toString().indexOf(":");

             //重新设置value值由（URI+:词频组成）

             info.set(key.toString().substring(splitIndex+1) +":"+ sum);

             //重新设置key值为单词

             key.set(key.toString().substring(0,splitIndex));

             contex.write(key, info);

         }

     }

     public static class InvertedIndexReduce extends Reducer<Text,Text,Text,Text>{

         private Text result = new Text();

         public void reduce(Text key, Iterable<Text> values,Context contex)

                 throws IOException, InterruptedException {

             //生成文档列表

             String fileList = new String();

             for (Text value : values) {

                 fileList += value.toString()+";";

             }

             result.set(fileList);

             contex.write(key, result);

         }

     }

     public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

         Configuration conf = new Configuration();

         Job job = new Job(conf,"InvertedIndex");

         job.setJarByClass(InvertedIndex.class);

         job.setMapperClass(InvertedIndexMap.class);

         job.setMapOutputKeyClass(Text.class);

         job.setMapOutputValueClass(Text.class);

         job.setCombinerClass(InvertedIndexCombiner.class);

         job.setReducerClass(InvertedIndexReduce.class);

         job.setOutputKeyClass(Text.class);

         job.setOutputValueClass(Text.class);

         FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));

         FileOutputFormat.setOutputPath(job, new Path("./out/"));

         System.exit(job.waitForCompletion(true)?0:1);

     }

 }

2.单词计数（WordCount）

 import java.io.IOException;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.GenericOptionsParser;

 public class WordCount {

     public static class WordMapper extends Mapper<Object, String, String, IntWritable> {

         private static final IntWritable one = new IntWritable(1);

         public void map(Object key, String value, Context context) throws IOException, InterruptedException {

             String[] words = value.split(" ");

             for (String word : words) {

                 context.write(word, one);

             }

         }

     }

     public static class WordReducer extends Reducer<String, Iterable<IntWritable>, String, IntWritable> {

         private static IntWritable ans = new IntWritable();

         public void reduce(String key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException {

             int sum = 0;

             for (IntWritable count : value) {

                 sum += count.get();

             }

             ans.set(sum);

             context.write(key, ans);

         }

     }

     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

         Configuration conf = new Configuration();

         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

         if (otherArgs.length != 2) {

             System.err.println("Usage: wordCount <int> <count>");

             System.exit(2);

         }

         Job job = new Job(conf, "word count");

         job.setJarByClass(WordCount.class);

         job.setMapperClass(WordMapper.class);

         job.setCombinerClass(WordReducer.class);

         job.setReducerClass(WordReducer.class);

         job.setOutputKeyClass(String.class);

         job.setOutputValueClass(IntWritable.class);

         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

         System.exit(job.waitForCompletion(true)?0:1);

     }

 }

3.找出高峰时段（9-10点）访问量最多的表

 import org.apache.commons.lang.StringUtils;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.LongWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.mapreduce.Mapper;

 import org.apache.hadoop.mapreduce.Reducer;

 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  

 /**
  * Analyses a large access log with Hadoop. Every log line holds four
  * space-separated fields:
  * TableName, Time (hour:minute), User, TimeSpan (time cost).
  *
  * The overall exercise is to find, for a peak period (for example 9-10 o'clock),
  * the table that is accessed most often, the user who accesses that table most in
  * that period, and that user's total time cost on the table.
  *
  * This job is the first step only: it finds the table with the most accesses
  * during hour 9.
  *
  * Sample input:
  *
  *   t003 6:00 u002 180
  *   t003 7:00 u002 180
  *   t003 7:08 u002 180
  *   t003 7:25 u002 180
  *   t002 8:00 u002 180
  *   t001 8:00 u001 240
  *   t001 9:00 u002 300
  *   t001 9:11 u001 240
  *   t003 9:26 u001 180
  *   t001 9:39 u001 300
  *
  * @author drguo
  */
 //club.drguo.xx.mapreduce.tablecount.TableCount
 public class TableCount {

     /**
      * Emits {@code (tableName, 1)} for every log line whose time falls into the
      * peak hour. Lines that cannot be parsed are skipped and counted in the
      * {@code TableCount / MALFORMED_LINES} job counter.
      */
     public static class TableCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

         /** Hour of the day (as written in the log) that counts as the peak period. */
         private static final int PEAK_HOUR = 9;

         private static final LongWritable ONE = new LongWritable(1);

         private final Text k = new Text();

         /**
          * @param key     input key supplied by the input format (unused)
          * @param value   one log line
          * @param context sink for the {@code (tableName, 1)} pairs
          */
         @Override
         protected void map(LongWritable key, Text value, Context context)
                 throws java.io.IOException, InterruptedException {
             String[] fields = StringUtils.split(value.toString(), " ");
             // Blank or truncated line: the table name or the time is missing.
             if (fields == null || fields.length < 2) {
                 context.getCounter("TableCount", "MALFORMED_LINES").increment(1);
                 return;
             }

             String[] times = fields[1].split(":");
             int hour;
             try {
                 // times is empty when the time field consists of colons only.
                 hour = Integer.parseInt(times.length == 0 ? "" : times[0]);
             } catch (NumberFormatException e) {
                 context.getCounter("TableCount", "MALFORMED_LINES").increment(1);
                 return;
             }

             if (hour == PEAK_HOUR) {
                 k.set(fields[0]);
                 context.write(k, ONE);
             }
         }
     }

     /**
      * Sums the accesses per table and, once all keys have been seen, writes the
      * single table with the highest count. On a tie the table whose key arrives
      * first in the reducer's input order wins.
      *
      * The result is a global maximum only when the job runs exactly one reduce task.
      */
     public static class TableCountReducer extends Reducer<Text, LongWritable, Text, LongWritable>{

         /** Table with the highest count so far; null until a key has been reduced. */
         private Text busiestTable = null;

         private long busiestCount = 0L;

         /**
          * @param key     the table name
          * @param values  partial access counts for the table
          * @param context unused here; the result is written in cleanup
          */
         @Override
         protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                 throws java.io.IOException, InterruptedException {
             long count = 0;
             for (LongWritable value : values) {
                 count += value.get();
             }
             if (count > busiestCount) {
                 busiestCount = count;
                 // Keep a copy instead of the key reference itself; the original
                 // author's note warns against storing the key object directly.
                 busiestTable = new Text(key);
             }
         }

         /**
          * Writes the winning table, or nothing when no record matched the peak hour.
          *
          * @param context sink for the {@code (tableName, count)} pair
          */
         @Override
         protected void cleanup(Context context)
                 throws java.io.IOException, InterruptedException {
             if (busiestTable != null) {
                 context.write(busiestTable, new LongWritable(busiestCount));
             }
         }
     }

     /**
      * Configures and runs the job, reading from {@code hdfs://localhost:9000/log}
      * and writing to {@code hdfs://localhost:9000/tablecount}. Exits with 0 on
      * success and 1 on failure.
      *
      * @param args command-line arguments (unused)
      */
     public static void main(String[] args) throws Exception {
         Configuration configuration = new Configuration();
         Job job = Job.getInstance(configuration,"tablejob");
         job.setJarByClass(TableCount.class);

         job.setMapperClass(TableCountMapper.class);
         job.setReducerClass(TableCountReducer.class);
         // The reducer keeps its maximum in memory, so all keys must reach the
         // same reduce task for the result to be the global maximum.
         job.setNumReduceTasks(1);

         job.setMapOutputKeyClass(Text.class);
         job.setMapOutputValueClass(LongWritable.class);

         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(LongWritable.class);

         FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/log");
         FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/tablecount"));

         System.exit(job.waitForCompletion(true)?0:1);
     }
 }

MapReduce的几个实现的更多相关文章

Mapreduce的文件和hbase共同输入
Mapreduce的文件和hbase共同输入 package duogemap; import java.io.IOException; import org.apache.hadoop.co ...
mapreduce多文件输出的两方法
mapreduce多文件输出的两方法 package duogemap; import java.io.IOException; import org.apache.hadoop.conf ...
mapreduce中一个map多个输入路径
package duogemap; import java.io.IOException; import java.util.ArrayList; import java.util.List; imp ...
Hadoop 中利用 mapreduce 读写 mysql 数据
Hadoop 中利用 mapreduce 读写 mysql 数据有时候我们在项目中会遇到输入结果集很大,但是输出结果很小,比如一些 pv.uv 数据,然后为了实时查询的需求,或者一些 OLAP ...
[Hadoop in Action] 第5章高阶MapReduce
链接多个MapReduce作业执行多个数据集的联结生成Bloom filter 1.链接MapReduce作业 [顺序链接MapReduce作业] mapreduce-1 | mapr ...
MapReduce
2016-12-21 16:53:49 mapred-default.xml mapreduce.input.fileinputformat.split.minsize 0 The minimum ...
使用mapreduce计算环比的实例
最近做了一个小的mapreduce程序,主要目的是计算环比值最高的前5名,本来打算使用spark计算,可是本人目前spark还只是简单看了下,因此就先改用mapreduce计算了,今天和大家分享下这个 ...
MapReduce剖析笔记之八: Map输出数据的处理类MapOutputBuffer分析
在上一节我们分析了Child子进程启动,处理Map.Reduce任务的主要过程,但对于一些细节没有分析,这一节主要对MapOutputBuffer这个关键类进行分析. MapOutputBuffer顾 ...
MapReduce剖析笔记之七：Child子进程处理Map和Reduce任务的主要流程
在上一节我们分析了TaskTracker如何对JobTracker分配过来的任务进行初始化,并创建各类JVM启动所需的信息,最终创建JVM的整个过程,本节我们继续来看,JVM启动后,执行的是Child ...
MapReduce剖析笔记之六：TaskTracker初始化任务并启动JVM过程
在上面一节我们分析了JobTracker调用JobQueueTaskScheduler进行任务分配,JobQueueTaskScheduler又调用JobInProgress按照一定顺序查找任务的流程 ...

随机推荐

《转》我眼中的C# 3.0
本文转载自Allen Lee's Magic 缘起每次有新技术发布时,我们总能感受到两种截然不同的情绪:一种是恐惧和抵抗,伴随着这种情绪的还有诸如"C# 2.0用的挺好的,为什么要在C# ...
PHP中str_replace和substr_replace有什么区别?
两个函数的定义:(1)str_replace() 函数替换字符串中的一些字符(区分大小写). 该函数必须遵循下列规则: 如果搜索的字符串是一个数组,那么它将返回一个数组. 如果搜索的字符串是一个数组, ...
测试sql语句性能，提高执行效率
为了让您的程序执行的效率更高,SQL的效率一定不可忽视. 现有以下方法去检测SQL的执行效率. 对于多表查询的效率测试: )直接from ,where方式. SET STATISTICS io ON ...
.net 防盗链
Global.asax 文件中 protected void Application_BeginRequest(object sender, EventArgs e) { //判断当前请求是否是访问 ...
微信红包随机生成算法（PHP版）
/** * 求一个数的平方 * @param $n */ function sqr($n){ return $n*$n; } /** * 生产min和max之间的随机数,但是概率不是平均的,从min到 ...
java基础---->FilenameFilter之文件过滤
FilenameFilter用于对列表中文件名的过滤,今天我们就开始java中FilenameFilter的学习.好多年了,你一直在我的伤口中幽居,我放下过天地,却从未放下过你,我生命中的千山万水,任 ...
serializeArray()与serialize()的区别
serialize()序列化表单元素为字符串,用于 Ajax 请求. serializeArray()序列化表单元素为JSON数据. <script type="text/javasc ...
Egret Wing4.0.3 动画编辑器
一 exml上摆放组件切换动画编辑创建动画组,命名test1. 选中一个对象,创建动画(必须选中一个对象后,+号才会亮.且一个对象只能创建一个动画) 之后和Flash差不多.在时间轴插入关键帧. ...
【算法】N Queens Problem
/* ** 目前最快的N皇后递归解决方法 ** N Queens Problem ** 试探-回溯算法,递归实现 */ #include "stdafx.h" #include & ...
iOS使用位置和方向服务（来自苹果apple官方）
版权声明:本文为博主原创文章,未经博主允许不得转载. 目录(?)[+] 本文章来自苹果官方文档,特此声明--------禚 Core Location框架为定位用户当前位置和方向(Headin ...

MapReduce的几个实现

MapReduce的几个实现的更多相关文章

随机推荐

热门专题