1. Inverted index implementation

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex {

    public static class InvertedIndexMap extends Mapper<Object, Text, Text, Text> {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the FileSplit this <key, value> pair belongs to
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreElements()) {
                // The key is composed of (word:URI)
                keyInfo.set(stk.nextToken() + ":" + split.getPath().toString());
                // The value is the term-frequency contribution
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
        private Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Sum the term frequency for this (word:URI) key
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Reset the value to (URI:frequency)
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // Reset the key to the word alone
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }

    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build the document (posting) list for this word
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndexMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
        FileOutputFormat.setOutputPath(job, new Path("./out/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
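
To make the three stages easier to follow, here is a rough trace of the intermediate records for the word "hadoop" across two hypothetical input files doc1.txt and doc2.txt (the file names are illustrative only, not part of the job above):

map output      : <"hadoop:hdfs://.../doc1.txt", "1">, <"hadoop:hdfs://.../doc1.txt", "1">, <"hadoop:hdfs://.../doc2.txt", "1">
combiner output : <"hadoop", "hdfs://.../doc1.txt:2">, <"hadoop", "hdfs://.../doc2.txt:1">
reduce output   : <"hadoop", "hdfs://.../doc1.txt:2;hdfs://.../doc2.txt:1;">

Note that the combiner rewrites the key from (word:URI) to (word), while partitioning is computed on the original map output key. This is only safe because the job keeps the default single reduce task; with several reducers the posting list of one word could be split across output files.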

2. Word count

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    // Map/reduce key and value types must be Hadoop Writables; plain String is not
    // a valid key or value type, so Text is used throughout.
    public static class WordMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String w : words) {
                word.set(w);
                context.write(word, one);
            }
        }
    }

    public static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private static IntWritable ans = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable count : values) {
                sum += count.get();
            }
            ans.set(sum);
            context.write(key, ans);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordMapper.class);
        job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
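
The mapper above splits only on single spaces, so case and punctuation differences ("Hadoop" vs. "hadoop,") are counted as different words. A minimal variant I would sketch (not part of the original post; it additionally needs java.util.StringTokenizer on the import list) normalizes each token before emitting it:

    // Hypothetical drop-in replacement for WordMapper: tokenizes on whitespace and
    // normalizes tokens so that "Hadoop" and "hadoop," count as the same word.
    public static class NormalizingWordMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                // Lower-case and strip everything except ASCII letters and digits; skip empty tokens.
                String token = itr.nextToken().toLowerCase().replaceAll("[^a-z0-9]", "");
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, one);
                }
            }
        }
    }

Swapping it in only takes job.setMapperClass(NormalizingWordMapper.class); the combiner and reducer stay unchanged.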

3. Finding the most frequently accessed table

import java.io.IOException;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Analyze a large log file with Hadoop. Each line records:
 * TableName, Time, User, TimeSpan (time cost).
 * Task: write a MapReduce program that finds which table is accessed most
 * frequently during the peak period (e.g. 9-10 o'clock), which user accessed
 * that table the most during that period, and that user's total time cost on it.
 * @author drguo
 * t003 6:00 u002 180
 * t003 7:00 u002 180
 * t003 7:08 u002 180
 * t003 7:25 u002 180
 * t002 8:00 u002 180
 * t001 8:00 u001 240
 * t001 9:00 u002 300
 * t001 9:11 u001 240
 * t003 9:26 u001 180
 * t001 9:39 u001 300
 *
 * Step 1: find the table with the most accesses between 9 and 10.
 */
//club.drguo.xx.mapreduce.tablecount.TableCount
public class TableCount {

    public static class TableCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strings = StringUtils.split(line, " ");
            String tabName = strings[0];
            String time = strings[1];
            String[] times = time.split(":");
            int hour = Integer.parseInt(times[0]);
            k.set(tabName);
            // Only count accesses that fall in the 9-10 o'clock window
            if (hour == 9) {
                context.write(k, new LongWritable(1));
                System.out.println("-----------------------------------------------" + k);
            }
        }
    }

    public static class TableCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private TreeMap<Text, Long> map = new TreeMap<Text, Long>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Copy the key: the framework reuses the same Text object across calls,
            // so "Text tabName = key;" would leave every map entry pointing at the last key.
            Text tabName = new Text(key.toString());
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            System.out.println(tabName + "--------------------------" + count);
            map.put(tabName, count);
        }

        @Override
        protected void cleanup(Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Emit only the table with the highest access count
            Text tableName = null;
            long maxCount = 0L;
            for (Text key : map.keySet()) {
                System.out.println("key=" + key + "-----------------value=" + map.get(key));
                if (map.get(key) > maxCount) {
                    maxCount = map.get(key);
                    tableName = key;
                }
            }
            if (tableName != null) {
                context.write(tableName, new LongWritable(maxCount));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "tablejob");
        job.setJarByClass(TableCount.class);
        job.setMapperClass(TableCountMapper.class);
        job.setReducerClass(TableCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/log");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/tablecount"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
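
The job above only covers step 1 of the problem statement. One way to handle step 2 — finding the heaviest user of the winning table and that user's total TimeSpan — is a second job that receives the table name through the Configuration. The sketch below is my own assumption of how that could look (the class name TopUserForTable and the target.table property are made up for illustration), not code from the original post:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class TopUserForTable {

    public static class TopUserMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private String targetTable;

        @Override
        protected void setup(Context context) {
            // Set before submission, e.g. configuration.set("target.table", "t001")
            targetTable = context.getConfiguration().get("target.table");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = StringUtils.split(value.toString(), " ");
            String tabName = fields[0];
            int hour = Integer.parseInt(fields[1].split(":")[0]);
            String user = fields[2];
            long timeSpan = Long.parseLong(fields[3]);
            // Keep only the 9-10 o'clock accesses to the table found by TableCount
            if (hour == 9 && tabName.equals(targetTable)) {
                context.write(new Text(user), new LongWritable(timeSpan));
            }
        }
    }

    public static class TopUserReducer extends Reducer<Text, LongWritable, Text, Text> {
        // user -> {access count, total time span}
        private final Map<String, long[]> stats = new HashMap<String, long[]>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0;
            long totalTime = 0;
            for (LongWritable v : values) {
                count++;
                totalTime += v.get();
            }
            stats.put(key.toString(), new long[] { count, totalTime });
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the single user with the highest access count and that user's total time cost
            String topUser = null;
            long maxCount = 0;
            long topTime = 0;
            for (Map.Entry<String, long[]> e : stats.entrySet()) {
                if (e.getValue()[0] > maxCount) {
                    maxCount = e.getValue()[0];
                    topUser = e.getKey();
                    topTime = e.getValue()[1];
                }
            }
            if (topUser != null) {
                context.write(new Text(topUser), new Text("count=" + maxCount + ", totalTimeSpan=" + topTime));
            }
        }
    }
}

Its driver would mirror TableCount's main method, except that it also calls configuration.set("target.table", "t001"), sets the map output value class to LongWritable, and sets the final output value class to Text. With the sample log above it would emit u001 with count=2 and totalTimeSpan=540.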
