1.倒排索引的实现

 import java.io.IOException;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class InvertedIndex { public static class InvertedIndexMap extends Mapper<Object,Text,Text,Text>{ private Text valueInfo = new Text();
private Text keyInfo = new Text();
private FileSplit split; public void map(Object key, Text value,Context context)
throws IOException, InterruptedException {
//获取<key value>对所属的FileSplit对象
split = (FileSplit) context.getInputSplit();
StringTokenizer stk = new StringTokenizer(value.toString());
while (stk.hasMoreElements()) {
//key值由(单词:URI)组成
keyInfo.set(stk.nextToken()+":"+split.getPath().toString());
//词频
valueInfo.set("1");
context.write(keyInfo, valueInfo);
}
}
} public static class InvertedIndexCombiner extends Reducer<Text,Text,Text,Text>{ Text info = new Text(); public void reduce(Text key, Iterable<Text> values,Context contex)
throws IOException, InterruptedException {
int sum = 0;
for (Text value : values) {
sum += Integer.parseInt(value.toString());
}
int splitIndex = key.toString().indexOf(":");
//重新设置value值由(URI+:词频组成)
info.set(key.toString().substring(splitIndex+1) +":"+ sum);
//重新设置key值为单词
key.set(key.toString().substring(0,splitIndex));
contex.write(key, info);
}
} public static class InvertedIndexReduce extends Reducer<Text,Text,Text,Text>{ private Text result = new Text(); public void reduce(Text key, Iterable<Text> values,Context contex)
throws IOException, InterruptedException {
//生成文档列表
String fileList = new String();
for (Text value : values) {
fileList += value.toString()+";";
}
result.set(fileList);
contex.write(key, result);
}
} public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); Job job = new Job(conf,"InvertedIndex"); job.setJarByClass(InvertedIndex.class); job.setMapperClass(InvertedIndexMap.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class); job.setCombinerClass(InvertedIndexCombiner.class); job.setReducerClass(InvertedIndexReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
FileOutputFormat.setOutputPath(job, new Path("./out/")); System.exit(job.waitForCompletion(true)?0:1); }
}

2.word count

 import java.io.IOException;

 import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser; public class WordCount {
public static class WordMapper extends Mapper<Object, String, String, IntWritable> {
private static final IntWritable one = new IntWritable(1);
public void map(Object key, String value, Context context) throws IOException, InterruptedException {
String[] words = value.split(" ");
for (String word : words) {
context.write(word, one);
}
}
}
public static class WordReducer extends Reducer<String, Iterable<IntWritable>, String, IntWritable> {
private static IntWritable ans = new IntWritable();
public void reduce(String key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable count : value) {
sum += count.get();
}
ans.set(sum);
context.write(key, ans);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordCount <int> <count>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordMapper.class);
job.setCombinerClass(WordReducer.class);
job.setReducerClass(WordReducer.class);
job.setOutputKeyClass(String.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}

3.找出访问量最多的表

 import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;

/**
 * Use Hadoop to analyze massive log files. Each log line records:
 * TableName, Time, User, TimeSpan (time cost).
 *
 * <p>The overall task is to write MapReduce programs that compute which table
 * is accessed most frequently during the peak period (e.g. 9-10 o'clock),
 * which user accessed that table the most during that period, and that
 * user's total time cost on the table.
 *
 * <p>Sample input:
 * <pre>
 * t003 6:00 u002 180
 * t003 7:00 u002 180
 * t003 7:08 u002 180
 * t003 7:25 u002 180
 * t002 8:00 u002 180
 * t001 8:00 u001 240
 * t001 9:00 u002 300
 * t001 9:11 u001 240
 * t003 9:26 u001 180
 * t001 9:39 u001 300
 * </pre>
 *
 * <p>This job is the first step: find the table with the most accesses
 * between 9 and 10 o'clock.
 *
 * @author drguo
 */
//club.drguo.xx.mapreduce.tablecount.TableCount
public class TableCount {

    /** Hour of day (24h) whose accesses are counted. */
    private static final int PEAK_HOUR = 9;

    /** Emits {@code <tableName, 1>} for every log line whose hour is the peak hour. */
    public static class TableCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private static final LongWritable ONE = new LongWritable(1);
        private final Text k = new Text();

        /**
         * @param key     input key supplied by the input format (unused)
         * @param value   one log line: {@code table time user timespan}, space separated
         * @param context task context used to emit output
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strings = StringUtils.split(line, " ");
            // Blank or truncated lines carry no table/time pair; skip them
            // instead of failing the whole task.
            if (strings == null || strings.length < 2) {
                return;
            }
            String tabName = strings[0];
            String time = strings[1];
            String[] times = time.split(":");
            int hour;
            try {
                hour = Integer.parseInt(times[0]);
            } catch (NumberFormatException e) {
                // Unparseable time field: treat the line as malformed and skip it.
                return;
            }
            if (hour == PEAK_HOUR) {
                k.set(tabName);
                context.write(k, ONE);
                System.out.println("-----------------------------------------------"+k);
            }
        }
    }

    /**
     * Sums the accesses per table and, once all keys have been seen, writes
     * only the table with the highest count. The result is a global maximum
     * only when the job runs with a single reducer.
     */
    public static class TableCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private final TreeMap<Text, Long> map = new TreeMap<Text, Long>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Copy the key instead of storing it directly: the stored value must
            // not alias the Text instance passed in by the framework.
            Text tabName = new Text(key.toString());
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            System.out.println(tabName+"--------------------------"+count);
            map.put(tabName, count);
        }

        /**
         * Writes the table with the largest count; on ties the first table in
         * key order wins. Writes nothing if no record matched the peak hour.
         */
        @Override
        protected void cleanup(Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            Text tableName = null;
            long maxCount = 0L;
            for (Map.Entry<Text, Long> entry : map.entrySet()) {
                System.out.println("key="+entry.getKey()+"-----------------value="+entry.getValue());
                if (entry.getValue() > maxCount) {
                    maxCount = entry.getValue();
                    tableName = entry.getKey();
                }
            }
            if (tableName != null) {
                context.write(tableName, new LongWritable(maxCount));
            }
        }
    }

    /**
     * Configures and runs the job, reading {@code hdfs://localhost:9000/log}
     * and writing to {@code hdfs://localhost:9000/tablecount}. Exits with 0
     * on success, 1 on failure.
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration,"tablejob");
        job.setJarByClass(TableCount.class);

        job.setMapperClass(TableCountMapper.class);
        job.setReducerClass(TableCountReducer.class);
        // The reducer picks the maximum in cleanup(); with more than one reducer
        // each would emit its own local maximum.
        job.setNumReduceTasks(1);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/log");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/tablecount"));

        System.exit(job.waitForCompletion(true)?0:1);
    }
}

MapReduce的几个实现的更多相关文章

  1. Mapreduce的文件和hbase共同输入

    Mapreduce的文件和hbase共同输入 package duogemap;   import java.io.IOException;   import org.apache.hadoop.co ...

  2. mapreduce多文件输出的两方法

    mapreduce多文件输出的两方法   package duogemap;   import java.io.IOException;   import org.apache.hadoop.conf ...

  3. mapreduce中一个map多个输入路径

    package duogemap; import java.io.IOException; import java.util.ArrayList; import java.util.List; imp ...

  4. Hadoop 中利用 mapreduce 读写 mysql 数据

    Hadoop 中利用 mapreduce 读写 mysql 数据   有时候我们在项目中会遇到输入结果集很大,但是输出结果很小,比如一些 pv.uv 数据,然后为了实时查询的需求,或者一些 OLAP ...

  5. [Hadoop in Action] 第5章 高阶MapReduce

    链接多个MapReduce作业 执行多个数据集的联结 生成Bloom filter   1.链接MapReduce作业   [顺序链接MapReduce作业]   mapreduce-1 | mapr ...

  6. MapReduce

    2016-12-21  16:53:49 mapred-default.xml mapreduce.input.fileinputformat.split.minsize 0 The minimum ...

  7. 使用mapreduce计算环比的实例

    最近做了一个小的mapreduce程序,主要目的是计算环比值最高的前5名,本来打算使用spark计算,可是本人目前spark还只是简单看了下,因此就先改用mapreduce计算了,今天和大家分享下这个 ...

  8. MapReduce剖析笔记之八: Map输出数据的处理类MapOutputBuffer分析

    在上一节我们分析了Child子进程启动,处理Map.Reduce任务的主要过程,但对于一些细节没有分析,这一节主要对MapOutputBuffer这个关键类进行分析. MapOutputBuffer顾 ...

  9. MapReduce剖析笔记之七:Child子进程处理Map和Reduce任务的主要流程

    在上一节我们分析了TaskTracker如何对JobTracker分配过来的任务进行初始化,并创建各类JVM启动所需的信息,最终创建JVM的整个过程,本节我们继续来看,JVM启动后,执行的是Child ...

  10. MapReduce剖析笔记之六:TaskTracker初始化任务并启动JVM过程

    在上面一节我们分析了JobTracker调用JobQueueTaskScheduler进行任务分配,JobQueueTaskScheduler又调用JobInProgress按照一定顺序查找任务的流程 ...

随机推荐

  1. develop brew app from here

    https://brewx.qualcomm.com/brew/sdk/download.jsp?page=dx/en/brew31/ad/tl/overview the email is silen ...

  2. Cocos2d-x 3.0 Lua编程 之 响应Android手机的按键

    演示样例代码例如以下所看到的: local listenerKey= cc.EventListenerKeyboard:create() local function onKeyReleaseed(k ...

  3. EF--CodeFirst

    1,增加EntityFramework的引用 2,创建实体类 public class Invoice { public Invoice() { LineItems = new List<Lin ...

  4. laravel 模版赋值

    1)一般赋值是直接用view助手函数返回的 return view('Index/index', ['key'=>'value']); 2)一般做系统时,我们都会有一个共同控制器,其他控制器继承 ...

  5. vuejs开发环境搭建

    前言:现在前端最火的是3个框架:react,vue,angular.可以说着是哪个框架大大改变了前端的地位.相对于angular来说.vue同样拥有丰富的指令,并且都是典型的MVC框架,但是vue比较 ...

  6. mySQL数据库一:数据类型

    integer(整型)varchar(字符串类型,必须要跟最大字符串)text(大文本)float(单精度,即七到八位有效数字)double(双精度,即15到16位有效数字)date(只有年月日)ti ...

  7. UITableView划动删除的实现

    对于app应用来说,使用列表的形式展现数据非UITableView莫属.在熟练掌握了用UITableView展示数据以后,是不是也遇到了需要删除数据的需求?是不是觉得在一行数据上划动一下,然后出现一个 ...

  8. 如何提高AJAX客户端响应速度

    AJAX的出现极大的改变了Web应用客户端的操作模式,它使的用户可以在全心工作时不必频繁的忍受那令人厌恶的页面刷新.理论上AJAX技术在很大的程度上可以减少用户操作的等待时间,同时节约网络上的数据流量 ...

  9. memcached与redis实现的对比

    版权声明:本文由田京昆原创文章,转载请注明出处: 文章原文链接:https://www.qcloud.com/community/article/129 来源:腾云阁 https://www.qclo ...

  10. CRUX下实现进程隐藏(3)

    通过一个内核模块拦截文件系统的回调函数来实现进程隐藏. VFS(Virtual File System)是Linux在实际文件系统(如ext3,ext4,vfat等)上抽象出的一个文件系统模型,简单来 ...