1. Inverted Index Implementation

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex {

    public static class InvertedIndexMap extends Mapper<Object, Text, Text, Text> {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the FileSplit that this <key, value> pair belongs to
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreElements()) {
                // The key consists of (word:URI)
                keyInfo.set(stk.nextToken() + ":" + split.getPath().toString());
                // Term frequency
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
        private Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Reset the value to (URI:term frequency)
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // Reset the key to the word alone
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }

    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build the document list
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "InvertedIndex");
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndexMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
        FileOutputFormat.setOutputPath(job, new Path("./out/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
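
The key design point is that the mapper emits a composite word:URI key, so the combiner can total the per-file counts locally and rewrite the key back to the bare word before the shuffle. As a rough trace, assuming a word "hadoop" that appears twice in file1 and once in file2 (paths shortened; real keys carry the full HDFS path from split.getPath()):

    map output:      hadoop:hdfs://.../file1 -> 1  (emitted twice),  hadoop:hdfs://.../file2 -> 1
    combiner output: hadoop -> hdfs://.../file1:2,  hadoop -> hdfs://.../file2:1
    reduce output:   hadoop    hdfs://.../file1:2;hdfs://.../file2:1;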

2. Word Count

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    public static class WordMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Hadoop keys and values must be Writable types, so emit Text rather than String
            String[] words = value.toString().split(" ");
            for (String w : words) {
                word.set(w);
                context.write(word, one);
            }
        }
    }

    public static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private static IntWritable ans = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable count : values) {
                sum += count.get();
            }
            ans.set(sum);
            context.write(key, ans);
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordMapper.class);
        job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
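
A minimal way to try the job, assuming the class is packaged into a jar named wordcount.jar (hypothetical name) and the input text already sits in HDFS:

    hadoop jar wordcount.jar WordCount /user/hadoop/wordcount/in /user/hadoop/wordcount/out

The output directory must not exist beforehand, and because WordReducer is also registered as the combiner, partial sums are merged on the map side before the shuffle.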

3. Finding the Most Frequently Accessed Table

import java.io.IOException;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Analyze massive log files with Hadoop. Each log line records:
 * TableName, Time, User, TimeSpan (time spent).
 * The task is to write a MapReduce program that finds which table is accessed
 * most frequently during the peak hour (e.g., 9-10 o'clock), which user accesses
 * that table the most during that hour, and that user's total time spent on it.
 * Sample data:
 *   t003 6:00 u002 180
 *   t003 7:00 u002 180
 *   t003 7:08 u002 180
 *   t003 7:25 u002 180
 *   t002 8:00 u002 180
 *   t001 8:00 u001 240
 *   t001 9:00 u002 300
 *   t001 9:11 u001 240
 *   t003 9:26 u001 180
 *   t001 9:39 u001 300
 *
 * This program handles the first step: find the table with the most accesses
 * between 9 and 10 o'clock.
 * @author drguo
 */
//club.drguo.xx.mapreduce.tablecount.TableCount
public class TableCount {

    public static class TableCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strings = StringUtils.split(line, " ");
            String tabName = strings[0];
            String time = strings[1];
            String[] times = time.split(":");
            int hour = Integer.parseInt(times[0]);
            k.set(tabName);
            // Only count accesses that fall into the 9-10 o'clock window
            if (hour == 9) {
                context.write(k, new LongWritable(1));
                System.out.println("-----------------------------------------------" + k);
            }
        }
    }

    public static class TableCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private TreeMap<Text, Long> map = new TreeMap<Text, Long>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Copy the key; do not write "Text tabName = key;" directly, because Hadoop reuses the key object
            Text tabName = new Text(key.toString());
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            System.out.println(tabName + "--------------------------" + count);
            map.put(tabName, count);
        }

        @Override
        protected void cleanup(Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // After all keys have been reduced, pick the table with the largest count
            Text tableName = null;
            Long maxCount = 0L;
            for (Text key : map.keySet()) {
                System.out.println("key=" + key + "-----------------value=" + map.get(key));
                if (map.get(key) > maxCount) {
                    maxCount = map.get(key);
                    tableName = key;
                }
            }
            context.write(tableName, new LongWritable(maxCount));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "tablejob");
        job.setJarByClass(TableCount.class);
        job.setMapperClass(TableCountMapper.class);
        job.setReducerClass(TableCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/log");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/tablecount"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
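
With the sample records listed in the class comment, only four lines fall in the 9 o'clock window (three accesses to t001 and one to t003), so the reducer's cleanup step should write roughly the following to the output directory (key and value separated by the default tab of TextOutputFormat):

    t001    3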
