MapReduce数据筛选

需求：

编写MapReduce程序，算出高峰时间段（如9-10点）哪张表被访问得最频繁，以及这段时间内访问这张表次数最多的用户，以及这个用户访问这张表的总时间开销。

测试数据：

TableName(表名)，Time(时间)，User(用户)，TimeSpan(时间开销)

*t003 6:00 u002 180

*t003 7:00 u002 180

*t003 7:08 u002 180

*t003 7:25 u002 180

*t002 8:00 u002 180

*t001 8:00 u001 240

*t001 9:00 u002 300

*t001 9:11 u001 240

*t003 9:26 u001 180

*t001 9:39 u001 300

*t001 10:00 u001 200

代码

方法一：

package com.table.main;

import java.io.IOException;

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TableUsed {

	public static class MRMapper extends Mapper<LongWritable, Text, Text, Text> {

		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

			String[] split = value.toString().substring(1).split("\\s+");

			Long time = Long.parseLong(split[1].charAt(0) + "");

			// 筛选9-10点使用过的表

			if (time == 9 || time == 10) {

				context.write(new Text(split[0]), new Text(split[2] + ":" + split[3]));

			}

		}

	}

	public static class MRReducer extends Reducer<Text, Text, Text, Text> {

		// 存放使用量最大的表的表名及用户

		public static HashMap<String, HashMap<String, Integer>> map = new HashMap<String, HashMap<String, Integer>>();

		// 最大用使用量

		public static int max_used_num = 0;

		// 使用量最大的表

		public static String table = "";

		protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

			HashMap<String, Integer> user_map = new HashMap<String, Integer>();

			int table_used_num = 0;

			for (Text t : values) {

				table_used_num++;

				String[] split = t.toString().split(":");

				// 如map中已经存在的用户则把使用时间叠加 不存在则添加该用户

				if (user_map.get(split[0]) == null) {

					user_map.put(split[0], Integer.parseInt(split[1]));

				} else {

					Integer use_time = user_map.get(split[0]);

					use_time += Integer.parseInt(split[1]);

					user_map.put(split[0], use_time);

				}

			}

			if (table_used_num > max_used_num) {

				map.put(key.toString(), user_map);

				table = key.toString();

				max_used_num = table_used_num;

			}

		}

		protected void cleanup(Context context) throws IOException, InterruptedException {

			// 循环map，查出使用时间最长的用户信息

			HashMap<String, Integer> map2 = map.get(table);

			int max = 0;

			String max_used_user = "";

			for (HashMap.Entry<String, Integer> m : map2.entrySet()) {

				if (m.getValue() > max) {

					max = m.getValue();

					max_used_user = m.getKey();

				}

			}

			context.write(new Text(table), new Text("\t" + max_used_user + "\t" + map2.get(max_used_user)));

		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(TableUsed.class);

		job.setMapperClass(MRMapper.class);

		job.setReducerClass(MRReducer.class);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop5:9000/input/table_time.txt"));

		FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop5:9000/output/put2"));

		System.out.println(job.waitForCompletion(true) ? 1 : 0);

	}

}

缺点：只算出了使用时间最长的用户，没有判断该用户是否也是使用次数最多的用户。

方法二：

package com.table.main;

import java.io.IOException;

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TableUsed {

	public static class MRMapper extends Mapper<LongWritable, Text, Text, Text> {

		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

			String[] split = value.toString().substring(1).split("\\s+");

			Long time = Long.parseLong(split[1].charAt(0) + "");

			// 筛选9-10点使用过的表

			if (time == 9 || time == 10) {

				context.write(new Text(split[0]), new Text(split[2] + ":" + split[3]));

			}

		}

	}

	public static class MRReducer extends Reducer<Text, Text, Text, Text> {

		// 					表的最大使用次数		使用该表最多的用户

		public static int max_used_num = 0, max_user_used = 0;

		//						使用量最大的表		使用该表最多的用户名

		public static String max_used_table = "", user_name = "";

		// 					使用次数最多的用户的 使用时间

		public static Integer user_used_time = 0;

		protected void reduce(Text key, Iterable<Text> values, Context context)

				throws IOException, InterruptedException {

			HashMap<String, Integer> user_map = new HashMap<String, Integer>();

			HashMap<String, Integer> user_used_map = new HashMap<String, Integer>();

			int table_used_num = 0;// 表的使用次数

			Integer use_num = 0;// 用户使用次数

			Integer use_time = 0;//使用时间

			String username = "";//用户名

			for (Text t : values) {

				table_used_num++;

				String[] split = t.toString().split(":");

				// 如map中已经存在的用户则把使用时间叠加 不存在则添加该用户

				if (user_map.get(split[0]) == null) {

					user_map.put(split[0], Integer.parseInt(split[1]));

					user_used_map.put(split[0], 1);

				} else {

					use_time = user_map.get(split[0]);

					use_time += Integer.parseInt(split[1]);

					user_map.put(split[0], use_time);

					use_num = user_used_map.get(split[0]);

					use_num ++;

					user_used_map.put(split[0], use_num);

				}

				/**

				 * 判断该用户是否为此表使用次数最多的,

				 * 是则存进user_map和user_used_map，否则不存;

				 * 由于只需要求使用量最多的用户,因此使用量不是最多用户没有必要存在于map中

				 */

				if (use_num > max_user_used) {

					username = split[0];

					max_user_used = use_num;

					user_used_time = use_time;

					//此处也可以不remove()

					user_used_map.remove(split[0]);

					user_map.remove(split[0]);

				}

			}

			if (table_used_num > max_used_num) {

				max_used_table = key.toString();

				max_used_num = table_used_num;

				user_name = username;

			}

		}

		protected void cleanup(Context context) throws IOException, InterruptedException {

			context.write(new Text(max_used_table), new Text(max_user_used + "\t" + user_name + "\t" + user_used_time));

		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(TableUsed.class);

		job.setMapperClass(MRMapper.class);

		job.setReducerClass(MRReducer.class);

		job.setOutputKeyClass(Text.class);

		job.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop5:9000/input/table_time.txt"));

		FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop5:9000/output/put6"));

		System.out.println(job.waitForCompletion(true) ? 1 : 0);

	}

}

MapReduce数据筛选的更多相关文章

ASP.NET MVC5+EF6+EasyUI 后台管理系统（81）-数据筛选（万能查询）
系列目录前言听标题的名字似乎是一个非常牛X复杂的功能,但是实际上它确实是非常复杂的,我们本节将演示如何实现对数据,进行组合查询(数据筛选) 我们都知道Excel中是如何筛选数据的.就像下面一样他 ...
DataGridView如何实现列标头带数据筛选功能，就象Excel高级筛选功能一样
'近日有本论坛网友问:DataGridView如何实现列标头带数据筛选功能,就象Excel高级筛选功能一样 '今晚正好闲着没事,加之以前也没用到过这个需求,所以就写了个模拟功能,供各位坛友酌情参考. ...
layui table 根据条件改变更换表格颜色高亮显示数据筛选
请问想让当layui表格的某个字段符合某个条件的时候,让该行变颜色.这样可以实现么. layui数据表格怎么更换表格颜色 layui表格通过判断某一行中的某一列的值进行设置这一行的颜色 LayUI之 ...
C#进行数据筛选（二）
这里介绍LINQ+Lambda表达式进行数据筛选的方式这里是第一种方式,还是使用了if条件语句去判断,根据选择的条件去筛选出我所需要的数据 public GxAnaly SelectDay(stri ...
C#进行数据筛选（一）
这里介绍数据筛选的第一种方式,不用过滤器,给新手看得 public DataTable SourceList(string Wmain, string OrderNo, string Process) ...
python之pandas数据筛选和csv操作
本博主要总结DaraFrame数据筛选方法(loc,iloc,ix,at,iat),并以操作csv文件为例进行说明 1. 数据筛选 a b c (1)单条件筛选 df[df[] # 如果想筛选a列的取 ...
Pandas 数据筛选,去重结合group by
Pandas 数据筛选,去重结合group by 需求今小伙伴有一个Excel表, 是部门里的小伙9月份打卡记录, 关键字段如下: 姓名, 工号, 日期, 打卡方式, 时间, 详细位置, IP地址. ...
【杂记】mysql 左右连接查询中的NULL的数据筛选问题，查询NULL设置默认值，DATE_FORMAT函数
MySQL左右连接查询中的NULL的数据筛选问题 xpression 为 Null,则 IsNull 将返回 True:否则 IsNull 将返回 False. 如果 expression 由多个变量 ...
4-Pandas之数据类型与数据筛选
一.数据类型 1.Pandas的数据类型主要结合了pandas和numpy两个模块中的数据类型,包括以下几种: float int bool datetime64[ns]------>日期类型 ...

随机推荐

爬虫实战【10】利用Selenium自动登陆京东签到领金币
今天我们来讲一下如何通过python来实现自动登陆京东,以及签到领取金币. 如何自动登陆京东? 我们先来看一下京东的登陆页面,如下图所示: [插入图片,登陆页面] 登陆框就是右面这一个框框了,但是目前 ...
[cocos2dx] cocosdx编译工程那些事
cocos compile -p android 上面这条命令可以将cocos2dx的工程编译出android apk,需要注意的是如果有新增的cpp文件,都需要在“CocosProject\proj ...
HDU 1879 继续畅通工程（Kruskra）
继续畅通工程 Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 32768/32768 K (Java/Others) Total Sub ...
FineReport----查询功能的知识点
1.设置日期控件,默认当前日期 2.默认不查询选择参数:点击查询前不显示报表内容
Spoken English Practice（not always estimating your status in other's hearts. you will lose yourself when you live in other's look. do your best and walk on you own way.）
绿色:连读: 红色:略读: 蓝色:浊化: 橙色:弱读下划线_为浊化口语蜕变(2017/7/8) 英 ...
<2014 05 09> 程序员：从C++转到Java需注意的地方
最近想玩玩Android的APP开发,从C++角度来学习Java.Java可以说是一个优化精简版的C++,去除了底层C的很多特性.找了这篇文章. --------------------------- ...
ES6 Promise对象then方法链式调用
then()方法的作用是Promise实例添加解决(fulfillment)和拒绝(rejection)状态的回调函数.then()方法会返回一个新的Promise实例,所以then()方法后面可以继 ...
三.实例演示insert/update/delect更新数据库
1.逻辑图 2.只是准备 3.代码展示 import pymysql conn=pymysql.connect( host='192.168.199.249', port=3306, user='ro ...
我的Android进阶之旅------>解决Android Studio编译后安装apk报错：The APK file does not exist on disk
1.错误描述今天用Android Studio编译应用后安装APK的时候,报错了,错误如下所示: The APK file build\outputs\apk\OYP_2.3.4_I2Base_64 ...
Python面向对象高级编程-_slots_
使用_slots_ 正常情况下,当定义一个class,创建一个class的实例后,可以给实例绑定任何属性和方法,这就是动态语言的灵活性.先定义class: >>> class Stu ...

MapReduce数据筛选

需求：

测试数据：

代码

MapReduce数据筛选的更多相关文章

随机推荐

热门专题