MR案例:基站相关01
字段解释: product_no:用户手机号; lac_id:用户所在基站; start_time:用户在此基站的开始时间; staytime:用户在此基站的逗留时间。
product_no lac_id moment start_time user_id county_id staytime city_id
-- ::19.151754088
-- ::20.152622488
-- ::37.149593624
-- ::51.139539816
-- ::45.150276800
-- ::38.140225200
-- ::19.151754088
-- ::32.151754088
-- ::24.139539816
-- ::30.152622440
需求描述: 根据 lac_id 和 start_time 知道用户当时的位置,根据 staytime 知道用户各个基站的逗留时长。根据轨迹合并连续基站的 staytime。最终得到每一个用户按时间排序在每一个基站驻留时长。
期望输出:
-- ::20.152622488
-- ::37.149593624
-- ::38.140225200
-- ::51.139539816
-- ::45.150276800
问题分析:针对每个product_no按照start_time进行排序(本例降序),如果相邻两项的lac_id相同,则将staytime进行相加保存到后一项中,并将前一项移除。
完整代码v1:此版本只启用了Map阶段。map()函数:将每行内容解析成自定义的RecordWritable对象并添加到List集合中,然后对List集合进行排序。cleanup()函数:将product_no和lac_id相同的相邻两项中的staytime进行相加。
缺点:将全部数据添加到List集合,对于大数据量无法满足要求。
package demo0902; import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class Demo090203 {
/** HDFS location the job reads its input from. */
final static String INPUT_PATH = "hdfs://10.16.17.182:9000/test/in/0902/";
/** HDFS location the job writes its output to. */
final static String OUT_PATH = "hdfs://10.16.17.182:9000/test/out/0902/06";

/**
 * Builds the job, wires in the mapper and the key/value types, and blocks until the job ends.
 *
 * <p>No reducer class is configured. The boolean returned by
 * {@code waitForCompletion} is not inspected.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = Job.getInstance(new Configuration());
    job.setJarByClass(Demo090203.class);

    // Mapper and the types it emits.
    job.setMapperClass(Demo090201Mapper.class);
    job.setMapOutputKeyClass(RecordWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    // Types of the final job output.
    job.setOutputKeyClass(RecordWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Where to read from and where to write to.
    Path inputDir = new Path(INPUT_PATH);
    Path outputDir = new Path(OUT_PATH);
    FileInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.waitForCompletion(true);
}
// Mapper
/**
 * Buffers every input line as a {@link RecordWritable} and, once all lines have been seen,
 * emits the records in sorted order with runs of consecutive records that share the same
 * phone number and base station collapsed into one record.
 *
 * <p>Ordering is the one defined by {@code RecordWritable.compareTo}: phone number ascending,
 * then start time descending. For each run, the LAST record of the run (in that order) is
 * emitted and carries the summed {@code staytime} of the whole run.
 *
 * <p>All records handled by this mapper instance are held in memory until {@code cleanup}.
 * NOTE(review): merging only sees the records of this one mapper instance; if the input is
 * divided among several map tasks, records of one user in different tasks are not merged.
 */
public static class Demo090201Mapper extends Mapper<LongWritable, Text, RecordWritable, NullWritable> {

    // Every record parsed so far; sorted and merged in cleanup().
    ArrayList<RecordWritable> list = new ArrayList<RecordWritable>();

    /**
     * Parses one tab-separated line into a record and buffers it.
     *
     * <p>The line must have at least 8 columns, and columns 2 and 6 must be integers;
     * otherwise an {@code ArrayIndexOutOfBoundsException} or {@code NumberFormatException}
     * propagates out of this method.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] splited = value.toString().split("\t");

        // Assemble one record from the columns of the line.
        RecordWritable record = new RecordWritable();
        record.product_no = splited[0];
        record.lac_id = splited[1];
        record.moment = Integer.parseInt(splited[2]);
        record.start_time = splited[3];
        record.user_id = splited[4];
        record.county_id = splited[5];
        record.staytime = Integer.parseInt(splited[6]);
        record.city_id = splited[7];

        // Sorting is deliberately NOT done here: re-sorting the whole buffer after every
        // line is wasted work. The buffer is sorted once in cleanup().
        list.add(record);
    }

    /**
     * Sorts the buffered records, merges consecutive same-user/same-station records and
     * writes the result.
     */
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        // RecordWritable is Comparable, so its natural order is used. The sort is stable.
        Collections.sort(list);

        // Debug dump of the sorted, not-yet-merged records.
        for (RecordWritable r : list) {
            System.out.println(r.toString());
        }

        // Single pass over the sorted buffer. staytime is accumulated across a run of
        // neighbours with equal product_no and lac_id; when the run ends, its last record
        // is emitted with the accumulated total. Removing elements while advancing the
        // index (the previous approach) skipped a comparison after each removal, so runs
        // of three or more records were only partially merged.
        int runStaytime = 0;
        int lastIndex = list.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            RecordWritable current = list.get(i);
            runStaytime += current.staytime;

            boolean runContinues = false;
            if (i < lastIndex) {
                RecordWritable following = list.get(i + 1);
                runContinues = current.product_no.equals(following.product_no)
                        && current.lac_id.equals(following.lac_id);
            }

            if (!runContinues) {
                current.staytime = runStaytime;
                context.write(current, NullWritable.get());
                runStaytime = 0;
            }
        }
    }
}
// Custom serializable record class
/**
 * One input record, usable as a Hadoop key.
 *
 * <p>Natural order: {@code product_no} ascending, then {@code start_time} descending
 * (both compared as strings).
 *
 * <p>{@code hashCode()} is derived from {@code product_no} only, so that hash-based
 * partitioning sends all records of one phone number to the same partition and yields the
 * same value in every JVM. {@code equals()} is deliberately left as identity equality:
 * existing callers may rely on it (for example {@code List.remove(Object)}), and identity
 * equality is still consistent with this {@code hashCode()}.
 */
public static class RecordWritable implements WritableComparable<RecordWritable> {
    // User's phone number.
    String product_no;
    // Base station the user was attached to.
    String lac_id;
    int moment;
    // Time at which the user's stay at this base station started.
    String start_time;
    String user_id;
    String county_id;
    // How long the user stayed at this base station.
    int staytime;
    String city_id;

    /**
     * Orders by phone number ascending; ties are broken by start time descending.
     */
    @Override
    public int compareTo(RecordWritable o) {
        int byPhone = this.product_no.compareTo(o.product_no);
        if (byPhone != 0) {
            return byPhone;
        }
        // Arguments swapped on purpose: later start times sort first.
        return o.start_time.compareTo(this.start_time);
    }

    /**
     * Hash of the phone number only (0 when it is not set).
     */
    @Override
    public int hashCode() {
        return product_no == null ? 0 : product_no.hashCode();
    }

    /** Serializes all fields; the field order must match {@link #readFields}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(product_no);
        out.writeUTF(lac_id);
        out.writeInt(moment);
        out.writeUTF(start_time);
        out.writeUTF(user_id);
        out.writeUTF(county_id);
        out.writeInt(staytime);
        out.writeUTF(city_id);
    }

    /** Restores all fields in the order written by {@link #write}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        product_no = in.readUTF();
        lac_id = in.readUTF();
        moment = in.readInt();
        start_time = in.readUTF();
        user_id = in.readUTF();
        county_id = in.readUTF();
        staytime = in.readInt();
        city_id = in.readUTF();
    }

    /** All fields in declaration order, separated by single spaces. */
    @Override
    public String toString() {
        return this.product_no+" "+this.lac_id+" "+this.moment+" "+this.start_time+" "+user_id+" "+county_id+" "+ staytime+" "+city_id;
    }
}
}
完整代码v2:此版本Map阶段以product_no为key,每行内容为value进行输出。Reduce阶段和上一个版本的Map阶段功能类似。
优点:相比于v1,此版本优化在于每次只处理一个product_no相关的数据,减缓数据量带来的压力。
package demo0902; import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class Demo090204 {
/** HDFS location the job reads its input from. */
final static String INPUT_PATH = "hdfs://10.16.17.182:9000/test/in/0902/";
/** HDFS location the job writes its output to. */
final static String OUT_PATH = "hdfs://10.16.17.182:9000/test/out/0902/02";

/**
 * Builds the job (mapper + reducer), sets the key/value types and the input/output paths,
 * and blocks until the job ends.
 *
 * <p>The boolean returned by {@code waitForCompletion} is not inspected.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration configuration = new Configuration();
    Job job = Job.getInstance(configuration);
    // Locate the job jar through THIS driver class (previously pointed at Demo090203,
    // a copy-paste leftover from the other listing).
    job.setJarByClass(Demo090204.class);

    job.setMapperClass(Demo090201Mapper.class);
    job.setReducerClass(Demo090201Reducer.class);

    // Intermediate (map output) types.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Final (reduce output) types.
    job.setOutputKeyClass(RecordWritable.class);
    job.setOutputValueClass(NullWritable.class);

    FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));

    job.waitForCompletion(true);
}
// Mapper
/**
 * Emits each input line unchanged, keyed by its first tab-separated column.
 */
public static class Demo090201Mapper extends Mapper<LongWritable, Text, Text, Text> {

    /**
     * Writes (first column of the line, copy of the whole line).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] columns = line.split("\t");

        Text firstColumn = new Text(columns[0]);
        Text wholeLine = new Text(value);
        context.write(firstColumn, wholeLine);
    }
}
// Reducer
/**
 * For the lines grouped under one key, parses them into {@link RecordWritable}s, sorts them
 * and emits them with runs of consecutive records at the same base station collapsed into
 * one record.
 *
 * <p>Ordering is the one defined by {@code RecordWritable.compareTo}: phone number ascending,
 * then start time descending. For each run, the LAST record of the run (in that order) is
 * emitted and carries the summed {@code staytime} of the whole run.
 *
 * <p>All records of one key are held in memory while that key is processed.
 */
public static class Demo090201Reducer extends Reducer<Text, Text, RecordWritable, NullWritable> {

    /**
     * @param key     grouping key of the incoming lines
     * @param v2s     tab-separated lines with at least 8 columns; columns 2 and 6 must be
     *                integers, otherwise an {@code ArrayIndexOutOfBoundsException} or
     *                {@code NumberFormatException} propagates
     * @param context sink for the merged records
     */
    @Override
    protected void reduce(Text key, Iterable<Text> v2s, Context context)
            throws IOException, InterruptedException {
        ArrayList<RecordWritable> list = new ArrayList<RecordWritable>();

        for (Text text : v2s) {
            String[] splited = text.toString().split("\t");

            RecordWritable record = new RecordWritable();
            record.product_no = splited[0];
            record.lac_id = splited[1];
            record.moment = Integer.parseInt(splited[2]);
            record.start_time = splited[3];
            record.user_id = splited[4];
            record.county_id = splited[5];
            record.staytime = Integer.parseInt(splited[6]);
            record.city_id = splited[7];

            list.add(record);
        }

        // RecordWritable is Comparable, so its natural order is used. The sort is stable.
        Collections.sort(list);

        // Single pass over the sorted records. staytime is accumulated across a run of
        // neighbours with equal lac_id; when the run ends, its last record is emitted with
        // the accumulated total. Removing elements while advancing the index (the previous
        // approach) skipped a comparison after each removal, so runs of three or more
        // records were only partially merged.
        int runStaytime = 0;
        int lastIndex = list.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            RecordWritable current = list.get(i);
            runStaytime += current.staytime;

            boolean runContinues = i < lastIndex
                    && current.lac_id.equals(list.get(i + 1).lac_id);

            if (!runContinues) {
                current.staytime = runStaytime;
                context.write(current, NullWritable.get());
                runStaytime = 0;
            }
        }
    }
}
// Custom serializable record class
/**
 * One input record with Hadoop serialization and a natural order of
 * {@code product_no} ascending, then {@code start_time} descending (string comparison).
 */
public static class RecordWritable implements WritableComparable<RecordWritable> {
    // User's phone number.
    String product_no;
    // Base station the user was attached to.
    String lac_id;
    int moment;
    // Time at which the user's stay at this base station started.
    String start_time;
    String user_id;
    String county_id;
    // How long the user stayed at this base station.
    int staytime;
    String city_id;

    /** Phone number ascending first; equal phone numbers fall back to start time descending. */
    @Override
    public int compareTo(RecordWritable o) {
        int phoneOrder = this.product_no.compareTo(o.product_no);
        if (phoneOrder != 0) {
            return phoneOrder;
        }
        // Receiver and argument are swapped to get descending order.
        return o.start_time.compareTo(this.start_time);
    }

    /** Serializes every field; the order here must mirror {@link #readFields}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.product_no);
        out.writeUTF(this.lac_id);
        out.writeInt(this.moment);
        out.writeUTF(this.start_time);
        out.writeUTF(this.user_id);
        out.writeUTF(this.county_id);
        out.writeInt(this.staytime);
        out.writeUTF(this.city_id);
    }

    /** Reads every field back in the order used by {@link #write}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.product_no = in.readUTF();
        this.lac_id = in.readUTF();
        this.moment = in.readInt();
        this.start_time = in.readUTF();
        this.user_id = in.readUTF();
        this.county_id = in.readUTF();
        this.staytime = in.readInt();
        this.city_id = in.readUTF();
    }

    /** All fields in declaration order, joined by single spaces. */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(product_no).append(" ");
        line.append(lac_id).append(" ");
        line.append(moment).append(" ");
        line.append(start_time).append(" ");
        line.append(user_id).append(" ");
        line.append(county_id).append(" ");
        line.append(staytime).append(" ");
        line.append(city_id);
        return line.toString();
    }
}
}
MR案例:基站相关01的更多相关文章
- 069 01 Android 零基础入门 01 Java基础语法 09 综合案例-数组移位 01 综合案例-数组移位-案例需求
069 01 Android 零基础入门 01 Java基础语法 09 综合案例-数组移位 01 综合案例-数组移位-案例需求 本文知识点:综合案例-数组移位-案例需求 说明:因为时间紧张,本人写博客 ...
- MR案例:Reduce-Join
问题描述:两种类型输入文件:address(地址)和company(公司)进行一对多的关联查询,得到地址名(例如:Beijing)与公司名(例如:Beijing JD.Beijing Red Star ...
- MR案例:倒排索引
1.map阶段:将单词和URI组成Key值(如“MapReduce :1.txt”),将词频作为value. 利用MR框架自带的Map端排序,将同一文档的相同单词的词频组成列表,传递给Combine过 ...
- MR案例:小文件处理方案
HDFS被设计来存储大文件,而有时候会有大量的小文件生成,造成NameNode资源的浪费,同时也影响MapReduce的处理效率.有哪些方案可以合并这些小文件,或者提高处理小文件的效率呢? 1). 所 ...
- MR案例:倒排索引 && MultipleInputs
本案例采用 MultipleInputs类 实现多路径输入的倒排索引.解读:MR多路径输入 package test0820; import java.io.IOException; import j ...
- GAN︱生成模型学习笔记(运行机制、NLP结合难点、应用案例、相关Paper)
我对GAN"生成对抗网络"(Generative Adversarial Networks)的看法: 前几天在公开课听了新加坡国立大学[机器学习与视觉实验室]负责人冯佳时博士在[硬 ...
- MR案例:CombineFileInputFormat
CombineFileInputFormat是一个抽象类.Hadoop提供了两个实现类CombineTextInputFormat和CombineSequenceFileInputFormat. 此案 ...
- 预测学习、深度生成式模型、DcGAN、应用案例、相关paper
我对GAN"生成对抗网络"(Generative Adversarial Networks)的看法: 前几天在公开课听了新加坡国立大学[机器学习与视觉实验室]负责人冯佳时博士在[硬 ...
- MR案例:输出/输入SequenceFile
SequenceFile文件是Hadoop用来存储二进制形式的key-value对而设计的一种平面文件(Flat File).在SequenceFile文件中,每一个key-value对被看做是一条记 ...
随机推荐
- SharePoint Managed Metadata 使用总结
前言 本文完全原创,转载请说明出处,希望对大家有用. 在SharePoint开发中,通常我们会将数据存储在列表,文档库或者直接存到数据库.但涉及到数据的层级结构时,用列表等存储实现并不是一件简单的事情 ...
- setMasksToBounds
setMasksToBounds 在IB中,当你使用Custom类型的Button时,你可以指定按钮的背景色.但当你运行时按钮就失去了圆角特性,你看到的仅仅是一个方块.因为custombutton没有 ...
- ctf-HITCON-2016-houseoforange学习
目录 堆溢出点 利用步骤 创建第一个house,修改top_chunk的size 创建第二个house,触发sysmalloc中的_int_free 创建第三个house,泄露libc和heap的地址 ...
- python3在centos6.6上的安装
建议:在看这个文档操作前,最好先参考一下这个:https://www.cnblogs.com/bookwed/p/10251236.html,是解决pip安装模块时,提示ssl版本低的问题. #提前的 ...
- 剑指Offer——把数组排成最小的数
题目描述: 输入一个正整数数组,把数组里所有数字拼接起来排成一个数,打印能拼接出的所有数字中最小的一个.例如输入数组{3,32,321},则打印出这三个数字能排成的最小数字为321323. 分析: 排 ...
- 14.Iterate a Cursor in the mongo Shell-官方文档摘录
1 迭代游标 } ); while (myCursor.hasNext()) { print(tojson(myCursor.next())); } } ); myCursor.forEach(pri ...
- 7.Git工作区和暂存区
Git和其他版本控制系统如SVN的一个不同之处就是有暂存区的概念. 先来看名词解释. 1.工作区(Working Directory) 就是你在电脑里能看到的目录,比如我的test文件夹就是一个工作区 ...
- 安装python3 centos
1.在新centos中安装python3的步骤https://www.cnblogs.com/lclq/archive/2016/06/27/5620196.html 2.安装python3过程中报错 ...
- Spark2.x AFTSurvivalRegression算法
Spark2.0的机器学习算法比之前的改变最大的是2.0基本采用了dataframe来实现的,但之前的都是用的RDD,看官网说貌似在3.0的时候RDD就不用了!还有一个就是hiveContext和sq ...
- linux mysql备份
许多备份方案: http://blog.jobbole.com/14012/ 采用的: mysqldump ---user admin ---password=password mydatabase ...