Big data learning: the MapReduce top-N problem
Task: for each order, find the single line item with the largest transaction amount (top 1).
Data (top.txt)
Order_0000001,Pdt_01,222.8
Order_0000001,Pdt_05,25.8
Order_0000002,Pdt_05,325.8
Order_0000002,Pdt_03,522.8
Order_0000002,Pdt_04,122.4
Order_0000003,Pdt_01,222.8
Order_0000003,Pdt_01,322.8
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.cyf</groupId>
  <artifactId>MapReduceCases</artifactId>
  <packaging>jar</packaging>
  <version>1.0</version>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.6.4</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.1.40</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.36</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <appendAssemblyId>false</appendAssemblyId>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>cn.itcast.mapreduce.top.one.TopOne</mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>assembly</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
package cn.itcast.mapreduce.top.one;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * Order information bean, implementing Hadoop's serialization and comparison contract.
 * The itemid field holds the order id; amount holds the transaction amount.
 */
public class OrderBean implements WritableComparable<OrderBean> {

    private Text itemid;
    private DoubleWritable amount;

    public OrderBean() {
    }

    public OrderBean(Text itemid, DoubleWritable amount) {
        set(itemid, amount);
    }

    public void set(Text itemid, DoubleWritable amount) {
        this.itemid = itemid;
        this.amount = amount;
    }

    public Text getItemid() { return itemid; }

    public DoubleWritable getAmount() { return amount; }

    public int compareTo(OrderBean o) {
        int cmp = this.itemid.compareTo(o.getItemid());
        if (cmp == 0) {
            // Same order id: the negation makes amounts sort in descending order.
            cmp = -this.amount.compareTo(o.getAmount());
        }
        return cmp;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(itemid.toString());
        out.writeDouble(amount.get());
    }

    public void readFields(DataInput in) throws IOException {
        String readUTF = in.readUTF();
        double readDouble = in.readDouble();
        this.itemid = new Text(readUTF);
        this.amount = new DoubleWritable(readDouble);
    }

    @Override
    public String toString() {
        return itemid.toString() + "\t" + amount.get();
    }
}
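To make the effect of the negated amount comparison concrete, here is a minimal, hypothetical snippet that is not part of the job (it is assumed to live in the same cn.itcast.mapreduce.top.one package) using two records from the sample data:

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;

// Hypothetical demo class, not part of the MapReduce job.
public class CompareToDemo {
    public static void main(String[] args) {
        OrderBean smaller = new OrderBean(new Text("Order_0000002"), new DoubleWritable(325.8));
        OrderBean larger = new OrderBean(new Text("Order_0000002"), new DoubleWritable(522.8));
        // Same order id, so the negated amount comparison decides:
        // the 522.8 bean sorts ahead of the 325.8 bean (descending by amount).
        System.out.println(smaller.compareTo(larger) > 0); // prints true
    }
}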
package cn.itcast.mapreduce.top.one;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Controls how key-value pairs are grouped on the reduce side during shuffle.
 */
public class ItemidGroupingComparator extends WritableComparator {

    protected ItemidGroupingComparator() {
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean abean = (OrderBean) a;
        OrderBean bbean = (OrderBean) b;
        // Beans with the same item_id are treated as equal, so they are aggregated into one group.
        return abean.getItemid().compareTo(bbean.getItemid());
    }
}
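A quick hypothetical check of what this buys us (again assumed to sit in the cn.itcast.mapreduce.top.one package, since the constructor is protected): two beans for the same order but with different amounts compare as equal here, so the framework feeds them to a single reduce() call.

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;

// Hypothetical demo class, not part of the MapReduce job.
public class GroupingDemo {
    public static void main(String[] args) {
        ItemidGroupingComparator gc = new ItemidGroupingComparator();
        OrderBean a = new OrderBean(new Text("Order_0000001"), new DoubleWritable(222.8));
        OrderBean b = new OrderBean(new Text("Order_0000001"), new DoubleWritable(25.8));
        // The amounts differ, but only the order id is compared, so the result is 0.
        System.out.println(gc.compare(a, b)); // prints 0
    }
}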
package cn.itcast.mapreduce.top.one;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class ItemIdPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
        // Send beans with the same item_id to the same reducer task.
        return (key.getItemid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
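Why the & Integer.MAX_VALUE: hashCode() may be negative, and a negative value modulo numPartitions would be an invalid partition number, so the mask clears the sign bit first. A minimal sketch with a hypothetical partition count of 4:

import org.apache.hadoop.io.Text;

// Hypothetical demo class, not part of the MapReduce job.
public class PartitionDemo {
    public static void main(String[] args) {
        int numPartitions = 4;
        int hash = new Text("Order_0000002").hashCode();             // may be negative
        int partition = (hash & Integer.MAX_VALUE) % numPartitions;  // always in 0..3
        System.out.println(partition);
    }
}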
package cn.itcast.mapreduce.top.one;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Uses the secondary-sort mechanism to output the record with the largest amount for each item (order).
 *
 * @author AllenWoon
 */
public class TopOne {

    static class TopOneMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {

        OrderBean bean = new OrderBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = StringUtils.split(line, ",");
            // fields[0] is the order id, fields[2] is the amount; both go into the key.
            bean.set(new Text(fields[0]), new DoubleWritable(Double.parseDouble(fields[2])));
            context.write(bean, NullWritable.get());
        }
    }

    static class TopOneReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {

        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Within each order group the keys arrive sorted by amount descending,
            // so the current key is already the top-1 record of this order.
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // job.setJarByClass(TopOne.class);
        // Tell the framework where our program's jar is located.
        job.setJar("/root/TopOne.jar");

        job.setMapperClass(TopOneMapper.class);
        job.setReducerClass(TopOneReducer.class);

        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("/top/input"));
        FileOutputFormat.setOutputPath(job, new Path("/top/output1"));
        // FileInputFormat.setInputPaths(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // GroupingComparator used during the shuffle.
        job.setGroupingComparatorClass(ItemidGroupingComparator.class);
        // Partitioner used during the shuffle.
        job.setPartitionerClass(ItemIdPartitioner.class);

        job.setNumReduceTasks(1);
        job.waitForCompletion(true);
    }
}
Create the input directory
hadoop fs -mkdir -p /top/input
Upload the data
hadoop fs -put top.txt /top/input
Run
hadoop jar TopOne.jar cn.itcast.mapreduce.top.one.TopOne
Result
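Based on the sample data, /top/output1/part-r-00000 should contain one line per order, holding the record with the largest amount:

Order_0000001	222.8
Order_0000002	522.8
Order_0000003	322.8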
TopN: the top N records per order
pom.xml: identical to the TopOne pom.xml above, except that the assembly plugin's manifest points at the TopN driver class:

<mainClass>cn.itcast.mapreduce.top.n.TopN</mainClass>
package cn.itcast.mapreduce.top.n;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * Order information bean, implementing Hadoop's serialization mechanism.
 */
public class OrderBean implements WritableComparable<OrderBean> {

    private Text itemid;
    private DoubleWritable amount;

    public OrderBean() {
    }

    public OrderBean(Text itemid, DoubleWritable amount) {
        set(itemid, amount);
    }

    public void set(Text itemid, DoubleWritable amount) {
        this.itemid = itemid;
        this.amount = amount;
    }

    public Text getItemid() { return itemid; }

    public DoubleWritable getAmount() { return amount; }

    public int compareTo(OrderBean o) {
        int cmp = this.itemid.compareTo(o.getItemid());
        if (cmp == 0) {
            // Same order id: the negation makes amounts sort in descending order.
            cmp = -this.amount.compareTo(o.getAmount());
        }
        return cmp;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(itemid.toString());
        out.writeDouble(amount.get());
    }

    public void readFields(DataInput in) throws IOException {
        String readUTF = in.readUTF();
        double readDouble = in.readDouble();
        this.itemid = new Text(readUTF);
        this.amount = new DoubleWritable(readDouble);
    }

    @Override
    public String toString() {
        return itemid.toString() + "\t" + amount.get();
    }

    /*
     * @Override
     * public int hashCode() {
     *     return this.itemid.hashCode();
     * }
     */

    @Override
    public boolean equals(Object obj) {
        // Compares order ids only; MapReduce itself relies on the comparators, not on equals.
        OrderBean bean = (OrderBean) obj;
        return bean.getItemid().equals(this.itemid);
    }
}
package cn.itcast.mapreduce.top.n;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Controls how key-value pairs are grouped on the reduce side during shuffle.
 */
public class ItemidGroupingComparator extends WritableComparator {

    protected ItemidGroupingComparator() {
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean abean = (OrderBean) a;
        OrderBean bbean = (OrderBean) b;
        // Beans with the same item_id are treated as equal, so they are aggregated into one group.
        return abean.getItemid().compareTo(bbean.getItemid());
    }
}
package cn.itcast.mapreduce.top.n;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class ItemIdPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
        // Send beans with the same item_id to the same reducer task.
        return (key.getItemid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
package cn.itcast.mapreduce.top.n;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Uses the secondary-sort mechanism to output the top N records by amount for each item (order).
 */
public class TopN {

    static class TopNMapper extends Mapper<LongWritable, Text, OrderBean, OrderBean> {

        OrderBean v = new OrderBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = StringUtils.split(line, ",");
            v.set(new Text(fields[0]), new DoubleWritable(Double.parseDouble(fields[2])));
            // The bean serves both as key (for sorting/grouping) and as value (for output).
            context.write(v, v);
        }
    }

    static class TopNReducer extends Reducer<OrderBean, OrderBean, NullWritable, OrderBean> {

        int topn = 1;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            topn = Integer.parseInt(conf.get("topn"));
        }

        @Override
        protected void reduce(OrderBean key, Iterable<OrderBean> values, Context context)
                throws IOException, InterruptedException {
            // The counter is local, so every order group gets a fresh quota of topn records.
            int count = 0;
            for (OrderBean bean : values) {
                if (count++ == topn) {
                    return;
                }
                context.write(NullWritable.get(), bean);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Load a user-defined configuration file from the classpath:
        // conf.addResource("userconfig.xml");
        // System.out.println(conf.get("top.n"));

        // Parameters can also be set directly in code and read inside the MapReduce program.
        conf.set("topn", "2");

        Job job = Job.getInstance(conf);

        // job.setJarByClass(TopN.class);
        // Tell the framework where our program's jar is located.
        job.setJar("/root/TopN.jar");

        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);

        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(OrderBean.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(OrderBean.class);

        FileInputFormat.setInputPaths(job, new Path("/top/input"));
        FileOutputFormat.setOutputPath(job, new Path("/top/outputn"));

        // Partitioner used during the shuffle.
        job.setPartitionerClass(ItemIdPartitioner.class);
        // GroupingComparator used during the shuffle.
        job.setGroupingComparatorClass(ItemidGroupingComparator.class);

        job.setNumReduceTasks(1);
        job.waitForCompletion(true);
    }
}
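The driver hard-codes conf.set("topn", "2"). A small variation, offered here only as an assumption and not part of the original code, is to read N from the first program argument so the same jar can produce top 2, top 3, and so on without recompiling:

// Hypothetical: take N from the command line, falling back to 2,
// e.g. hadoop jar TopN.jar cn.itcast.mapreduce.top.n.TopN 3
conf.set("topn", args.length > 0 ? args[0] : "2");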
Package the jar
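Assuming the pom.xml above, mvn package should build the fat jar as target/MapReduceCases-1.0.jar (appendAssemblyId is false, so there is no -jar-with-dependencies suffix); copy or rename it to the path the driver expects, adjusting paths to your environment:

mvn clean package
cp target/MapReduceCases-1.0.jar /root/TopN.jar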
Run
hadoop jar TopN.jar cn.itcast.mapreduce.top.n.TopN
Result (n = 2)
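With the per-group counter in the reducer, the output under /top/outputn should contain the two largest amounts of every order:

Order_0000001	222.8
Order_0000001	25.8
Order_0000002	522.8
Order_0000002	325.8
Order_0000003	322.8
Order_0000003	222.8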