MapReduce原理深入理解(二)

1.MapReduce操作不需要reduce阶段

 1 import org.apache.hadoop.conf.Configuration;

 2 import org.apache.hadoop.fs.FileSystem;

 3 import org.apache.hadoop.fs.Path;

 4 import org.apache.hadoop.io.LongWritable;

 5 import org.apache.hadoop.io.NullWritable;

 6 import org.apache.hadoop.io.Text;

 7 import org.apache.hadoop.mapreduce.Job;

 8 import org.apache.hadoop.mapreduce.Mapper;

 9 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

10 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

11

12 import java.io.IOException;

13

14 public class WordCount03 {

15     public static class MyMapper extends Mapper<LongWritable, Text,Text, NullWritable>{

16         @Override

17         protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

18             String line = value.toString();

19             String s = line.split(",")[3];

20             if(s.equals("男")){

21                 context.write(new Text(s),NullWritable.get());

22             }

23         }

24     }

25     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

26         Job job= Job.getInstance();

27         job.setNumReduceTasks(0);

28         /**

29          * 有些情况下,不需要reduce(聚合程序),

30          * 在不需要聚合操作的时候,可以不需要reduce

31          * 而reduce默认为1,需要手动设置为0,

32          * 如果没有设置为0,会产生默认的reduce,只不过reduce不处理任何数据

33          */

34         job.setJobName("mr03程序");

35         job.setJarByClass(WordCount03.class);

36         job.setMapOutputKeyClass(Text.class);

37         job.setMapOutputValueClass(NullWritable.class);

38         Path in = new Path("/word");

39         FileInputFormat.addInputPath(job,in);

40         Path out = new Path("/output");

41         FileSystem fs = FileSystem.get(new Configuration());

42         if(fs.exists(out)){

43             fs.delete(out);

44         }

45         FileOutputFormat.setOutputPath(job,out);

46         job.waitForCompletion(true);

47     }

48 }

注意：

有些情况下,不需要reduce(聚合程序),

在不需要聚合操作的时候,可以不需要reduce

而reduce任务数默认为1,需要手动设置为0(job.setNumReduceTasks(0)),
如果没有设置为0,会启动默认的reduce(恒等Reducer),数据仍然会经过shuffle和排序后被原样输出,只是白白增加了开销



2.MapReduce中join操作（数据拼接）

  1 import org.apache.hadoop.conf.Configuration;

  2 import org.apache.hadoop.fs.FileSystem;

  3 import org.apache.hadoop.fs.Path;

  4 import org.apache.hadoop.io.LongWritable;

  5 import org.apache.hadoop.io.NullWritable;

  6 import org.apache.hadoop.io.Text;

  7 import org.apache.hadoop.mapreduce.InputSplit;

  8 import org.apache.hadoop.mapreduce.Job;

  9 import org.apache.hadoop.mapreduce.Mapper;

 10 import org.apache.hadoop.mapreduce.Reducer;

 11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 12 import org.apache.hadoop.mapreduce.lib.input.FileSplit;

 13 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 14

 15 import java.io.IOException;

 16 import java.util.ArrayList;

 17

 18 public class WordCount04 {

 19     public static class JoinMapper extends Mapper<LongWritable,Text,Text,Text>{

 20         @Override

 21         protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

 22             //1.获取数据的路径 InputSplit

 23             //context 上面是hdfs 下面如果有reduce就是reduce 没有就是hdfs

 24             InputSplit inputSplit = context.getInputSplit();

 25             FileSplit fs=(FileSplit)inputSplit;

 26             String url = fs.getPath().toString();

 27             //2.判断

 28             if(url.contains("students")){//true当前数据为students.txt

 29                 String id = value.toString().split(",")[0];

 30                 //为了方便reduce数据的操作 针对于不同的数据 打一个标签

 31                 String line = "*" + value.toString();

 32                 context.write(new Text(id),new Text(line));

 33             }else {//false 当前数据为score.txt

 34                 //以学号作为k 也是两张数据的关联条件

 35                 String id = value.toString().split(",")[0];

 36                 //为了方便reduce数据的操作 针对于不同的数据 打一个标签

 37                 String line = "#" + value.toString();

 38                 context.write(new Text(id),new Text(line));

 39             }

 40         }

 41     }

 42     public static class JoinReduce extends Reducer<Text,Text,Text,NullWritable>{

 43         @Override

 44         protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

 45             //数据在循环之外保存

 46             String stuInfo="";

 47             ArrayList<String> scores = new ArrayList<String>();

 48             //提取数据

 49             for (Text value : values) {

 50                 //获取一行一行的数据(所有数据包含students.txt和score.txt)

 51                 String line = value.toString();

 52                 if(line.startsWith("*")){//true 为学生数据

 53                     stuInfo= line.substring(1);

 54                 }else {//false  为学生成绩数据

 55                     scores.add(line.substring(1));

 56                 }

 57             }

 58             /**

 59              * 求的是 两张表的拼接

 60              */

 61             //数据拼接

 62             for (String score : scores) {

 63                 String subject = score.split(",")[1];

 64                 String s = score.split(",")[2];

 65                 String end=stuInfo+","+subject+","+s;

 66                 context.write(new Text(end),NullWritable.get());

 67             }

 68             /**

 69              * 求的是 两张表的拼接 拼接过程中对成绩求和

 70              */

 71 //            long sum=0l;

 72 //            for (String s : scores) {

 73 //                Integer sc =Integer.valueOf( s.split(",")[2]);

 74 //                sum+=sc;

 75 //            }

 76 //            String end=stuInfo+","+sum;

 77 //            context.write(new Text(end),NullWritable.get());

 78         }

 79     }

 80     public static void main(String[] args) throws Exception {

 81         Job job = Job.getInstance();

 82         job.setJobName("Join MapReduce");

 83         job.setJarByClass(WordCount04.class);

 84

 85         job.setMapperClass(JoinMapper.class);

 86         job.setMapOutputKeyClass(Text.class);

 87         job.setMapOutputValueClass(Text.class);

 88

 89         job.setReducerClass(JoinReduce.class);

 90         job.setOutputKeyClass(Text.class);

 91         job.setOutputValueClass(NullWritable.class);

 92         //指定路径

 93         FileInputFormat.addInputPath(job,new Path("/word"));

 94         Path path = new Path("/output");

 95         FileSystem fs = FileSystem.get(new Configuration());

 96         if(fs.exists(path)){

 97             fs.delete(path);

 98         }

 99         FileOutputFormat.setOutputPath(job,new Path("/output"));

100         job.waitForCompletion(true);

101         System.out.println("join 正在执行");

102     }

103 }

MapReduce原理深入理解(二)的更多相关文章

MapReduce原理深入理解(一)
1.MapReduce概念 1)MapReduce是一种分布式计算模型,由Google提出,主要用于搜索领域,解决海量数据的计算问题. 2)MapReduce是分布式运行的,由两个阶段组成:Map和R ...
hadoop自带例子SecondarySort源码分析MapReduce原理
这里分析MapReduce原理并没用WordCount,目前没用过hadoop也没接触过大数据,感觉,只是感觉,在项目中,如果真的用到了MapReduce那待排序的肯定会更加实用. 先贴上源码 pac ...
大数据运算模型 MapReduce 原理
大数据运算模型 MapReduce 原理 2016-01-24 杜亦舒 MapReduce 是一个大数据集合的并行运算模型,由google提出,现在流行的hadoop中也使用了MapReduce作为计 ...
MapReduce原理及其主要实现平台分析
原文:http://www.infotech.ac.cn/article/2012/1003-3513-28-2-60.html MapReduce原理及其主要实现平台分析亢丽芸, 王效岳, 白如江 ...
MapReduce 原理与 Python 实践
MapReduce 原理与 Python 实践 1. MapReduce 原理以下是个人在MongoDB和Redis实际应用中总结的Map-Reduce的理解 Hadoop 的 MapReduce ...
关系型数据库工作原理-事务管理(二)(翻译自Coding-Geek文章)
本文翻译自Coding-Geek文章:< How does a relational database work>. 原文链接:http://coding-geek.com/how-dat ...
大数据 --> MapReduce原理与设计思想
MapReduce原理与设计思想简单解释 MapReduce 算法一个有趣的例子:你想数出一摞牌中有多少张黑桃.直观方式是一张一张检查并且数出有多少张是黑桃? MapReduce方法则是: 给在座 ...
对CAP原理的理解
对CAP原理的理解 CAP原理按照定义,指的是C(Consistency)一致性,A(Availability)可用性,P(Partition tolerance)分区容错性在一个完整的计算机系统中三 ...
【转帖】linux内存管理原理深入理解段式页式
linux内存管理原理深入理解段式页式 https://blog.csdn.net/h674174380/article/details/75453750 其实一直没弄明白 linux 到底是段页式 ...

随机推荐

Golang语言系列-01-Go语言简介和变量
Go语言简介 Go(又称Golang)是Google开发的一种静态强类型.编译型.并发型,并具有垃圾回收功能的编程语言. 罗伯特·格瑞史莫(Robert Griesemer),罗勃·派克(Rob Pi ...
PHPMailer 远程命令执行漏洞 Writeup
漏洞概述 1.漏洞简介 PHPMailer 小于5.2.18的版本存在远程代码执行漏洞.成功利用该漏洞后,攻击者可以远程任意代码执行.许多知名的 CMS 例如 Wordpress 等都是使用这个组件来 ...
jumpserver堡垒机(2.4)部署
jumpserver 2.4.0 部署 jumpserver 官网: https://www.jumpserver.org/ Jumpserver介绍 JumpServer 是全球首款完全开源的堡垒机 ...
Python小白的数学建模课-18.最小生成树问题
最小生成树(MST)是图论中的基本问题,具有广泛的实际应用,在数学建模中也经常出现. 路线设计.道路规划.官网布局.公交路线.网络设计,都可以转化为最小生成树问题,如要求总线路长度最短.材料最少.成本 ...
deepin设置jdk全局变量
sudo vim /etc/bash.bashrc 在文件最后边添加 JAVA_HOME=jdk地址CLASSPATH=.:$JAVA_HOME/bin.tools.jarPATH=$JAVA_HOM ...
题解 Defence
传送门发现最少次数只和最左,最右及中间最长的全0段有关本来想启发式合并,结果发现直接线段树合并搭配一个类似山海经的方法就可以过了 yysy,线段树单次合并的具体复杂度并不是 $O(logn)$ ...
pycharm使用Djiago创建第一个web项目
安装PyCharm专业版(注意社区版创建Djiago需要配置,比较麻烦) 创建Djiago项目点上 1.Inherit glocal site-packages(不然pycharm不去下载Djiago ...
python的GUI框架tkinter，实现程序员的流氓式表白逻辑
导入依赖 '''导入依赖''' import tkinter as tk import tkinter.messagebox as msg 创建并隐藏根窗口 '''创建并隐藏根窗口''' root_w ...
网络视频m3u8解密及ts文件合并
网络视频m3u8解密及ts文件合并参考了两篇博客: https://blog.csdn.net/weixin_41624645/article/details/95939510 https://bl ...
数据库中sql分类
-- sql语句分类:-- 1)数据定义语句(DDL):-- create/alter/drop-- 2)数据操作语句(DML):-- insert ...

MapReduce原理深入理解(二)

MapReduce原理深入理解(二)的更多相关文章

随机推荐

热门专题