mapreduce的使用

以下案例写之前需要导入jar包依赖：

  <dependencies>

      <dependency>

              <groupId>org.apache.hadoop</groupId>

              <artifactId>hadoop-client</artifactId>

              <version>2.6.0</version>

      </dependency>

  </dependencies>

1.单词计数案例：

package com.xyz;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**

 * @author 小勇子start

 * @create 2021-10-11 16:35

 */

public class WordCount {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        Configuration conf=new Configuration();

        Job job= Job.getInstance(conf);

		//这里是对jar包的类，map的类和reduce的类三者的映射

        job.setJarByClass(WordCount.class);

        job.setMapperClass(Map.class);

        job.setReducerClass(Reduce.class);

		//这里是对map输出的关键字和值对应类型的映射

        job.setMapOutputKeyClass(Text.class);

        job.setMapOutputValueClass(IntWritable.class);

		//这里是对reduce输出的关键字和值对应类型的映射

        job.setOutputKeyClass(Text.class);

        job.setOutputKeyClass(IntWritable.class);

        //数据输入路径(此处为本地测试)

        FileInputFormat.setInputPaths(job,new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\mapreduceTest\\src\\data\\test.txt"));

        //数据输出路径(此处为本地测试)

        FileOutputFormat.setOutputPath(job,new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\mapreduceTest\\src\\data\\out1"));

        boolean flag=job.waitForCompletion(true);

        System.exit(flag?0:1);

    }

    static class Map extends Mapper<LongWritable,Text, Text, IntWritable>{

        /*Text的包有很多，注意导的是：org.apache.hadoop.io.Text

        在map中前两个数据类型基本固定

        后两个代表着要输出的类型，如果有reduce，则以该类型发送给reduce

        */没有reduce则直接以该类型输出到目标

        @Override

        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String[] str=value.toString().split(",");

            for (String s:str) {

                context.write(new Text(s),new IntWritable(1));

            }

      }

    }

    static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable>{

        /*

        <Text,IntWritable,Text,IntWritable>

        前两个是map发过来关键字和值的类型

        后两个是reduce输出关键字和值的类型

        */

        @Override

        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

           int len=0;

            for (Object s:values) {

                len++;

            }

            context.write(key,new IntWritable(len));

        }

    }

}

2.数据清洗案例：

package com.xyz2;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Counter;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**

 * @author 小勇子start

 * @create 2021-10-11 17:23

 */

/**
 * De-duplication job: writes every distinct input line once and reports,
 * through a job counter, how many duplicate lines were dropped.
 */
public class RepeatProcessing {

    /**
     * Configures and runs the job, prints the duplicate-line counter and
     * exits with status 0 on success or 1 on failure.
     *
     * @param args unused; the input and output paths are hard-coded for local testing
     * @throws IOException            if the job cannot be set up or submitted
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(RepeatProcessing.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // Mapper output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Reducer output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Local test paths.
        FileInputFormat.setInputPaths(job, new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\mapreduceTest\\src\\data\\test2.txt"));
        FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\mapreduceTest\\src\\data\\out2"));

        boolean flag = job.waitForCompletion(true);

        long repeat = job.getCounters().findCounter(Count.repeatCount).getValue();
        System.out.println("重复的行数为：" + repeat);

        System.exit(flag ? 0 : 1);
    }

    /** Job counters maintained by this job. */
    enum Count {
        /** Number of input lines that duplicated an earlier line. */
        repeatCount
    }

    /** Emits each input line as a key so that identical lines reach the same reduce call. */
    static class Map extends Mapper<LongWritable, Text, Text, NullWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    /** Writes every distinct line once and counts the extra occurrences. */
    static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            long occurrences = 0;
            for (NullWritable ignored : values) {
                occurrences++;
            }

            // Every occurrence beyond the first is a duplicate. increment() replaces
            // the earlier getValue()/setValue() read-modify-write sequence.
            context.getCounter(Count.repeatCount).increment(occurrences - 1);

            context.write(key, NullWritable.get());
        }
    }
}

3.topN案例：

package com.xyz;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

import java.util.*;

/**

 * @author 小勇子start

 * @create 2021-09-30 19:53

 */

/**
 * Counts, for every film type, how many distinct films carry that type and
 * writes the types ordered by that count, largest first.
 */
public class Task2_4 {

    /**
     * Emits {@code (type, filmName)} for every type listed in a record.
     * Records are ';'-separated; field 0 is read as the film name and
     * field 6 as the type list.
     */
    static class TaskMap extends Mapper<LongWritable, Text, Text, Text> {

        /** Index of the film-type field within a ';'-separated record. */
        private static final int TYPE_FIELD = 6;

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(";");
            if (fields.length <= TYPE_FIELD) {
                // Record is too short to contain a type field; skip it
                // instead of failing the task with an index error.
                return;
            }

            Text filmName = new Text(fields[0]);
            // Types may be separated by '/', '、' or '，'.
            for (String rawType : fields[TYPE_FIELD].split("/|、|，")) {
                String type = rawType.trim();
                if (type.length() > 2) {
                    // Longer entries are cut into consecutive two-character
                    // types. The bound is inclusive so that the final pair of
                    // an even-length entry is emitted too; a trailing single
                    // character is dropped.
                    for (int end = 2; end <= type.length(); end += 2) {
                        context.write(new Text(type.substring(end - 2, end)), filmName);
                    }
                } else {
                    context.write(new Text(type), filmName);
                }
            }
        }
    }

    /**
     * Collects {@code (type, distinct film count)} for every key and writes
     * all of them, sorted by count in descending order, when the task ends.
     */
    static class TaskReduce extends Reducer<Text, Text, Text, IntWritable> {

        /** One entry per reduced key: film type mapped to its distinct film count. */
        private final List<Map.Entry<String, Integer>> typeCounts = new ArrayList<Map.Entry<String, Integer>>();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // The same film name can arrive several times for one type, so only
            // distinct names are counted. Each value is copied to a String
            // because Hadoop reuses the Text instance handed out by the iterator.
            Set<String> distinctFilms = new HashSet<String>();
            for (Text film : values) {
                distinctFilms.add(film.toString());
            }
            typeCounts.add(new AbstractMap.SimpleImmutableEntry<String, Integer>(key.toString(), distinctFilms.size()));
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Largest count first; Integer.compare avoids the overflow risk of subtraction.
            typeCounts.sort((left, right) -> Integer.compare(right.getValue(), left.getValue()));

            // To keep only the top N types, stop this loop after N records.
            for (Map.Entry<String, Integer> entry : typeCounts) {
                context.write(new Text(entry.getKey()), new IntWritable(entry.getValue()));
            }
        }
    }

    /**
     * Configures and runs the job, then exits with status 0 on success or 1 on failure.
     *
     * @param args unused; the input and output paths are hard-coded for local testing
     * @throws Exception if the job cannot be set up, submitted or awaited
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(Task2_4.class);
        job.setMapperClass(TaskMap.class);
        job.setReducerClass(TaskReduce.class);

        // TaskReduce sorts everything it has seen in cleanup(), so the ordering
        // is only global when a single reduce task handles all keys.
        job.setNumReduceTasks(1);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\作业\\7月25日晚上任务\\数据\\2-1"));
        FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\作业\\7月25日晚上任务\\数据\\2-42"));

        boolean flag = job.waitForCompletion(true);
        System.exit(flag ? 0 : 1);
    }
}

4.join案例（自定义Writable）

package com.xyz;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**

 * @author 小勇子start

 * @create 2021-10-12 10:32

 */

/**
 * Reduce-side join: lines from the customer file and from the order file
 * are grouped by the integer id in their first comma-separated field and
 * written as {@code customerLine,orderLine}.
 */
public class JoinDemo {

    /** Flag value for records read from a file whose name contains "customers". */
    private static final String FLAG_CUSTOMER = "kh";

    /** Flag value for records read from any other file. */
    private static final String FLAG_ORDER = "order";

    /** Map output value: one raw input line plus a flag naming its source. */
    static class MyDataWritable implements Writable {

        // Both fields default to "" because DataOutput.writeUTF rejects null.
        private String flag = "";   // source marker
        private String data = "";   // one input line

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeUTF(flag);
            dataOutput.writeUTF(data);
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            // Fields are read in the order write() produced them.
            this.flag = dataInput.readUTF();
            this.data = dataInput.readUTF();
        }

        public String getFlag() {
            return flag;
        }

        public void setFlag(String flag) {
            this.flag = flag;
        }

        public String getData() {
            return data;
        }

        public void setData(String data) {
            this.data = data;
        }
    }

    /**
     * Tags every line with its source file and emits it under the id found
     * in the line's first comma-separated field.
     */
    static class Map extends Mapper<LongWritable, Text, IntWritable, MyDataWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split(",");

            int id;
            try {
                // fields is empty when the line consists of separators only.
                id = Integer.parseInt(fields.length == 0 ? "" : fields[0].trim());
            } catch (NumberFormatException e) {
                // Blank lines, header rows and other lines without a numeric id
                // are skipped and counted rather than failing the whole task.
                context.getCounter("JoinDemo", "MALFORMED_LINES").increment(1);
                return;
            }

            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();

            MyDataWritable myData = new MyDataWritable();
            myData.setData(line);
            myData.setFlag(fileName.contains("customers") ? FLAG_CUSTOMER : FLAG_ORDER);

            context.write(new IntWritable(id), myData);
        }
    }

    /**
     * Joins the customer line of an id with each of its order lines.
     * An id without orders still produces one row, with an empty order part.
     */
    static class Reduce extends Reducer<IntWritable, MyDataWritable, Text, NullWritable> {

        @Override
        protected void reduce(IntWritable key, Iterable<MyDataWritable> values, Context context) throws IOException, InterruptedException {
            String customerLine = "";
            // Orders are buffered because the customer record may arrive after them.
            List<String> orderLines = new ArrayList<String>();

            for (MyDataWritable record : values) {
                if (FLAG_CUSTOMER.equals(record.getFlag())) {
                    customerLine = record.getData();
                } else {
                    orderLines.add(record.getData());
                }
            }

            if (orderLines.isEmpty()) {
                context.write(new Text(customerLine + ","), NullWritable.get());
                return;
            }

            // One output row per order, so a customer with several orders keeps all of them.
            for (String orderLine : orderLines) {
                context.write(new Text(customerLine + "," + orderLine), NullWritable.get());
            }
        }
    }

    /**
     * Configures and runs the join job, then exits with status 0 on success or 1 on failure.
     *
     * @param args unused; the input and output paths are hard-coded for local testing
     * @throws IOException            if the job cannot be set up or submitted
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(JoinDemo.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // Mapper output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(MyDataWritable.class);

        // Reducer output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Local test paths.
        FileInputFormat.setInputPaths(job, new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\joinDemo\\DemoOne\\join\\"));
        FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\小勇子\\Desktop\\大数据培训\\练习\\joinDemo\\DemoOne\\join\\out1"));

        boolean flag = job.waitForCompletion(true);
        System.exit(flag ? 0 : 1);
    }
}

5.时间日期格式化

import java.text.ParseException;

import java.text.SimpleDateFormat;

import java.util.Date;

/**

 * @author 小勇子start

 * @create 2021-10-11 17:53

 */

/**
 * Small demo of re-formatting a date string from {@code yyyy.MM.dd} to
 * {@code yyyy-MM-dd} with {@link SimpleDateFormat}.
 */
public class DateTimeFormat {

    /**
     * Parses a dotted date, renders it with dashes, parses the dashed text
     * back and prints all three values, one per line.
     *
     * @param args unused
     * @throws ParseException if either text does not match its pattern
     */
    public static void main(String[] args) throws ParseException {
        final SimpleDateFormat dottedPattern = new SimpleDateFormat("yyyy.MM.dd");
        final SimpleDateFormat dashedPattern = new SimpleDateFormat("yyyy-MM-dd");

        final Date fromDotted = dottedPattern.parse("2021.09.05");
        final String dashedText = render(fromDotted, dashedPattern);
        final Date fromDashed = dashedPattern.parse(dashedText);

        // Sample output of the first and last line: Sun Sep 05 00:00:00 CST 2021
        // (the zone abbreviation depends on the JVM's default time zone).
        System.out.println(fromDotted);
        // Prints: 2021-09-05
        System.out.println(dashedText);
        System.out.println(fromDashed);
    }

    /** Formats {@code date} with the given pattern. */
    private static String render(Date date, SimpleDateFormat pattern) {
        return pattern.format(date);
    }
}

6.mapreduce项目打jar包

右键项目 Open Module Settings
找到左侧 Artifacts，点击中间的 + 号，选择 JAR，然后选择 “From modules with dependencies…”
选择要执行的Main方法
点击 IDEA 顶部的 Build 菜单，选择 Build Artifacts…，然后点击 Build 即可（如果改了代码，直接点击 Rebuild）
打包后将jar包上传至linux中

7.运行jar包

在linux中创建一个文本文件，然后上传至hdfs中
开始运行程序： hadoop jar xxxx.jar com.xyzy.test1.MyWordDriver /xxx.txt /out1

如果程序中的输入、输出路径是写死的（即没有通过 args[0]、args[1] 这样的方式读取），那么命令末尾的 /xxx.txt 和 /out1 就不需要了。请在上传后 jar 包所在的目录下执行上述命令；否则需要在 jar 包名前写上它的绝对路径。

mapreduce的使用的更多相关文章

Mapreduce的文件和hbase共同输入
Mapreduce的文件和hbase共同输入 package duogemap; import java.io.IOException; import org.apache.hadoop.co ...
mapreduce多文件输出的两方法
mapreduce多文件输出的两方法 package duogemap; import java.io.IOException; import org.apache.hadoop.conf ...
mapreduce中一个map多个输入路径
package duogemap; import java.io.IOException; import java.util.ArrayList; import java.util.List; imp ...
Hadoop 中利用 mapreduce 读写 mysql 数据
Hadoop 中利用 mapreduce 读写 mysql 数据有时候我们在项目中会遇到输入结果集很大,但是输出结果很小,比如一些 pv.uv 数据,然后为了实时查询的需求,或者一些 OLAP ...
[Hadoop in Action] 第5章高阶MapReduce
链接多个MapReduce作业执行多个数据集的联结生成Bloom filter 1.链接MapReduce作业 [顺序链接MapReduce作业] mapreduce-1 | mapr ...
MapReduce
2016-12-21 16:53:49 mapred-default.xml mapreduce.input.fileinputformat.split.minsize 0 The minimum ...
使用mapreduce计算环比的实例
最近做了一个小的mapreduce程序,主要目的是计算环比值最高的前5名,本来打算使用spark计算,可是本人目前spark还只是简单看了下,因此就先改用mapreduce计算了,今天和大家分享下这个 ...
MapReduce剖析笔记之八: Map输出数据的处理类MapOutputBuffer分析
在上一节我们分析了Child子进程启动,处理Map.Reduce任务的主要过程,但对于一些细节没有分析,这一节主要对MapOutputBuffer这个关键类进行分析. MapOutputBuffer顾 ...
MapReduce剖析笔记之七：Child子进程处理Map和Reduce任务的主要流程
在上一节我们分析了TaskTracker如何对JobTracker分配过来的任务进行初始化,并创建各类JVM启动所需的信息,最终创建JVM的整个过程,本节我们继续来看,JVM启动后,执行的是Child ...
MapReduce剖析笔记之六：TaskTracker初始化任务并启动JVM过程
在上面一节我们分析了JobTracker调用JobQueueTaskScheduler进行任务分配,JobQueueTaskScheduler又调用JobInProgress按照一定顺序查找任务的流程 ...

随机推荐

i春秋wanna to see your hat?
打开题目网页发现是个选择帽子的网页,点击超链接进入一个网页让我们输入我们的name然后匹配帽子颜色(其实不管怎么填都是绿色的)这里也有个注册窗口先查看源码没什么特别发现,再试试抓包吧在这个界面抓包 ...
SQL语句查询关键字前期数据准备
前期数据准备 create table emp( id int primary key auto_increment, name varchar(20) not null, gender enum(' ...
Linux常用软件的安装及Nginx的使用
主要内容: 软件安装方式上传与下载工具常用软件的安装--jdk.tomcat.mysql.redis 项目的部署 Nginx的安装 Nginx的功能静态网站部署虚拟主机配置及端口绑定域名绑定 ...
USB口3A限流保护芯片。带短路保护
一般说明 PW1503是超低RDS(ON)开关,具有可编程的电流限制,以保护电源源于过电流和短路情况.它具有超温保护以及反向闭锁功能. PW1503采用薄型(1毫米)5针薄型SOT封装,提供可调版本. ...
Pointers and Constants
Pointers and Constants char * const q = "abc"; // q is const *q = 'c'; // OK q++; //ERROR ...
【Redis场景1】用户登录注册
细节回顾: 关于cookie和session不熟悉的朋友: 建议阅读该博客:https://www.cnblogs.com/ityouknow/p/10856177.html 执行流程: 在单体模式下 ...
体验 Gitea Actions
即将推出的 Gitea Actions 致力于打造一个 CI/CD 工具的标准协议,第三方 CI 系统可以基于actions 协议与 Gitea 平台集成,提供一站式管理方案.Gitea Action ...
STL map容器常用API
map容器:键值和实值是分开的,排序规则按照键值排序 #define _CRT_SECURE_NO_WARNINGS #include<iostream> #include<map& ...
[R语言] R语言PCA分析教程 Principal Component Methods in R
R语言PCA分析教程 Principal Component Methods in R(代码下载) 主成分分析Principal Component Methods(PCA)允许我们总结和可视化包含由 ...
FPGA：乒乓球比赛模拟机的设计
简介开发板:EGO1 开发环境:Windows10 + Xilinx Vivado 2020 数字逻辑大作业题目 7: 乒乓球比赛模拟机的设计乒乓球比赛模拟机用发光二极管(LED)模拟乒乓球运动轨 ...

mapreduce的使用

mapreduce的使用

1.单词计数案例：

2.数据清洗案例：

3.topN案例：

4.join案例（自定义Writable）

5.时间日期格式化

6.mapreduce项目打jar包

7.运行jar包

mapreduce的使用的更多相关文章

随机推荐

热门专题