MapReduce编程之wordcount

实践

MapReduce编程之wordcount

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Word-count application built on MapReduce: counts how often each
 * space-separated token occurs in the input and writes one
 * {@code (token, count)} pair per distinct token to the output path.
 *
 * <p>Usage: {@code hadoop jar <jar> WordCountApp <input path> <output path>}
 */
public class WordCountApp {

    /**
     * Map phase: splits every input line on single spaces and emits
     * {@code (token, 1)} for each non-empty token.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Constant count emitted for every token occurrence. */
        private final LongWritable one = new LongWritable(1);

        /** Reused output key; avoids allocating a new Text per token. */
        private final Text word = new Text();

        /**
         * Tokenizes one line of input and emits a count of one per token.
         *
         * @param key     input key supplied by the input format (unused here)
         * @param value   one line of input text
         * @param context sink for the emitted {@code (token, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line on the space delimiter.
            for (String token : value.toString().split(" ")) {
                // Blank lines and leading/repeated spaces produce empty tokens;
                // they are not words and must not be counted.
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, one);
            }
        }
    }

    /**
     * Reduce phase: sums all counts received for a token.
     */
    public static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

        /** Reused output value; avoids allocating a new LongWritable per key. */
        private final LongWritable total = new LongWritable();

        /**
         * Adds up the counts of one token and emits {@code (token, sum)}.
         *
         * @param key     the token
         * @param values  the partial counts collected for that token
         * @param context sink for the emitted {@code (token, sum)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Driver: configures and submits the MapReduce job, then exits with
     * status 0 on success and 1 on failure (2 when arguments are missing).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the output path cannot be cleaned up or the job fails to run
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: WordCountApp <input path> <output path>");
            System.exit(2);
        }

        Configuration configuration = new Configuration();
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        // Remove an existing output directory so the job can be re-run.
        // Resolve the filesystem from the path itself so a fully qualified
        // URI that is not on the default filesystem is handled correctly.
        FileSystem fileSystem = outputPath.getFileSystem(configuration);
        if (fileSystem.exists(outputPath)) {
            if (!fileSystem.delete(outputPath, true)) {
                throw new IOException("Failed to delete existing output path: " + outputPath);
            }
            System.out.println("out file exists,but is has deleted!");
        }

        Job job = Job.getInstance(configuration, "WordCount");
        job.setJarByClass(WordCountApp.class);

        // Input side.
        FileInputFormat.setInputPaths(job, inputPath);

        // Map settings.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reduce settings.
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Output side.
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

运行

hadoop jar hadoop-train-1.0-SNAPSHOT.jar WordCountApp /hdfsapi/test/b.txt /hdfsapi/test/out

MapReduce编程之Combiner

本地reduce（map端reduce）
减少Map Tasks输出的数据量及数据网络传输量
combiner案例开发

使用场景：求和、求次数
代码

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Word-count application built on MapReduce that additionally registers the
 * reducer as a combiner, so counts are pre-aggregated before the reduce phase.
 *
 * <p>Usage: {@code hadoop jar <jar> CombinerApp <input path> <output path>}
 */
public class CombinerApp {

    /**
     * Map phase: splits every input line on single spaces and emits
     * {@code (token, 1)} for each non-empty token.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Constant count emitted for every token occurrence. */
        private final LongWritable one = new LongWritable(1);

        /** Reused output key; avoids allocating a new Text per token. */
        private final Text word = new Text();

        /**
         * Tokenizes one line of input and emits a count of one per token.
         *
         * @param key     input key supplied by the input format (unused here)
         * @param value   one line of input text
         * @param context sink for the emitted {@code (token, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line on the space delimiter.
            for (String token : value.toString().split(" ")) {
                // Blank lines and leading/repeated spaces produce empty tokens;
                // they are not words and must not be counted.
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, one);
            }
        }
    }

    /**
     * Reduce phase (also registered as the combiner): sums all counts received
     * for a token. Summation gives the same result whether it is applied once
     * or in several partial passes, and input and output types are identical,
     * which is why the same class can serve both roles.
     */
    public static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

        /** Reused output value; avoids allocating a new LongWritable per key. */
        private final LongWritable total = new LongWritable();

        /**
         * Adds up the counts of one token and emits {@code (token, sum)}.
         *
         * @param key     the token
         * @param values  the partial counts collected for that token
         * @param context sink for the emitted {@code (token, sum)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Driver: configures and submits the MapReduce job, then exits with
     * status 0 on success and 1 on failure (2 when arguments are missing).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the output path cannot be cleaned up or the job fails to run
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: CombinerApp <input path> <output path>");
            System.exit(2);
        }

        Configuration configuration = new Configuration();
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        // Remove an existing output directory so the job can be re-run.
        // Resolve the filesystem from the path itself so a fully qualified
        // URI that is not on the default filesystem is handled correctly.
        FileSystem fileSystem = outputPath.getFileSystem(configuration);
        if (fileSystem.exists(outputPath)) {
            if (!fileSystem.delete(outputPath, true)) {
                throw new IOException("Failed to delete existing output path: " + outputPath);
            }
            System.out.println("out file exists,but is has deleted!");
        }

        Job job = Job.getInstance(configuration, "WordCount");
        job.setJarByClass(CombinerApp.class);

        // Input side.
        FileInputFormat.setInputPaths(job, inputPath);

        // Map settings.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reduce settings.
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Register the combiner; its logic is exactly the reducer's.
        job.setCombinerClass(MyReduce.class);

        // Output side.
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

执行命令

hadoop jar hadoop-train-1.0-SNAPSHOT.jar CombinerApp /hdfsapi/test/b.txt /hdfsapi/test/out

MapReduce编程之Partitioner

partitioner决定MapTask输出的数据交由哪个ReduceTask处理
默认实现：分发的key的hash值对ReduceTask个数取模
partitioner案例开发

代码

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * MapReduce application demonstrating a custom partitioner: each input line
 * holds a name and a number separated by a single space; the numbers are
 * summed per name, and a custom partitioner decides which reduce task
 * receives each name.
 *
 * <p>Usage: {@code hadoop jar <jar> PartitionerApp <input path> <output path>}
 */
public class PartitionerApp {

    /**
     * Map phase: parses {@code "<name> <number>"} lines and emits
     * {@code (name, number)}.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Counter group used to report lines that could not be parsed. */
        private static final String COUNTER_GROUP = "PartitionerApp";

        /** Counter name for lines that could not be parsed. */
        private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

        /** Reused output key; avoids allocating a new Text per record. */
        private final Text name = new Text();

        /** Reused output value; avoids allocating a new LongWritable per record. */
        private final LongWritable amount = new LongWritable();

        /**
         * Parses one line and emits its name with its numeric value. Lines
         * that have fewer than two fields or a non-numeric second field are
         * skipped and counted in a job counter instead of failing the task.
         *
         * @param key     input key supplied by the input format (unused here)
         * @param value   one line of input text
         * @param context sink for the emitted {@code (name, number)} pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line on the space delimiter.
            String[] fields = value.toString().split(" ");
            if (fields.length < 2) {
                // Blank or truncated line: nothing to emit.
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            long parsed;
            try {
                parsed = Long.parseLong(fields[1]);
            } catch (NumberFormatException ignored) {
                // Second field is not a number; skip the record but keep the
                // job running and make the problem visible via the counter.
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            name.set(fields[0]);
            amount.set(parsed);
            context.write(name, amount);
        }
    }

    /**
     * Reduce phase: sums all numbers received for a name.
     */
    public static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

        /** Reused output value; avoids allocating a new LongWritable per key. */
        private final LongWritable total = new LongWritable();

        /**
         * Adds up the values of one name and emits {@code (name, sum)}.
         *
         * @param key     the name
         * @param values  the numbers collected for that name
         * @param context sink for the emitted {@code (name, sum)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Routes "xiaomi", "huawei" and "iphone" to buckets 0, 1 and 2 and every
     * other key to bucket 3.
     */
    public static class MyPartitioner extends Partitioner<Text, LongWritable> {

        /**
         * Chooses the partition for a key.
         *
         * @param key           the name emitted by the mapper
         * @param value         the number emitted by the mapper (not used for routing)
         * @param numPartitions total number of partitions of the job
         * @return the bucket index, folded into {@code [0, numPartitions)}
         */
        @Override
        public int getPartition(Text key, LongWritable value, int numPartitions) {
            String brand = key.toString();
            int bucket;
            if ("xiaomi".equals(brand)) {
                bucket = 0;
            } else if ("huawei".equals(brand)) {
                bucket = 1;
            } else if ("iphone".equals(brand)) {
                bucket = 2;
            } else {
                bucket = 3;
            }
            // Fold into the valid range so the job still works when it is
            // configured with fewer than four partitions; with four or more
            // the bucket is returned unchanged.
            return bucket % numPartitions;
        }
    }

    /**
     * Driver: configures and submits the MapReduce job, then exits with
     * status 0 on success and 1 on failure (2 when arguments are missing).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the output path cannot be cleaned up or the job fails to run
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: PartitionerApp <input path> <output path>");
            System.exit(2);
        }

        Configuration configuration = new Configuration();
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        // Remove an existing output directory so the job can be re-run.
        // Resolve the filesystem from the path itself so a fully qualified
        // URI that is not on the default filesystem is handled correctly.
        FileSystem fileSystem = outputPath.getFileSystem(configuration);
        if (fileSystem.exists(outputPath)) {
            if (!fileSystem.delete(outputPath, true)) {
                throw new IOException("Failed to delete existing output path: " + outputPath);
            }
            System.out.println("out file exists,but is has deleted!");
        }

        Job job = Job.getInstance(configuration, "WordCount");
        job.setJarByClass(PartitionerApp.class);

        // Input side.
        FileInputFormat.setInputPaths(job, inputPath);

        // Map settings.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reduce settings.
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Register the custom partitioner.
        job.setPartitionerClass(MyPartitioner.class);

        // Four reduce tasks, one per bucket produced by the partitioner.
        job.setNumReduceTasks(4);

        // Output side.
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

执行命令

hadoop jar hadoop-train-1.0-SNAPSHOT.jar PartitionerApp /hdfsapi/test/partitioner /hdfsapi/test/outpartitioner

MapReduce编程之wordcount的更多相关文章

mapReduce编程之auto complete
1 n-gram模型与auto complete n-gram模型是假设文本中一个词出现的概率只与它前面的N-1个词相关.auto complete的原理就是,根据用户输入的词,将后续出现概率较大的词 ...
mapReduce编程之Recommender System
1 协同过滤算法协同过滤算法是现在推荐系统的一种常用算法.分为user-CF和item-CF. 本文的电影推荐系统使用的是item-CF,主要是由于用户数远远大于电影数,构建矩阵的代价更小:另外,电 ...
mapReduce编程之google pageRank
1 pagerank算法介绍 1.1 pagerank的假设数量假设:每个网页都会给它的链接网页投票,假设这个网页有n个链接,则该网页给每个链接平分投1/n票. 质量假设:一个网页的pagerank ...
MapReduce编程之Reduce Join多种应用场景与使用
在关系型数据库中 Join 是非常常见的操作,各种优化手段已经到了极致.在海量数据的环境下,不可避免的也会碰到这种类型的需求, 例如在数据分析时需要连接从不同的数据源中获取到数据.不同于传统的单机模式 ...
MapReduce编程之Semi Join多种应用场景与使用
Map Join 实现方式一 ● 使用场景:一个大表(整张表内存放不下,但表中的key内存放得下),一个超大表 ● 实现方式:分布式缓存 ● 用法: SemiJoin就是所谓的半连接,其实仔细一看就是 ...
MapReduce编程之Map Join多种应用场景与使用
Map Join 实现方式一:分布式缓存 ● 使用场景:一张表十分小.一张表很大. ● 用法: 在提交作业的时候先将小表文件放到该作业的DistributedCache中,然后从DistributeC ...
Hadoop基础-Map端链式编程之MapReduce统计TopN示例
Hadoop基础-Map端链式编程之MapReduce统计TopN示例作者:尹正杰版权声明:原创作品,谢绝转载!否则将追究法律责任. 一.项目需求对“temp.txt”中的数据进行分析,统计出各 ...
网络编程之socket
网络编程之socket socket:在网络编程中的一个基本组件,也称套接字. 一个套接字就是socket模块中的socket类的一个实例. 套接字包括两个: 服务器套接字和客户机套接字套接字的实例 ...
C++混合编程之idlcpp教程Python篇(9)
上一篇在这 C++混合编程之idlcpp教程Python篇(8) 第一篇在这 C++混合编程之idlcpp教程(一) 与前面的工程相比,工程PythonTutorial7中除了四个文件PythonTu ...

随机推荐

Porsche PIWIS III with V37.250.020 Piwis 3 Software Update New Feature
Porsche Piwis tester 3 PT3G VCI with V37.250.020 Piwis 3 Software unlimited license installed on Ful ...
UI设计教程分享：字体变形—阴阳收缩法
阴阳师中国古代对自然规律发展变化基础因素的描述,是古代美学逻辑思维.推理分析的核心要素,也是描述万物基本要素和成因的概念之一.阴阳代表事物的对立关系,它是自然界的客观规律,是万物运动变化的本源,是人类 ...
室内设计类网站Web原型制作分享——Dinzd
Dinzd是一家德国室内设计网站,网站内涵盖全球设计精品资讯以及优秀案列.网站布局简单直观,内容丰富. 此原型模板所用到的交互动作有结合弹出面板做下拉菜单效果,鼠标按下文字按钮跳转页面,按钮hover ...
算法题思路总结和leecode继续历程
2018-05-03 刷了牛客网的题目:总结思路(总的思路跟数学一样就是化简和转化) 具体启发点: 1.对数据进行预处理排序的思想:比如8皇后问题 2.对一个数组元素进行比较的操作,如果复杂,可以试试 ...
boost的accumulator rolling_mean的使用
Boost.Accumulators is both a library for incremental statistical computation as well as an extensibl ...
java的nio例子
package main; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.Inet ...
Emacs, Nano, or Vim 编辑器“三剑客”
# Vim 强大,多种模式相互切换,不同于传统“录入式“写东西对后续两个编辑器不熟悉 # nano 退出ctrl + x http://man.linuxde.net/nano # emacs ht ...
.net调用word转换pdf出现80080005错误的解决办法
检索 COM 类工厂中 CLSID 为 {000209FF-0000-0000-C000-000000000046} 的组件时失败,原因是出现以下错误: 80080005. 1:在服务器上安装offi ...
[C#.Net]判断文件是否被占用的两种方法
今天开发产线测试Tool时发现日志文件会几率性的被占用,上网浏览找到最简单的代码(API或者FileStream),在这里抛砖引玉下. 第一种方法:API using System.IO; using ...
transform.forward和vector3.forward
Vector3.forward的值永远是(0,0,1)(这里的(0,0,1)是世界坐标的(0,0,1)),而transform.forward我们可以理解为其对应物体的z轴方向,是一个向量,而不是一个 ...

MapReduce编程之wordcount

实践

MapReduce编程之wordcount

运行

MapReduce编程之Combiner

MapReduce编程之Partitioner

MapReduce编程之wordcount的更多相关文章

随机推荐

热门专题