MapReduce词频统计

自定义Mapper实现

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * Mapper of the word-count job: splits every input line into words and emits
 * {@code (word, 1)} for each of them.
 *
 * <p>Type parameters of {@link Mapper}:
 * <ul>
 *   <li>KEYIN  ({@link LongWritable}): offset of the start of the line in the input
 *       (the Hadoop counterpart of {@code Long}); not used by this mapper.</li>
 *   <li>VALUEIN ({@link Text}): one line of text (the Hadoop counterpart of {@code String}).</li>
 *   <li>KEYOUT ({@link Text}): a single word.</li>
 *   <li>VALUEOUT ({@link IntWritable}): the count for that occurrence, always 1.</li>
 * </ul>
 *
 * <p>Hadoop uses its own {@code Writable} types instead of {@code Long}/{@code String}/
 * {@code Integer} because they support Hadoop's serialization and deserialization.
 *
 * <p>Example input:
 * <pre>
 * hello world world
 * hello welcome
 * </pre>
 * Output: {@code (hello,1) (world,1) (world,1)} for the first line and
 * {@code (hello,1) (welcome,1)} for the second.
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
    /** Constant count emitted for every word; never mutated, so it can be shared. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Reused output key, so that no new Text object is allocated per word. */
    private final Text outKey = new Text();

    /**
     * Tokenizes one line and writes {@code (word, 1)} to the context for every word.
     * Grouping and summing are left to the shuffle phase and the reducer.
     *
     * @param key     offset of the line in the input (unused)
     * @param value   the line to tokenize
     * @param context sink for the emitted key-value pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Trim first: a leading separator would otherwise yield an empty first token.
        String line = value.toString().trim();
        if (line.isEmpty()) {
            // "".split(...) returns [""], which would be counted as an empty "word".
            return;
        }
        // Split on any run of whitespace so that repeated spaces and tabs
        // do not produce empty tokens.
        for (String word : line.split("\\s+")) {
            outKey.set(word);
            context.write(outKey, ONE);
        }
    }
}

自定义Reducer实现

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;

public class WordCountReducer extends Reducer<Text, IntWritable,Text, IntWritable> {
    /**
     * Sums the counts received for one word and emits {@code (word, total)}.
     *
     * <p>The map output, e.g.
     * <pre>
     * (hello,1) (world,1) (world,1)
     * (hello,1) (welcome,1)
     * </pre>
     * is grouped by key before it reaches this method, so each call sees all values
     * of a single word:
     * <pre>
     * (hello,1) (hello,1) ==> (hello,&lt;1,1&gt;)
     * (world,1) (world,1) ==> (world,&lt;1,1&gt;)
     * (welcome,1)         ==> (welcome,&lt;1&gt;)
     * </pre>
     *
     * @param key     the word
     * @param values  the counts collected for that word
     * @param context sink for the resulting {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Step 1: accumulate every partial count of this word.
        int total = 0;
        for (IntWritable occurrence : values) {
            total += occurrence.get();
        }
        // Step 2: emit the word together with its total.
        IntWritable result = new IntWritable(total);
        context.write(key, result);
    }
}

编写Driver类

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that runs the word-count MapReduce job against files stored on HDFS.
 *
 * <p>Usage: {@code WordCountDriver [inputPath [outputPath]]}. When an argument is
 * omitted the original defaults {@code "input"} and {@code "output"} are used.
 */
public class WordCountDriver {
    /**
     * Configures and submits the job, then exits with 0 on success and -1 on failure.
     *
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]})
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception{
        // Optional command-line overrides; defaults keep the previous behaviour.
        String inputPath = args.length > 0 ? args[0] : "input";
        String outputPath = args.length > 1 ? args[1] : "output";

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS","hdfs://localhost:9000");
        // Access HDFS as user "hadoop" rather than the local OS user.
        System.setProperty("HADOOP_USER_NAME","hadoop");

        // Create the job
        Job job = Job.getInstance(conf);

        // Classes that make up the job
        job.setJarByClass(WordCountDriver.class);    // main class
        job.setMapperClass(WordCountMapper.class);   // mapper
        job.setReducerClass(WordCountReducer.class); // reducer

        // Output types of the mapper and of the reducer
        job.setMapOutputKeyClass(Text.class);          // mapper output key
        job.setMapOutputValueClass(IntWritable.class); // mapper output value
        job.setOutputKeyClass(Text.class);             // reducer output key
        job.setOutputValueClass(IntWritable.class);    // reducer output value

        // Input and output locations.
        // NOTE: Hadoop refuses to run the job when the output directory already exists.
        FileInputFormat.setInputPaths(job,new Path(inputPath));
        FileOutputFormat.setOutputPath(job,new Path(outputPath));

        // Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : -1);
    }
}

本地测试开发

上面的示例都是基于HDFS运行的，那么如何改为读写本地文件系统呢？

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that runs the word-count MapReduce job on local files and writes the
 * result to the local file system as well.
 *
 * <p>Compared with the HDFS driver:
 * <ol>
 *   <li>no HDFS address is configured;</li>
 *   <li>no remote user ("hadoop") is needed;</li>
 *   <li>the input directory only has to exist inside the project
 *       (by default {@code input}, a sibling of {@code src}).</li>
 * </ol>
 *
 * <p>Usage: {@code WordCountLocalDriver [inputPath [outputPath]]}. When an argument
 * is omitted the original defaults {@code "input"} and {@code "output"} are used.
 */
public class WordCountLocalDriver {
    /**
     * Configures and submits the job, then exits with 0 on success and -1 on failure.
     *
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]})
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception{
        // Optional command-line overrides; defaults keep the previous behaviour.
        String inputPath = args.length > 0 ? args[0] : "input";
        String outputPath = args.length > 1 ? args[1] : "output";

        // No fs.defaultFS is set here, so whatever the default configuration provides is used.
        Configuration conf = new Configuration();

        // Create the job
        Job job = Job.getInstance(conf);

        // Classes that make up the job
        job.setJarByClass(WordCountLocalDriver.class); // main class
        job.setMapperClass(WordCountMapper.class);     // mapper
        job.setReducerClass(WordCountReducer.class);   // reducer

        // Output types of the mapper and of the reducer
        job.setMapOutputKeyClass(Text.class);          // mapper output key
        job.setMapOutputValueClass(IntWritable.class); // mapper output value
        job.setOutputKeyClass(Text.class);             // reducer output key
        job.setOutputValueClass(IntWritable.class);    // reducer output value

        // Input and output locations.
        // NOTE: Hadoop refuses to run the job when the output directory already exists.
        FileInputFormat.setInputPaths(job,new Path(inputPath));
        FileOutputFormat.setOutputPath(job,new Path(outputPath));

        // Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : -1);
    }
}

强烈建议

使用本地模式进行测试和开发,非常高效,Debug也很方便。

代码升级

输出目录已存在时作业会报错退出,因此可以在提交作业之前,用代码先删除HDFS上的output目录:

// Remove the output directory left over from a previous run
Path outputPath = new Path("output");
// Connect to HDFS at hdfs://localhost:9000 as user "hadoop"
FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"), conf, "hadoop");
boolean outputExists = fs.exists(outputPath);
if (outputExists) {
    // second argument: delete recursively, including the directory's contents
    fs.delete(outputPath, true);
}

map端聚合Combiner

词频统计中Combiner的处理逻辑和Reducer完全一样,直接复用WordCountReducer即可！

// Set the combiner: WordCountReducer is reused as the combiner class,
// because the combining logic for word count is the same as the reducer's.
job.setCombinerClass(WordCountReducer.class);

使用Combiner优缺点

优点

能减少IO,提升作业的执行性能。
缺点

Combiner只适用于满足交换律和结合律的运算(如求和、求最大值)。涉及除法的运算(如求平均值)要慎用：先对各个Map端的局部数据做除法再合并,得到的结果与对全部数据统一计算的结果不一致。

MapReduce词频统计的更多相关文章

MapReduce实现词频统计
问题描述:现在有n个文本文件,使用MapReduce的方法实现词频统计. 附上统计词频的关键代码,首先是一个通用的MapReduce模块: class MapReduce: __doc__ = ''' ...
Hadoop上的中文分词与词频统计实践（有待学习 http://www.cnblogs.com/jiejue/archive/2012/12/16/2820788.html）
解决问题的方案 Hadoop上的中文分词与词频统计实践首先来推荐相关材料:http://xiaoxia.org/2011/12/18/map-reduce-program-of-rmm-word-c ...
【原创】大数据基础之词频统计Word Count
对文件进行词频统计,是一个大数据领域的hello world级别的应用,来看下实现有多简单: 1 Linux单机处理 egrep -o "\b[[:alpha:]]+\b" test ...
Hive简单编程实践-词频统计
一.使用MapReduce的方式进行词频统计 (1)在HDFS用户目录下创建input文件夹 hdfs dfs -mkdir input 注意:林子雨老师的博客(http://dblab.xmu.ed ...
hive进行词频统计
统计文件信息: $ /opt/cdh-5.3.6/hadoop-2.5.0/bin/hdfs dfs -text /user/hadoop/wordcount/input/wc.input hadoo ...
Hadoop的改进实验（中文分词词频统计及英文词频统计）（4/4）
声明: 1)本文由我bitpeach原创撰写,转载时请注明出处,侵权必究. 2)本小实验工作环境为Windows系统下的百度云(联网),和Ubuntu系统的hadoop1-2-1(自己提前配好).如不 ...
初学Hadoop之中文词频统计
1.安装eclipse 准备 eclipse-dsl-luna-SR2-linux-gtk-x86_64.tar.gz 安装 1.解压文件. 2.创建图标. ln -s /opt/eclipse/ec ...
初学Hadoop之WordCount词频统计
1.WordCount源码将源码文件WordCount.java放到Hadoop2.6.0文件夹中. import java.io.IOException; import java.util.Str ...
Hadoop之词频统计小实验
声明: 1)本文由我原创撰写,转载时请注明出处,侵权必究. 2)本小实验工作环境为Ubuntu操作系统,hadoop1-2-1,jdk1.8.0. 3)统计词频工作在单节点的伪分布上,至于真正实 ...

随机推荐

C++开发python windows版本的扩展模块示例
C++开发python windows版本的扩展模块示例测试环境介绍和准备测试环境: 操作系统:windows10 Python版本:3.7.0 VS版本:vs2015社区版(免费) 相关工具下载 ...
使用PHP操作ElasticSearch
如何搭建ES环境和使用CURL操作可以参考我的另一篇文章:ElasticSearch尝试网上很多关于ES的例子都过时了,版本很久,这篇文章的测试环境是ES6.5 通过composer 安装 comp ...
shutil、zipfile，tarfile
shutil 模块提供了大量的文件的高级操作.特别针对文件拷贝和删除,主要功能为目录和文件操作以及压缩操作. 1. shutil.copyfileobj(fsrc, fdst[, length]) 功 ...
Java技术栈思维导图
Java技术栈思维导图 Java IO流体系设计模式
[SDOI2013]森林主席树+启发式合并
这题的想法真的很妙啊. 看到题的第一眼,我先想到树链剖分,并把$DFS$序当成一段区间上主席树.但是会发现在询问的时候,可能会非常复杂,因为你需要把路径拆成很多条轻链和重链,它们还不一定连续,很难 ...
Laravel底层实现原理系列
Laravel 从学徒到工匠精校版地址:https://laravelacademy.org/laravel-from-appreciate-to-artisan
Java基础--异常处理
1.异常的传统处理方式缺点: [1] 通过判断影响执行效率. [2] 判断逻辑和业务逻辑交织在一起,可维护性很差. public class Test01 { public static void ...
前端安全类——CSRF/XSS
CSRF 概念:跨站请求伪造全称:Cross-site request forgery 攻击原理:网站中某一个接口存在漏洞,用户在注册网站登录过防御措施: 1.Token验证:引诱链接只会自动携带 ...
Python：Mac 下 MQTT 服务器 Mosquitto 的配置
我在Mac电脑上搭建时遇到了一些不同于网上大部分情况的问题,特此分享给可能也有遇到相同情况又找不到解决方法的人. 我的电脑系统:macOS Mojave 10.14.3. paho-mqtt 的安装 ...
sql0001
001. https://blog.csdn.net/qinshi965273101/article/details/81907658 002. https://blog.csdn.net/ ...

MapReduce词频统计

自定义Mapper实现

自定义Reducer实现

编写Driver类

本地测试开发

强烈建议

代码升级

使用Combiner优缺点

MapReduce词频统计的更多相关文章

随机推荐

热门专题