WordCount基于本地和java的使用

可以直接使用hadoop自带的wordcount示例jar包来运行

JAVA实现WordCount

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    /**
     * MapReduce job that counts how many times each distinct input line occurs.
     *
     * <p>Input is read from the HDFS path {@code /wordcount}; (line, count) pairs are written
     * to {@code /output1}. An existing output directory is removed before the job starts.
     *
     * <p>Submitting the job (from the original notes):
     * <ol>
     *   <li>Build the project with Maven {@code package} and upload the jar to the server.</li>
     *   <li>Run it, e.g.
     *       {@code hadoop jar hadoop-mapreduce-examples-2.7.6.jar com.shujia.hadoop.Demo01WordCount /word /output}</li>
     * </ol>
     */
    public class Demo1 {

        /**
         * Map stage.
         *
         * <p>The first pair of type parameters fixes the input key/value format (offset, line
         * text); the second pair fixes the output key/value format.
         */
        public static class map extends Mapper<LongWritable,Text,Text,LongWritable>{

            /**
             * Emits {@code (line, 1)} for every input line.
             *
             * @param key     offset of the line in the input; unused
             * @param value   the text of one input line
             * @param context used to emit the output pair
             */
            @Override
            protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                // Input is consumed line by line; only the line content matters here.
                String line=value.toString();
                // Wrap the plain Java values in Writable types before emitting.
                context.write(new Text(line),new LongWritable(1));
            }
        }

        /**
         * Reduce stage: receives the data emitted by the map stage.
         */
        public static class reduce extends Reducer<Text,LongWritable,Text,LongWritable>{

            /**
             * Aggregation step, called once for each key.
             *
             * @param key     the key emitted by the mapper
             * @param values  all map-side values that belong to {@code key}
             * @param context used to emit {@code (key, sum)}
             */
            @Override
            protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
                long sum=0;
                // After grouping, the values look like (key, 1, 1, 1, ...) and must be iterated.
                for (LongWritable value : values) {
                    // get() unwraps the LongWritable into a primitive long.
                    sum+=value.get();
                }
                // Wrap the primitive total back into a LongWritable for output.
                context.write(key,new LongWritable(sum));
            }
        }

        /**
         * Entry point: configures the job, submits it and waits for completion.
         *
         * <p>The success message is printed only if the job actually succeeded; on failure an
         * error is written to stderr and the JVM exits with status 1 so that calling scripts
         * can detect it.
         *
         * @param args ignored; input and output paths are hard-coded
         */
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            // Create the job and give it a display name.
            Job job = Job.getInstance();
            job.setJobName("第一次通过自己的jar包连接");

            // The jar containing this class is the one shipped to the cluster.
            job.setJarByClass(Demo1.class);

            // Mapper and the key/value types it emits.
            job.setMapperClass(map.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);

            // Reducer and the key/value types of the final output.
            job.setReducerClass(reduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            // Input path (HDFS).
            Path in = new Path("/wordcount");
            FileInputFormat.addInputPath(job,in);

            // Output path; delete it first if it already exists.
            Path out = new Path("/output1");
            FileSystem fs = FileSystem.get(new Configuration());
            if (fs.exists(out)){
                fs.delete(out,true);  // true = delete recursively
            }
            FileOutputFormat.setOutputPath(job,out);

            // Run the job and report the real outcome instead of assuming success.
            boolean succeeded = job.waitForCompletion(true);
            if (!succeeded) {
                System.err.println("wordcount job failed");
                System.exit(1);
            }
            System.out.println("wordcount实现成功");
        }
    }

实现完代码后进行打包，打包完成后用xftp将jar包上传到

/usr/local/soft/hadoop-2.7.6/share/hadoop/mapreduce

开始正式对包进行解析（jar）

主类路径可以在idea中查看，即main函数所在类的路径

对数据进行逗号分隔代码

只需对map阶段进行操作即可

/**
 * Mapper that splits each input line on commas and emits {@code (token, 1)} per token.
 */
public static class map extends Mapper<LongWritable,Text,Text,LongWritable>{

    /**
     * @param key     input key; unused
     * @param value   one line of comma-separated text
     * @param context used to emit one {@code (token, 1)} pair per token
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] tokens = value.toString().split(",");
        for (int i = 0; i < tokens.length; i++) {
            Text word = new Text(tokens[i]);
            context.write(word, new LongWritable(1));
        }
    }
}

统计students中每个clazz的年龄总和

/**
 * Mapper that emits {@code (clazz, age)} for each comma-separated student record, so that a
 * summing reducer can total the ages per class.
 *
 * <p>The age is read from the third field (index 2) and the class from the fifth field
 * (index 4). Records with fewer than five fields or a non-numeric age are skipped and counted
 * instead of throwing, so one bad row (e.g. a header line) does not fail the whole task.
 */
public static class map extends Mapper<LongWritable,Text,Text,LongWritable>{

    /** Counter group used to report skipped records. */
    private static final String COUNTER_GROUP = "students";

    /** Counter name used to report skipped records. */
    private static final String MALFORMED_COUNTER = "malformed_records";

    /**
     * @param key     input key; unused
     * @param value   one comma-separated student record
     * @param context used to emit {@code (clazz, age)} and to count skipped records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split(",");
        if (fields.length < 5) {
            // Not enough columns to contain both age and class.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        long age;
        try {
            // Parse straight to a primitive; avoids the Integer boxing of Integer.valueOf.
            age = Long.parseLong(fields[2]);
        } catch (NumberFormatException e) {
            // Age column is not a number; skip the record but keep it visible in counters.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        context.write(new Text(fields[4]), new LongWritable(age));
    }
}

对students.txt进行男女性别人数的统计

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * MapReduce job that counts the records in {@code /data/students.txt} per value of the fourth
 * comma-separated field and writes the totals to {@code /output4}.
 */
public class Demo4 {

    /** Emits {@code (fourthField, 1)} for every input line. */
    public static class map extends Mapper<LongWritable,Text,Text,LongWritable>{

        /**
         * @param key     input key; unused
         * @param value   one comma-separated record
         * @param context used to emit the pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] columns = value.toString().split(",");
            Text gender = new Text(columns[3]);
            context.write(gender, new LongWritable(1));
        }
    }

    /** Adds up all counts received for a key. */
    public static class reduce extends Reducer<Text,LongWritable,Text,LongWritable>{

        /**
         * @param key     the grouping key
         * @param values  the counts emitted for {@code key}
         * @param context used to emit {@code (key, total)}
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long total = 0L;
            for (LongWritable count : values) {
                total = total + count.get();
            }
            LongWritable result = new LongWritable(total);
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job, removing any existing output directory first.
     *
     * @param args ignored; paths are hard-coded
     */
    public static void main(String[] args) throws Exception{
        Job job = Job.getInstance();
        job.setJobName("男女性别人数的统计");
        job.setJarByClass(Demo4.class);

        // Map side.
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reduce side.
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Input.
        FileInputFormat.addInputPath(job, new Path("/data/students.txt"));

        // Output: clear a previous run's directory before pointing the job at it.
        Path outputDir = new Path("/output4");
        FileSystem fileSystem = FileSystem.get(new Configuration());
        boolean alreadyThere = fileSystem.exists(outputDir);
        if (alreadyThere) {
            fileSystem.delete(outputDir, true);
        }
        FileOutputFormat.setOutputPath(job, outputDir);

        job.waitForCompletion(true);
        System.out.println("第四个了");
    }
}

Students.txt中筛选出男生的所有信息，无reduce阶段，因为无需计算

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Map-only job that copies every record of {@code /data/students.txt} whose fourth
 * comma-separated field equals {@code "男"} to {@code /output5}.
 *
 * <p>No aggregation is needed, so the job is configured with zero reduce tasks. Without that
 * setting Hadoop would still shuffle, sort and run a default reducer.
 */
public class Demo5 {

    /** Passes matching records through unchanged, with no value attached. */
    public static class map extends Mapper<LongWritable,Text,Text,NullWritable> {

        /**
         * @param key     input key; unused
         * @param value   one comma-separated student record
         * @param context used to emit the whole record when it matches
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String s = value.toString().split(",")[3];
            if (s.equals("男")){
                // The whole line is the key; NullWritable means there is no value to write.
                context.write(value,NullWritable.get());
            }
        }
    }

    /**
     * Configures and runs the map-only job, removing any existing output directory first.
     *
     * @param args ignored; paths are hard-coded
     */
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("students中只筛选出男生，无reduce操作");
        job.setJarByClass(Demo5.class);

        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Actually disable the reduce phase: mapper output is written directly as job output.
        job.setNumReduceTasks(0);
        // With no reducer the mapper's types are the job's final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job,in);

        Path out = new Path("/output5");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)){
            fs.delete(out,true);
        }
        FileOutputFormat.setOutputPath(job,out);

        job.waitForCompletion(true);
    }
}

对两张表进行拼接操作：

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

import java.util.ArrayList;

/**
 * Reduce-side join of two kinds of comma-separated files found under {@code /datajava}.
 *
 * <p>Records from files whose path contains {@code "students"} are tagged with {@code *};
 * records from all other files are tagged with {@code #}. Both are keyed by their first field,
 * so the reducer sees a student's record together with its other records and can append the
 * third field of each of those to the student record. Results are written to
 * {@code /output6}; an existing output directory is removed first so the job can be rerun.
 */
public class Demo6 {

    /** Tags each record with its origin and keys it by its first field. */
    public static class map extends Mapper<LongWritable,Text,Text,Text>{

        /**
         * @param key     input key; unused
         * @param value   one comma-separated record
         * @param context supplies the input split and receives {@code (id, tag + record)}
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The split tells us which file this record came from. InputSplit is abstract,
            // so it is cast to FileSplit to reach the file path.
            InputSplit is = context.getInputSplit();
            String path = ((FileSplit) is).getPath().toString(); // full path, not just the file name

            String line = value.toString();
            // '*' marks student records, '#' marks everything else.
            String tag = path.contains("students") ? "*" : "#";
            String id = line.split(",")[0];

            context.write(new Text(id), new Text(tag + line));
        }
    }

    /** Joins the tagged records that share a key. */
    public static class reduce extends Reducer<Text,Text,Text,NullWritable> {

        /**
         * Emits one line per non-student record: the student record followed by a comma and
         * the third field of that record.
         *
         * @param key     the shared first field of the records
         * @param values  all tagged records for {@code key}
         * @param context used to emit the joined lines
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // One student record and several score records are expected per key; the scores
            // are buffered because the student record may arrive in any position.
            String st="";
            ArrayList<String> sc = new ArrayList<String>();

            for (Text value : values) {
                String s = value.toString();
                // Index 0 is the tag; the payload starts at index 1.
                if (s.startsWith("*")){
                    st = s.substring(1);
                }else {
                    sc.add(s.substring(1));
                }
            }

            // NOTE(review): if no student record arrived for this key, st stays empty and
            // the output line starts with a comma — confirm whether such keys should be dropped.
            for (String s : sc) {
                String s1 = s.split(",")[2];
                String end=st+","+s1;
                context.write(new Text(end),NullWritable.get());
            }
        }
    }

    /**
     * Configures and runs the join job.
     *
     * @param args ignored; paths are hard-coded
     */
    public static void main(String[] args) throws Exception{
        Job job = Job.getInstance();
        job.setJobName("两个文件进行拼接");
        job.setJarByClass(Demo6.class);

        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        Path in = new Path("/datajava");
        FileInputFormat.addInputPath(job,in);

        Path out = new Path("/output6");
        // Remove a previous run's output; otherwise job submission fails because the
        // directory already exists. Path.getFileSystem avoids needing extra imports.
        if (out.getFileSystem(job.getConfiguration()).exists(out)) {
            out.getFileSystem(job.getConfiguration()).delete(out, true);
        }
        FileOutputFormat.setOutputPath(job,out);

        job.waitForCompletion(true);
        System.out.println("可以了第六次");
    }
}

使用combine对数据按性别进行计数

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * MapReduce job that counts the records in {@code /data/students.txt} per value of the fourth
 * comma-separated field, with a combiner registered in addition to the reducer. Output goes to
 * {@code /output8}.
 */
public class Demo8 {

    /** Adds up the given counts; shared by the combiner and the reducer. */
    private static long totalOf(Iterable<LongWritable> counts) {
        long total = 0L;
        for (LongWritable count : counts) {
            total = total + count.get();
        }
        return total;
    }

    /** Emits {@code (fourthField, 1)} for every input line. */
    public static class map extends Mapper<LongWritable,Text,Text,LongWritable>{

        /**
         * @param key     input key; unused
         * @param value   one comma-separated record
         * @param context used to emit the pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] columns = value.toString().split(",");
            Text gender = new Text(columns[3]);
            context.write(gender, new LongWritable(1));
        }
    }

    /** Combiner: sums the counts it receives for a key. */
    public static class combine extends Reducer<Text,LongWritable,Text,LongWritable>{

        /**
         * @param key     the grouping key
         * @param values  the counts to add up
         * @param context used to emit {@code (key, partialTotal)}
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            LongWritable partial = new LongWritable(totalOf(values));
            context.write(key, partial);
        }
    }

    /** Reducer: sums the counts it receives for a key. */
    public static class reduce extends Reducer<Text,LongWritable,Text,LongWritable>{

        /**
         * @param key     the grouping key
         * @param values  the counts to add up
         * @param context used to emit {@code (key, total)}
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            LongWritable result = new LongWritable(totalOf(values));
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job, removing any existing output directory first.
     *
     * @param args ignored; paths are hard-coded
     */
    public static void main(String[] args) throws Exception{
        Job job = Job.getInstance();
        job.setJobName("combine对性别进行计数");
        job.setJarByClass(Demo8.class);

        // Map side.
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Combiner registered between map and reduce.
        job.setCombinerClass(combine.class);

        // Reduce side.
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Input.
        FileInputFormat.addInputPath(job, new Path("/data/students.txt"));

        // Output: clear a previous run's directory before pointing the job at it.
        Path outputDir = new Path("/output8");
        FileSystem fileSystem = FileSystem.get(new Configuration());
        boolean alreadyThere = fileSystem.exists(outputDir);
        if (alreadyThere) {
            fileSystem.delete(outputDir, true);
        }
        FileOutputFormat.setOutputPath(job, outputDir);

        job.waitForCompletion(true);
        System.out.println("你又可以了");
    }
}

WordCount基于本地和java的使用的更多相关文章

Terrocotta - 基于JVM的Java应用集群解决方案
前言越来越多的企业关键应用都必须采用集群技术,实现负载均衡(Load Balancing).容错(Fault Tolerance)和灾难恢复(Failover).以达到系统可用性(High Avai ...
[How to]基于本地镜像的yum镜像源搭建
1.简介本文介绍如何在封闭环境(无外网)下安装离线安装本地镜像与基于本地镜像的yum镜像源. 2.环境版本交代: OS:CentOS-6.7-x86_64-minimal yum: yum-3.2. ...
基于JavaMail的Java邮件发送：复杂邮件发送
参考:http://blog.csdn.net/xietansheng/article/details/51722660package com.bfd.ftp.utils;import java.ut ...
基于本地存储的kvm虚拟机在线迁移
基于本地存储的kvm虚拟机在线迁移 kvm虚拟机迁移分为4种(1)热迁移基于共享存储(2)热迁移基于本地存储(3)冷迁移基于共享存储(4)冷迁移基于本地存储这里介绍的是基于本地存储的热迁移动态块迁 ...
基于本地文件系统的LocalDB
零.前言之前写一些小工具的时候,需要用到数据存储方面的技术,但是用数据库又觉得太大了,本地文件存储txt文件存储又不是很规范,于是乎想到了去编写一个简单的基于本地文件系统的数据存储库,暂且叫它loc ...
基于类（Java）和基于原理（JavaScript）的对象系统的比较
Java:面向对象编程语言,吸收了C++语言的各种优点,丢掉了C++让人头疼的多继承.指针等概念.具有功能强大和简单易用的两大特征.Java具有简单性.面向对象.分布式.健壮性.安全性.平台独立与可移 ...
基于Mapxtreme for JAVA的电子地图设计与实现
基于Mapxtreme for JAVA的电子地图设计与实现学生毕业设计,适合测绘类专业研究目标: 开发一个基于MapXtreme for JAVA的校园电子地图项目,使用MapInfo ...
Linux -- 基于zookeeper的java api(二)
Linux -- 基于zookeeper的java api(二) 写一个关于基于集群的zookeeper的自定义实现HA 基于客户端和监控器:使用监控的方法查看每个注册过的节点的状态来做出操作. Wa ...
Linux -- 基于zookeeper的java api(一)
Linux -- 基于zookeeper的java api 首先启动你所有的 zkService.sh 查看状态:检查是否启动正确 [root@hu-hadoop2 ~]# zkServer.sh s ...

随机推荐

CSS Modules 的六种用法
一.局部作用域二.全局作用域三.定制哈希类名四. Class 的组合五.输入其他模块六.输入变量
vivo 评论中台的流量及数据隔离实践
一.背景 vivo评论中台通过提供评论发表.点赞.举报.自定义评论排序等通用能力,帮助前台业务快速搭建评论功能并提供评论运营能力,避免了前台业务的重复建设和数据孤岛问题.目前已有vivo短视频.viv ...
SharePoint Online 为Modern Page添加脚本
前言众所周知,如果我们想向SharePoint 页面添加脚本,最方便的便是经典页面,添加方式主要有内容编辑器Web部件或者直接使用SharePoint Designer. 但是,如果页面是Moder ...
CentOS 7中的系统语言包及UTF-8、en_US.UTF-8和zh_CN.UTF-8的区别
UTF-8.en_US.UTF-8和zh_CN.UTF-8的区别 en_US.UTF-8.zh_CN.UTF-8叫做字符集,就是说'A'.'B'.'中'.'国'等对应的整数值,en_US.UTF-8只 ...
ARC下的内存管理
1.ARC下单对象内存管理局部变量释放对象随之被释放 int main(int argc, const char * argv[]) { @autoreleasepool { Person *p = ...
MySQL语法命令之约束篇
文章目录 1.约束概述 1.1约束的分类 1.2添加约束 2.查看表中的约束 3. `not null` 非空约束 3.1 在 `create table` 时创建 3.2 在`alter table ...
手把手教你在命令行（静默）部署oracle 11gR2
文章目录环境介绍 linux发行版 cpu.内存以及磁盘空间敲黑板关闭防火墙以及selinux 操作系统配置使用阿里的yum源提速安装依赖软件设置用户最大进程数以及最大文件打开数内核参数 ...
MySQL架构原理之存储引擎InnoDB数据文件
MySQL架构原理之体系架构 - 池塘里洗澡的鸭子 - 博客园 (cnblogs.com)中简单介绍了MySQL的系统文件层,其中包含了数据文件.那么InnoDB的数据文件是如何分类并存储的呢? 一. ...
日行一算（Consecutive Integer-连续整数）
题目题目描述 2005年的百度之星初赛有这么一道题,一个正整数有可能可以被表示为 m(m>1) 个连续正整数之和,如: 15=1+2+3+4+5 15=4+5+6 15=7+8 但现在你的任务 ...
IDEA使用JDBC链接MySql（java编程）
1.在Maven的pom.xml文件中引入MySql的驱动 <dependency> <groupId>mysql</groupId> <artifactId ...

WordCount基于本地和java的使用

JAVA实现WordCount

对数据进行逗号分隔代码

对students中clazz中年龄的总和

对students.txt中进行男女性别人数的统计

Students.txt中筛选出男生的所有信息，无reduce阶段，因为无需计算

对两张表进行拼接操作：

combine对数据进行性别进行计数

WordCount基于本地和java的使用的更多相关文章

随机推荐

热门专题