WordCount can be run directly with the example jar that ships with Hadoop; the following shows how to build and run the same job yourself.

Implementing WordCount in Java

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Demo1 {

    // Map class.
    // The first key/value pair of the generics describes the input format,
    // the second pair describes the output format.
    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The input is read line by line; the LongWritable key is the byte offset of the line.
            String line = value.toString();
            // Emit the line content together with a count of 1, wrapped in the Writable types.
            context.write(new Text(line), new LongWritable(1));
        }
    }

    // Reduce class: receives the data emitted by the map side.
    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        /**
         * The reduce (aggregation) method is called once per key.
         * By default there is a single reduce task.
         * key:    a single word
         * values: all the values the map side emitted for this key
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0; // After the shuffle the values for a key look like (key, 1, 1, 1, ...), so iterate and sum them.
            for (LongWritable value : values) {
                // LongWritable.get() returns the wrapped long value.
                sum += value.get();
            }
            // Wrap the long sum back into a LongWritable before writing it out.
            context.write(key, new LongWritable(sum));
        }
    }

    // Entry point of the MapReduce program.
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Create a job.
        Job job = Job.getInstance();
        // Name the job.
        job.setJobName("first submission with my own jar");
        // The class that contains main().
        job.setJarByClass(Demo1.class);
        // The Mapper class.
        job.setMapperClass(map.class);
        // The key/value types emitted by the map side.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // The Reducer class.
        job.setReducerClass(reduce.class);
        // The key/value types emitted by the reduce side.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Input path (on HDFS).
        Path in = new Path("/wordcount");
        FileInputFormat.addInputPath(job, in);
        // Output path.
        Path out = new Path("/output1");
        // Delete the output path if it already exists.
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true); // true deletes the directory recursively
        }
        FileOutputFormat.setOutputPath(job, out);
        // Submit the job and wait for it to finish.
        job.waitForCompletion(true);
        /**
         * Submitting the job:
         * 1. Package the project with Maven (package) and upload the jar to the server.
         * 2. Run it: hadoop jar hadoop-mapreduce-examples-2.7.6.jar com.shujia.hadoop.Demo01WordCount /word /output
         */
        System.out.println("wordcount finished successfully");
    }
}
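The input and output paths are hard-coded above. A common variation (shown here only as a sketch, not part of the original code) is to read them from the command-line arguments, which matches the `hadoop jar ... <in> <out>` invocation in the comment:

// Hypothetical variation: take the HDFS input and output paths from the command line.
Path in = new Path(args[0]);
Path out = new Path(args[1]);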

After the code is finished, package it and upload the resulting jar with Xftp to

/usr/local/soft/hadoop-2.7.6/share/hadoop/mapreduce

Now run the packaged jar (with the hadoop jar command).

The class name to pass on the command line can be looked up in IDEA; it is the package path of the class that contains the main method.

Splitting each line on commas

Only the map stage needs to change; the reducer and the driver from Demo1 can be reused as they are.

public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String s = value.toString();
        // Split the line on commas and emit each token with a count of 1.
        String[] split = s.split(",");
        for (String s1 : split) {
            context.write(new Text(s1), new LongWritable(1));
        }
    }
}

Summing the ages per clazz in students.txt (column 2 is the age, column 4 is the class)

public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String s = value.toString();
        String[] split = s.split(",");
        // Column 2 is the age, column 4 is the class (clazz).
        String s1 = split[2];
        LongWritable age = new LongWritable(Integer.valueOf(s1));
        String s2 = split[4];
        Text clazz = new Text(s2);
        // Emit (clazz, age); the reduce side sums the ages per class.
        context.write(clazz, age);
    }
}
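Only the mapper is shown above; the reduce side can reuse the same summing logic as Demo1. A minimal sketch, identical to the reduce class there:

// Sums the ages emitted for each clazz key (same pattern as the reduce class in Demo1).
public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}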

Counting male and female students in students.txt

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo4 {

    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Column 3 is the gender; emit (gender, 1).
            String s = value.toString().split(",")[3];
            context.write(new Text(s), new LongWritable(1));
        }
    }

    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("count students by gender");
        job.setJarByClass(Demo4.class);

        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output4");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        job.waitForCompletion(true);
        System.out.println("demo 4 finished");
    }
}
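The result in /output4 is written by the default TextOutputFormat, so it contains one line per gender with the key and the count separated by a tab (the counts below are placeholders, not real data):

男	<count of male students>
女	<count of female students>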

Filtering all male students' records out of students.txt; there is no reduce stage because no aggregation is needed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo5 {

    public static class map extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Column 3 is the gender; keep only the lines for male students.
            String s = value.toString().split(",")[3];
            if (s.equals("男")) {
                context.write(value, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("filter male students from students.txt, no reduce");
        job.setJarByClass(Demo5.class);

        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output5");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        job.waitForCompletion(true);
    }
}
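Because the driver never calls job.setReducerClass, Hadoop still runs the default identity Reducer here. To skip the shuffle and reduce phase entirely and write the map output straight to HDFS, the number of reduce tasks can be set to zero; this one-liner is an optional addition, not part of the original code:

// Optional: make the job truly map-only (no shuffle, no reduce phase).
job.setNumReduceTasks(0);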

Joining two tables (the student file and the score file):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class Demo6 {

    public static class map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Ask the context for the current input split so we can tell which file this line came from.
            InputSplit is = context.getInputSplit();
            // InputSplit is abstract, so cast it to FileSplit to get at the file path.
            FileSplit fileSplit = (FileSplit) is;
            // Note: this is the full path of the split's file, not just the file name.
            String s = fileSplit.getPath().toString();
            if (s.contains("students")) {
                // Tag student records with "*" and key them by student id (column 0).
                String s1 = "*" + value.toString();
                String id = value.toString().split(",")[0];
                context.write(new Text(id), new Text(s1));
            } else {
                // Tag score records with "#" and key them by student id.
                String s1 = "#" + value.toString();
                String id = value.toString().split(",")[0];
                context.write(new Text(id), new Text(s1));
            }
        }
    }

    public static class reduce extends Reducer<Text, Text, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // In each reduce call the key is a student id and values holds every record with that id:
            // one student record plus all of that student's score records (six of them in this data set),
            // so the score records are collected in a list.
            String st = "";
            ArrayList<String> sc = new ArrayList<String>();
            for (Text value : values) {
                String s = value.toString();
                if (s.startsWith("*")) {
                    // Strip the tag (index 0) to recover the student record.
                    st = s.substring(1);
                } else {
                    sc.add(s.substring(1));
                }
            }
            // Join the two tables: append each score to the student record.
            for (String s : sc) {
                String s1 = s.split(",")[2];
                String end = st + "," + s1;
                context.write(new Text(end), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("join the two files");
        job.setJarByClass(Demo6.class);

        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        Path in = new Path("/datajava");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output6");
        FileOutputFormat.setOutputPath(job, out);

        job.waitForCompletion(true);
        System.out.println("demo 6 finished");
    }
}
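One difference from the other drivers: this main() does not delete /output6 before submitting, so re-running the job fails because the output directory already exists. The same guard used in the other demos can be added just before FileOutputFormat.setOutputPath (together with the Configuration and FileSystem imports used there):

// Delete the output directory if it already exists, as in the other demos.
FileSystem fs = FileSystem.get(new Configuration());
if (fs.exists(out)) {
    fs.delete(out, true);
}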

Counting students by gender with a Combiner

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo8 {

    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Column 3 is the gender; emit (gender, 1).
            String sex = value.toString().split(",")[3];
            context.write(new Text(sex), new LongWritable(1));
        }
    }

    // Combiner: pre-aggregates the map output on each map node before the shuffle.
    public static class combine extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("count by gender with a combiner");
        job.setJarByClass(Demo8.class);

        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setCombinerClass(combine.class);

        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output8");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        job.waitForCompletion(true);
        System.out.println("demo 8 finished");
    }
}
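A combiner is only safe when the aggregation is commutative and associative (as summation is here), because Hadoop may run it zero, one, or several times per key. Since the combine and reduce classes contain identical logic in this example, the reduce class could also be registered directly as the combiner; this is a small variation, not what the original code does:

// Reuse the reduce class as the combiner; valid here because summing is commutative and associative.
job.setCombinerClass(reduce.class);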
