Code structure

BeanWritable: the bean used for reading from and writing to the database

ControlJobTest: job orchestration with JobControl

DBInputFormatApp: imports data from the relational database into HDFS; its Map logic is a static inner class

DBOutputFormatApp: exports structured data from HDFS into the relational database

The relational database used here is MySQL.

The code is as follows.

BeanWritable.java

/**
 *
 */
package com.zhen.controlJobTest;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

/**
 * JavaBean.
 * It must implement Hadoop's serialization interface Writable as well as DBWritable,
 * the serialization interface used when exchanging data with the database.
 * The official API describes it as:
 * public class DBInputFormat<T extends DBWritable>
 *     extends InputFormat<LongWritable, T> implements Configurable
 * i.e. the Mapper key is always LongWritable and cannot be changed; the value is a
 * custom JavaBean that implements the DBWritable interface.
 *
 * @author FengZhen
 */
public class BeanWritable implements Writable, DBWritable {

    private int id;
    private String name;
    private double height;

    public void readFields(ResultSet resultSet) throws SQLException {
        this.id = resultSet.getInt(1);
        this.name = resultSet.getString(2);
        this.height = resultSet.getDouble(3);
    }

    public void write(PreparedStatement preparedStatement) throws SQLException {
        preparedStatement.setInt(1, id);
        preparedStatement.setString(2, name);
        preparedStatement.setDouble(3, height);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.id = dataInput.readInt();
        this.name = dataInput.readUTF();
        this.height = dataInput.readDouble();
    }

    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(id);
        dataOutput.writeUTF(name);
        dataOutput.writeDouble(height);
    }

    public void set(int id, String name, double height) {
        this.id = id;
        this.name = name;
        this.height = height;
    }

    @Override
    public String toString() {
        return id + "\t" + name + "\t" + height;
    }
}
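
The toString() format above matters: the tab-separated string is exactly what the first job writes to HDFS and what the second job later splits on "\t". A tiny hypothetical check (not part of the original code) makes the format explicit:

package com.zhen.controlJobTest;

public class BeanWritableDemo {
    public static void main(String[] args) {
        BeanWritable bean = new BeanWritable();
        bean.set(1, "Enzo", 180.66);
        // Prints the tab-separated record that later lands in HDFS: 1	Enzo	180.66
        System.out.println(bean);
    }
}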

DBInputFormatApp.java

package com.zhen.controlJobTest;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * @author FengZhen
 * Imports MySQL data into HDFS.
 */
public class DBInputFormatApp {

    /**
     * Map.
     * When the map output key is LongWritable and the value is Text, the reducer can be
     * omitted; the default reducer also emits LongWritable:Text.
     */
    public static class DBInputMapper extends Mapper<LongWritable, BeanWritable, LongWritable, Text> {

        private LongWritable outputKey;
        private Text outputValue;

        @Override
        protected void setup(Mapper<LongWritable, BeanWritable, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            this.outputKey = new LongWritable();
            this.outputValue = new Text();
        }

        @Override
        protected void map(LongWritable key, BeanWritable value,
                Mapper<LongWritable, BeanWritable, LongWritable, Text>.Context context)
                throws IOException, InterruptedException {
            outputKey.set(key.get());
            outputValue.set(value.toString());
            context.write(outputKey, outputValue);
        }
    }
}
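
With the default TextOutputFormat, job1 writes each record as key<TAB>value, where the key is the record index supplied by DBInputFormat and the value is BeanWritable.toString(). Assuming the two test rows used later in this post, the HDFS output therefore looks like this, which is exactly the layout DBOutputFormatApp parses:

0	1	Enzo	180.66
1	2	Din	170.666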

DBOutputFormatApp.java

package com.zhen.controlJobTest;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * @author FengZhen
 * Exports HDFS data into MySQL.
 * Uses DBOutputFormat to write structured data from an HDFS path into MySQL.
 * The structured data looks like this: the first column is the key, the remaining
 * three columns are the record data.
 * 0    1    Enzo    180.66
 * 1    2    Din     170.666
 */
public class DBOutputFormatApp {

    public static class DBOutputMapper extends Mapper<LongWritable, Text, NullWritable, BeanWritable> {

        private NullWritable outputKey;
        private BeanWritable outputValue;

        @Override
        protected void setup(Mapper<LongWritable, Text, NullWritable, BeanWritable>.Context context)
                throws IOException, InterruptedException {
            this.outputKey = NullWritable.get();
            this.outputValue = new BeanWritable();
        }

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, NullWritable, BeanWritable>.Context context)
                throws IOException, InterruptedException {
            // Counter for rows successfully handed to the database
            final Counter successCounter = context.getCounter("exec", "successfully");
            // Counter for rows that could not be written
            final Counter faildCounter = context.getCounter("exec", "faild");
            // Parse the structured data
            String[] fields = value.toString().split("\t");
            // The data exported by DBInputFormatApp carries a long key as the first column,
            // so skip it and read the record fields starting at index 1
            if (fields.length > 3) {
                int id = Integer.parseInt(fields[1]);
                String name = fields[2];
                double height = Double.parseDouble(fields[3]);
                this.outputValue.set(id, name, height);
                context.write(outputKey, outputValue);
                // Increment by 1 to count a successful record
                successCounter.increment(1L);
            } else {
                // Increment by 1 to count a failed record
                faildCounter.increment(1L);
            }
        }
    }

    /**
     * The output key must be a DBWritable type; DBOutputFormat requires it.
     */
    public static class DBOutputReducer extends Reducer<NullWritable, BeanWritable, BeanWritable, NullWritable> {

        @Override
        protected void reduce(NullWritable key, Iterable<BeanWritable> values,
                Reducer<NullWritable, BeanWritable, BeanWritable, NullWritable>.Context context)
                throws IOException, InterruptedException {
            for (BeanWritable beanWritable : values) {
                context.write(beanWritable, key);
            }
        }
    }
}

ControlJobTest.java

/**
 *
 */
package com.zhen.controlJobTest;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.zhen.controlJobTest.DBInputFormatApp.DBInputMapper;
import com.zhen.controlJobTest.DBOutputFormatApp.DBOutputMapper;
import com.zhen.controlJobTest.DBOutputFormatApp.DBOutputReducer;

/**
 * @author FengZhen
 */
public class ControlJobTest {

    public static void main(String[] args) throws IOException {
        // First job: import MySQL data into HDFS
        Configuration configuration1 = new Configuration();
        // Configure the JDBC settings used by this job
        DBConfiguration.configureDB(configuration1, "com.mysql.jdbc.Driver",
                "jdbc:mysql://localhost:3306/hadoop", "root", "123qwe");
        Job job1 = Job.getInstance(configuration1, DBInputFormatApp.class.getSimpleName());

        job1.setJarByClass(DBInputFormatApp.class);
        job1.setMapperClass(DBInputMapper.class);
        job1.setMapOutputKeyClass(LongWritable.class);
        job1.setMapOutputValueClass(Text.class);
        job1.setOutputKeyClass(LongWritable.class);
        job1.setOutputValueClass(Text.class);

        // Configure the job's input format
        job1.setInputFormatClass(DBInputFormat.class);
        // Configure the SQL queries for this job and the bean that receives each row
        DBInputFormat.setInput(
                job1,
                BeanWritable.class,
                "select * from people",
                "select count(1) from people");
        FileOutputFormat.setOutputPath(job1, new Path(args[0]));

        // Second job: export HDFS data into MySQL
        Configuration configuration2 = new Configuration();
        // Configure the database connection right after creating the Configuration
        DBConfiguration.configureDB(configuration2, "com.mysql.jdbc.Driver",
                "jdbc:mysql://localhost:3306/hadoop", "root", "123qwe");
        Job job2 = Job.getInstance(configuration2, DBOutputFormatApp.class.getSimpleName());
        job2.setJarByClass(DBOutputFormatApp.class);
        job2.setMapperClass(DBOutputMapper.class);
        job2.setMapOutputKeyClass(NullWritable.class);
        job2.setMapOutputValueClass(BeanWritable.class);

        job2.setReducerClass(DBOutputReducer.class);
        job2.setOutputFormatClass(DBOutputFormat.class);
        job2.setOutputKeyClass(BeanWritable.class);
        job2.setOutputValueClass(NullWritable.class);

        job2.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job2, args[0]);
        // Configure the target table and columns for this job
        DBOutputFormat.setOutput(job2, "people", new String[]{"id", "name", "height"});

        ControlledJob controlledJob1 = new ControlledJob(configuration1);
        controlledJob1.setJob(job1);

        ControlledJob controlledJob2 = new ControlledJob(configuration2);
        controlledJob2.setJob(job2);
        // If the two jobs have a dependency, this must be set
        controlledJob2.addDependingJob(controlledJob1);

        JobControl jobControl = new JobControl("groupName");
        jobControl.addJob(controlledJob1);
        jobControl.addJob(controlledJob2);

        // JobControl.run() loops until stop() is called, so run it in a separate
        // thread and poll allFinished() from the main thread
        Thread jobControlThread = new Thread(jobControl);
        jobControlThread.start();
        while (true) {
            if (jobControl.allFinished()) {
                jobControl.stop();
                System.exit(0);
            }
        }
    }
}

The MySQL table structure is as follows:

CREATE TABLE `people` (
  `id` int(11) NOT NULL,
  `name` varchar(255) DEFAULT NULL,
  `height` double DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8

First, insert some test data.
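
For example, the two sample rows used throughout this post can be inserted with a small stand-alone JDBC program (a hypothetical helper, not part of the original project; it reuses the connection settings from ControlJobTest):

package com.zhen.controlJobTest;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class InsertTestData {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        // Same connection settings as DBConfiguration.configureDB in ControlJobTest
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/hadoop", "root", "123qwe");
             PreparedStatement ps = conn.prepareStatement(
                 "INSERT INTO people (id, name, height) VALUES (?, ?, ?)")) {
            ps.setInt(1, 1);
            ps.setString(2, "Enzo");
            ps.setDouble(3, 180.66);
            ps.executeUpdate();
            ps.setInt(1, 2);
            ps.setString(2, "Din");
            ps.setDouble(3, 170.666);
            ps.executeUpdate();
        }
    }
}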

Then package the code into a jar, upload it to the server, and run the job:

hadoop jar /Users/FengZhen/Desktop/Hadoop/other/mapreduce_jar/JobControlTest.jar com.zhen.controlJobTest.ControlJobTest /user/hadoop/mapreduce/mysqlToHdfs/people

This job contains two sub-jobs: one imports MySQL data into HDFS, and the other exports the HDFS data back into MySQL. You could also test JobControl with a couple of simpler MapReduce jobs.

If the two sub-jobs depend on each other, you must call

controlledJob2.addDependingJob(controlledJob1);

which declares that job2 depends on job1: job2 will only be submitted after job1 has completed.
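
If you also want to know whether a sub-job failed, JobControl keeps lists of finished jobs. A slightly more defensive version of the final loop in ControlJobTest might look like this (a sketch; it assumes the jobControl instance built above):

// Run JobControl in a background thread and report the outcome at the end
Thread thread = new Thread(jobControl);
thread.start();
while (!jobControl.allFinished()) {
    try {
        Thread.sleep(1000); // poll instead of busy-waiting
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        break;
    }
}
jobControl.stop();
System.out.println("Successful jobs: " + jobControl.getSuccessfulJobList());
System.out.println("Failed jobs: " + jobControl.getFailedJobList());
System.exit(jobControl.getFailedJobList().isEmpty() ? 0 : 1);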
