Hadoop2.4.1 使用MapReduce简单的数据清洗
package com.bank.service;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* 将非结构化的数据处理为结构化数据
* @author mengyao
*
*/
import com.bank.entity.CNY;
/**
 * MapReduce job that converts tab-separated raw text records into {@link CNY} records.
 *
 * <p>The mapper keeps only rows that split into exactly 42 columns, copies a fixed
 * subset of the columns into a {@link CNY} and emits it under a {@link NullWritable}
 * key. The reducer writes every value it receives back out unchanged.
 *
 * <p>Usage: {@code CnyDataFormat [generic options] <in> <out>}
 */
public class CnyDataFormat extends Configured implements Tool {

    /** Counter group under which the mapper reports skipped or patched records. */
    private static final String COUNTER_GROUP = CnyDataFormat.class.getSimpleName();

    /**
     * Parses one input line per call and emits a {@link CNY} for every well-formed row.
     */
    static class CnyDataFormatMapper extends Mapper<LongWritable, Text, NullWritable, CNY> {

        /** Number of tab-separated columns a row must have to be accepted. */
        private static final int EXPECTED_FIELDS = 42;

        // Column positions read from each row.
        // NOTE(review): the business meaning of each column is not visible here; confirm against the data spec.
        private static final int DATE_IDX = 3;
        private static final int TIME_IDX = 4;
        private static final int FLAG_IDX = 5;
        private static final int CURRENCY_IDX = 9;
        private static final int VERSION_IDX = 10;
        private static final int VALUTA_IDX = 11;
        private static final int GZH_IDX = 12;
        private static final int MACHINE_ID_IDX = 13;

        /**
         * Created once per mapper instance instead of once per record.
         * SimpleDateFormat is not thread-safe, so it is deliberately not static.
         */
        private final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        /** Output value object, reused for every record and overwritten before each write. */
        private final CNY cny = new CNY();

        /**
         * Converts one input line into a {@link CNY} and emits it.
         *
         * <p>Rows with the wrong column count or a non-integer flag column are skipped,
         * reported on stderr and counted. A row whose date/time columns cannot be parsed
         * is still emitted, with the current wall-clock time as its timestamp, and counted.
         *
         * @param key     key supplied by the input format (unused)
         * @param value   one line of input text
         * @param context task context used to emit records and update counters
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // split() without a limit drops trailing empty strings, so a row whose last
            // columns are empty has fewer than 42 fields and is treated as malformed.
            String[] fields = value.toString().split("\t");
            if (fields.length != EXPECTED_FIELDS) {
                System.err.println(" ERROR: data format failed!");
                context.getCounter(COUNTER_GROUP, "MALFORMED_RECORDS").increment(1);
                return;
            }

            int flag;
            try {
                flag = Integer.parseInt(fields[FLAG_IDX]);
            } catch (NumberFormatException e) {
                // One bad row must not fail the whole task; skip it like any other malformed row.
                System.err.println(" ERROR: invalid flag value: " + fields[FLAG_IDX]);
                context.getCounter(COUNTER_GROUP, "INVALID_FLAG_RECORDS").increment(1);
                return;
            }

            long qfTime;
            try {
                qfTime = timestampFormat.parse(fields[DATE_IDX] + " " + fields[TIME_IDX]).getTime();
            } catch (ParseException e) {
                // Existing best-effort behaviour: fall back to "now". Counted so the substitution is visible.
                qfTime = System.currentTimeMillis();
                context.getCounter(COUNTER_GROUP, "UNPARSEABLE_TIMESTAMPS").increment(1);
            }

            cny.set(fields[GZH_IDX], fields[CURRENCY_IDX], fields[VERSION_IDX], fields[VALUTA_IDX],
                    qfTime, flag, fields[MACHINE_ID_IDX]);
            context.write(NullWritable.get(), cny);
        }
    }

    /**
     * Identity reducer: writes every received {@link CNY} back out under a {@link NullWritable} key.
     */
    static class CnyDataFormatReduce extends Reducer<NullWritable, CNY, NullWritable, CNY> {

        /**
         * Forwards each value unchanged.
         *
         * @param key     the (null) grouping key
         * @param value   the records grouped under {@code key}
         * @param context task context used to emit records
         * @throws IOException          if writing an output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(NullWritable key, Iterable<CNY> value, Context context) throws IOException, InterruptedException {
            for (CNY cny : value) {
                context.write(NullWritable.get(), cny);
            }
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args the arguments left after generic Hadoop options are removed:
     *             {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are wrong
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length != 2) {
            System.err.println("Usage: " + CnyDataFormat.class.getName() + " <in> <out>");
            return 2;
        }

        Job job = Job.getInstance(getConf(), CnyDataFormat.class.getSimpleName());
        job.setJarByClass(CnyDataFormat.class); // the class that contains main()

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        job.setMapperClass(CnyDataFormatMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(CNY.class);

        job.setReducerClass(CnyDataFormatReduce.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(CNY.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes, printing progress while it runs.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point. ToolRunner strips the generic Hadoop options and passes
     * the remaining arguments to {@link #run(String[])}, which validates them.
     *
     * @param args raw command-line arguments
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        int status = ToolRunner.run(new Configuration(), new CnyDataFormat(), args);
        System.exit(status);
    }
}
package com.bank.entity;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Value object for one cleaned record, serialisable through Hadoop's {@link Writable} interface.
 *
 * <p>{@link #write(DataOutput)} and {@link #readFields(DataInput)} must handle the fields
 * in the same order: gzh, currency, version, valuta, qfTime, flag, machineID.
 *
 * <p>NOTE(review): the business meaning of the individual fields is not documented in
 * this file; confirm against the data specification.
 *
 * @author mengyao
 */
public class CNY implements Writable {

    private String gzh;
    private String currency;
    private String version;
    private String valuta;
    /** Timestamp in milliseconds since the epoch. */
    private long qfTime;
    private int flag;
    private String machineID;

    /**
     * Restores all fields from {@code in}, in the order {@link #write(DataOutput)} emitted them.
     *
     * @param in source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.gzh = in.readUTF();
        this.currency = in.readUTF();
        this.version = in.readUTF();
        this.valuta = in.readUTF();
        this.qfTime = in.readLong();
        this.flag = in.readInt();
        this.machineID = in.readUTF();
    }

    /**
     * Serialises all fields to {@code out}. String fields that are {@code null} are
     * written as empty strings, because {@code writeUTF(null)} throws a NullPointerException.
     *
     * @param out sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(nullToEmpty(this.gzh));
        out.writeUTF(nullToEmpty(this.currency));
        out.writeUTF(nullToEmpty(this.version));
        out.writeUTF(nullToEmpty(this.valuta));
        out.writeLong(this.qfTime);
        out.writeInt(this.flag);
        out.writeUTF(nullToEmpty(this.machineID));
    }

    /** Returns {@code s}, or the empty string when {@code s} is {@code null}. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    /**
     * Overwrites every field at once, so a single instance can be reused across records.
     */
    public void set(String gzh, String currency, String version,
            String valuta, long qfTime, int flag, String machineID) {
        this.gzh = gzh;
        this.currency = currency;
        this.version = version;
        this.valuta = valuta;
        this.qfTime = qfTime;
        this.flag = flag;
        this.machineID = machineID;
    }

    /**
     * Returns the fields joined by tab characters, in the same order as the serialised form.
     */
    @Override
    public String toString() {
        return this.gzh + "\t" + this.currency + "\t" + this.version + "\t" + this.valuta
                + "\t" + this.qfTime + "\t" + this.flag + "\t" + this.machineID;
    }

    public String getGzh() {
        return gzh;
    }

    public void setGzh(String gzh) {
        this.gzh = gzh;
    }

    public String getCurrency() {
        return currency;
    }

    /**
     * Sets the currency field to the supplied value.
     *
     * @param currency the new currency value
     */
    public void setCurrency(String currency) {
        // Previously the argument was ignored and the literal "cny" was always stored.
        this.currency = currency;
    }

    public String getVersion() {
        return version;
    }

    public void setVersion(String version) {
        this.version = version;
    }

    public String getValuta() {
        return valuta;
    }

    public void setValuta(String valuta) {
        this.valuta = valuta;
    }

    public long getQfTime() {
        return qfTime;
    }

    public void setQfTime(long qfTime) {
        this.qfTime = qfTime;
    }

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

    public String getMachineID() {
        return machineID;
    }

    public void setMachineID(String machineID) {
        this.machineID = machineID;
    }
}
Hadoop2.4.1 使用MapReduce简单的数据清洗的更多相关文章
- MapReduce 简单的全文搜索2
上一个全文搜索实现了模糊查找,这个主要实现了精确查找,就是比如你查找mapreduce is simple那么他就只查找有这个句子的文章,而不是查找有这三个单词的文章. 这个版本需要重写反向索引,因为 ...
- oozie与mapreduce简单案例
准备工作 拷贝原来的模板 mkdir oozie-apps cd oozie-apps/ cp -r ../examples/apps/map-reduce . mv map-reduce mr-w ...
- Hadoop2 使用 YARN 运行 MapReduce 的过程源码分析
Hadoop 使用 YARN 运行 MapReduce 的过程如下图所示: 总共分为11步. 这里以 WordCount 为例, 我们在客户端终端提交作业: # 把本地的 /home/hadoop/t ...
- MapReduce 简单数据统计
1. 准备数据源 摘录了一篇散文,保存格式为utf-8 2. 准备环境 2.1 搭建伪分布式环境 https://www.cnblogs.com/cjq10029/p/12336446.html 上传 ...
- MapReduce简单执行过程及Wordcount案例
MapReducer运行过程 以单词统计为案例. 假如现在文件中存在如下内容: aa bb aa cc dd aa 当然,这是小文件,如果文件大小较大时会将文件进行 "切片" ,此 ...
- 【hadoop2.6.0】MapReduce原理
看了几篇博文,感觉还是云里雾里的. http://blog.csdn.net/opennaive/article/details/7514146 http://www.aboutyun.com/thr ...
- hadoop2.2编程:mapreduce编程之二次排序
mr自带的例子中的源码SecondarySort,我重新写了一下,基本没变. 这个例子中定义的map和reduce如下,关键是它对输入输出类型的定义:(java泛型编程) public static ...
- MapReduce 简单的全文搜索
上一个已经实现了反向索引,那么为什么不尝试下全文搜索呢.例如有了 Hello file3.txt:1; MapReduce file3.txt:2;fil1.txt:1;fil2.tx ...
- hadoop mapreduce 简单例子
本例子统计 用空格分开的单词出现数量( 这个Main.main 启动方式是hadoop 2.0 的写法.1.0 不一样 ) 目录结构: 使用的 maven : 下面是maven 依赖. <de ...
随机推荐
- Linux下python升级
Centos即使用Yum更新也是Python2.6.6所以需要升级到Python2.7.8 1.先下载源码包 1 wget https://www.python.org/ftp/python/2.7. ...
- CentOS 7 安装教程
参考资料: http://www.cnblogs.com/bobbylinux/articles/centos7.html
- Linux系统最小化安装之后的系统基础环境安装以及内核优化脚本
#!/bin/bash #添加epel和rpmforge的外部yum扩展源 cd /usr/local/src wget http://mirrors.ustc.edu.cn/fedora/epel/ ...
- el和jstl
<%@page import="cn.bdqn.bean.News"%> <%@ page language="java" import=&q ...
- hdu 2156
#include <iostream> #include <stdio.h> using namespace std; int main() { int i,n; while( ...
- activiti总结
1.activiti如何修改登录用户名?在哪个数据库里面添加. 2.activiti的启动和部署在http://activiti.org/userguide/index.html#demo.setup ...
- Dragger代码实现
转自:http://www.apkbus.com/blog-705730-60436.html 在工程中引入Dagger 如果想使用Dagger的话,需要添加两个函数库: dependencies { ...
- SQL从入门到基础 - 04 SQLServer基础2(数据删除、数据检索、数据汇总、数据排序、通配符过滤、空值处理、多值匹配)
一.数据删除 1. 删除表中全部数据:Delete from T_Person. 2. Delete 只是删除数据,表还在,和Drop Table(数据和表全部删除)不同. 3. Delete 也可以 ...
- Shell中的${},##和%%的使用
假设我们定义了一个变量为: file=/dir1/dir2/dir3/my.file.txt 可以用${ }分别替换得到不同的值: ${file#*/}:删掉第一个/ 及其左边的字符串:dir1/di ...
- Javascript 常用函数【2】
1.常规函数javascript常规函数包括以下9个函数:(1)alert函数:显示一个警告对话框,包括一个OK按钮.(2)confirm函数:显示一个确认对话框,包括OK.Cancel按钮.(3)e ...