1、Driver

package com.kangaroo.hadoop.drive;

import java.util.Map;
import java.util.Properties;

import com.kangaroo.hadoop.mapper.AggregateMapper;
import com.kangaroo.hadoop.reducer.AggregateReducer;
import com.kangaroo.hadoop.utils.PropertiesUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DriveMain extends Configured implements Tool {

    private static final Logger logger = LoggerFactory.getLogger(DriveMain.class);
    private Configuration conf;
    private PropertiesUtil propUtil;

    public DriveMain() {
        this.conf = new Configuration();
        this.propUtil = new PropertiesUtil("configure.properties");
    }

    @Override
    public int run(String[] args) throws Exception {
        try {
            logger.info("MapReduce Job Beginning.");
            String dbName = args[0];
            String tableName = args[1];
            String partition = args[2];
            String sumField = args[3];
            String outPath = args[4];
            String partFilter = partitionFormat(partition);
            logger.info("[Params] dbName:{}, tableName:{}, partition:{}, sumField:{}, outPath:{}, partFilter:{}",
                    dbName, tableName, partition, sumField, outPath, partFilter);
            this.conf.set("sumField", sumField);
            this.setMapRedConfiguration();
            Job job = this.setJobConfiguration(this.conf);
            // Read the Hive table (restricted by the partition filter) through HCatalog.
            HCatInputFormat.setInput(job, dbName, tableName, partFilter);
            logger.info("setInput successfully.");
            FileOutputFormat.setOutputPath(job, new Path(outPath));
            logger.info("setOutput successfully.");
            return (job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception ex) {
            logger.error(ex.getMessage());
            throw ex;
        }
    }

    private Job setJobConfiguration(Configuration conf) throws Exception {
        try {
            logger.info("enter setJobConfiguration");
            Job job = Job.getInstance(conf);
            job.setJarByClass(DriveMain.class);
            job.setInputFormatClass(HCatInputFormat.class);
            job.setMapperClass(AggregateMapper.class);
            job.setReducerClass(AggregateReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setNumReduceTasks(1);
            logger.info("setJobConfiguration successfully.");
            return job;
        } catch (Exception ex) {
            logger.error("setJobConfiguration: " + ex.getMessage());
            throw new Exception(ex);
        }
    }

    private void setMapRedConfiguration() {
        try {
            Properties properties = propUtil.getProperties();
            logger.info("Load MapReduce Configuration Successfully.");
            // Forward every mapred.* (and mapreduce.*) property from
            // configure.properties into the job configuration.
            for (Map.Entry<Object, Object> entry : properties.entrySet()) {
                if (entry.getKey().toString().startsWith("mapred")) {
                    conf.set(entry.getKey().toString(), entry.getValue().toString());
                    logger.info("[MR][Config] key:{}, value:{}", entry.getKey(), entry.getValue());
                }
            }
            logger.info("[MR][Config] Set MapReduce Configuration Successfully.");
        } catch (Exception e) {
            // Don't swallow the failure silently; at least log it.
            logger.error("setMapRedConfiguration: " + e.getMessage());
        }
    }

    /**
     * Turn a path-style partition spec such as "2017/06/01/12" into an
     * HCatalog partition filter like
     * "year='2017' and month='06' and day='01' and hour='12'".
     * Specs that already use pt/dt fields are passed through unchanged.
     */
    private String partitionFormat(String partition) {
        String format = "";
        if (!partition.contains("pt") && !partition.contains("dt")) {
            String[] items = partition.split("/");
            String[] keys = {"year", "month", "day", "hour"};
            for (int i = 0; i < items.length; i++) {
                if (i == items.length - 1) {
                    format += keys[i] + "='" + items[i] + "'";
                } else {
                    format += keys[i] + "='" + items[i] + "' and ";
                }
            }
        } else {
            format = partition;
        }
        return format;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new DriveMain(), args);
        System.exit(exitCode);
    }
}
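
The driver takes five positional arguments: Hive database, table, partition, the field to aggregate on, and an HDFS output path. A minimal launch sketch (XXX.jar is the placeholder jar name from the configuration in section 5; the database, table, field, and output path below are made up for illustration):

hadoop jar XXX.jar com.kangaroo.hadoop.drive.DriveMain my_db my_table 2017/06/01/12 user_id /tmp/hcat_agg_out

With the sample partition argument 2017/06/01/12, partitionFormat builds the filter year='2017' and month='06' and day='01' and hour='12', which HCatInputFormat.setInput uses to select the matching Hive partition.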

2、Mapper

package com.kangaroo.hadoop.mapper;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@SuppressWarnings("rawtypes")
public class AggregateMapper extends Mapper<WritableComparable, HCatRecord, Text, Text> {

    private static final Logger logger = LoggerFactory.getLogger(AggregateMapper.class);

    private HCatSchema schema;
    private Text outKey;
    private Text outValue;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        outKey = new Text();
        outValue = new Text();
        // The Hive table schema is published to the job configuration by HCatInputFormat.
        schema = HCatInputFormat.getTableSchema(context.getConfiguration());
    }

    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context)
            throws IOException, InterruptedException {
        String sumField = context.getConfiguration().get("sumField");
        Map<String, String> recordMap = new HashMap<String, String>();
        // Pull every field of the HCatRecord out by name via the schema.
        for (String fieldName : schema.getFieldNames()) {
            String fieldValue = value.get(fieldName, schema).toString();
            // Keep field tracing at debug level; info-level logging per field
            // per record would flood the task logs.
            logger.debug("fieldName={}, fieldValue={}", fieldName, fieldValue);
            recordMap.put(fieldName, fieldValue);
        }
        // Emit one (sumField value, "1") pair per record so the reducer can
        // count occurrences; deferring the write to cleanup() would emit only
        // the final record of each map task.
        outKey.set(recordMap.get(sumField));
        outValue.set("1");
        context.write(outKey, outValue);
    }
}

3、Reducer

package com.kangaroo.hadoop.reducer;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AggregateReducer extends Reducer<Text, Text, Text, Text> {

    protected static final Logger logger = LoggerFactory.getLogger(AggregateReducer.class);

    private Text outValue;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Must be instantiated before reduce() uses it, otherwise
        // outValue.set(...) throws a NullPointerException.
        outValue = new Text();
    }

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Sum the "1"s emitted by the mappers for this key.
        int sum = 0;
        for (Text value : values) {
            sum += Integer.parseInt(value.toString());
        }
        outValue.set(String.valueOf(sum));
        // Write one (key, count) pair per distinct key; a single write in
        // cleanup() would keep only the last key processed.
        context.write(key, outValue);
    }
}
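
End to end, the mapper emits (value of sumField, "1") for every row in the selected partition and the reducer adds the ones up, so the job computes the same result as the Hive query select sumField, count(*) from dbName.tableName where <partition filter> group by sumField, written to outPath as tab-separated key/count lines.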

4、PropertiesUtil

package com.kangaroo.hadoop.utils;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class PropertiesUtil {

    private String filePath;

    public PropertiesUtil() {
        this.filePath = "configure.properties";
    }

    public PropertiesUtil(String filePath) {
        this.filePath = filePath;
    }

    /** Load the properties file from the classpath. */
    public Properties getProperties() throws IOException {
        Properties prop;
        InputStream inStream = null;
        try {
            inStream = PropertiesUtil.class.getClassLoader()
                    .getResourceAsStream(this.filePath);
            prop = new Properties();
            prop.load(inStream);
            return prop;
        } finally {
            if (inStream != null) {
                inStream.close();
            }
        }
    }
}
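
A minimal usage sketch (configure.properties is the classpath resource the driver already uses; the key read below is taken from section 5):

PropertiesUtil propUtil = new PropertiesUtil("configure.properties");
Properties props = propUtil.getProperties();
String queue = props.getProperty("mapred.job.queue.name");  // e.g. root.XXX

Note that getProperties() will throw a NullPointerException from prop.load(inStream) if the file is not on the classpath, because getResourceAsStream returns null in that case.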

5、Configuration (configure.properties)

mapred.job.queue.name=root.XXX
mapred.jar=./XXX.jar
mapred.map.tasks=300
mapred.reduce.tasks=100
#mapred.map.capacity=1
#mapred.reduce.capacity=1
mapred.job.priority=HIGH
mapred.job.name=XXX
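
These mapred.* keys are the legacy Hadoop 1.x property names; Hadoop 2.x still accepts them but logs deprecation warnings. If you prefer the current names, the equivalents below (a suggestion based on the standard Hadoop 2 deprecation mapping) still pass the driver's startsWith("mapred") check, since "mapreduce" also begins with "mapred":

mapreduce.job.queuename=root.XXX
mapreduce.job.jar=./XXX.jar
mapreduce.job.maps=300
mapreduce.job.reduces=100
mapreduce.job.priority=HIGH
mapreduce.job.name=XXX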
