1、Driver

package com.kangaroo.hadoop.drive;

import java.util.Map;
import java.util.Properties; import com.kangaroo.hadoop.mapper.AggregateMapper;
import com.kangaroo.hadoop.reducer.AggregateReducer;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import com.kangaroo.hadoop.utils.PropertiesUtil; public class DriveMain extends Configured implements Tool { private static final Logger logger = LoggerFactory.getLogger(DriveMain.class);
private Configuration conf;
private PropertiesUtil propUtil; public DriveMain() {
this.conf = new Configuration();
this.propUtil = new PropertiesUtil("configure.properties");
} public int run(String[] args) throws Exception {
try {
logger.info("MapReduce Job Beginning.");
String dbName = args[0];
String tableName = args[1];
String partition = args[2];
String sumField = args[3];
String outPath = args[4];
String partFilter = partitionFormat(partition);
logger.info("[Params] dbName:{}; tableName:{}, partition:{}, sumField:{}, outPath:{}, partFilter:{}",
dbName, tableName, partition, sumField, outPath, partFilter);
this.conf.set("sumField", sumField);
this.setMapRedConfiguration();
Job job = this.setJobConfiguration(this.conf);
HCatInputFormat.setInput(job, dbName, tableName, partFilter);
logger.info("setInput successfully.");
FileOutputFormat.setOutputPath(job, new Path(outPath));
logger.info("setOutput successfully.");
return (job.waitForCompletion(true) ? 0 : 1);
} catch (Exception ex) {
logger.error(ex.getMessage());
throw ex;
}
} private Job setJobConfiguration(Configuration conf) throws Exception {
try {
logger.info("enter setJobConfiguration");
Job job = Job.getInstance(conf);
job.setJarByClass(DriveMain.class);
job.setInputFormatClass(HCatInputFormat.class);
job.setMapperClass(AggregateMapper.class);
job.setReducerClass(AggregateReducer.class); job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(1);
logger.info("setJobConfiguration successfully.");
return job;
} catch (Exception ex) {
logger.error("setJobConfiguration: " + ex.getMessage());
throw new Exception(ex);
}
} private void setMapRedConfiguration() {
try {
Properties properties = propUtil.getProperties();
logger.info("Load MapReduce Configuration Successfully.");
for (Map.Entry entry : properties.entrySet()) {
if (entry.getKey().toString().startsWith("mapred")) {
conf.set(entry.getKey().toString(), entry.getValue().toString());
logger.info("[MR][Config] key:{}, value:{}", entry.getKey().toString(), entry.getValue().toString());
}
}
logger.info("[MR][Config] Set MapReduce Configuration Successfully.");
} catch (Exception e) { } } private String partitionFormat(String partition) {
String format = "";
if(!partition.contains("pt") && ! partition.contains("dt")) {
String[] items = partition.split("/");
String[] keys = {"year","month","day", "hour"};
for(int i=0; i<items.length; i++) {
if (i == items.length-1) {
format += keys[i] + "='" + items[i] + "'";
} else {
format += keys[i] + "='" + items[i] + "' and ";
}
}
} else {
format = partition;
}
return format;
} public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new DriveMain(), args);
System.exit(exitCode);
} }

2、Mapper

package com.kangaroo.hadoop.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import java.io.IOException;
import java.util.HashMap;
import java.util.Map; @SuppressWarnings("rawtypes")
/**
 * Emits {@code (value of sumField, "1")} for every input record so that the
 * reducer can count how often each distinct value occurs.
 *
 * <p>The column to aggregate on is read from the job configuration key
 * {@code sumField}. Records whose value for that column is NULL are not
 * emitted; they are tallied in the {@code AggregateMapper/NULL_SUM_FIELD} counter.
 */
public class AggregateMapper extends Mapper<WritableComparable, HCatRecord, Text, Text> {

    private static final Logger logger = LoggerFactory.getLogger(AggregateMapper.class);

    private HCatSchema schema;
    private String sumField;
    private Text outKey;
    private Text outValue;

    /**
     * Resolves the table schema and the aggregation column once per task.
     *
     * @throws IOException if {@code sumField} is not configured or is not a column of the table
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        outKey = new Text();
        // The emitted value never changes, so it is set once here.
        outValue = new Text("1");
        schema = HCatInputFormat.getTableSchema(context.getConfiguration());
        sumField = context.getConfiguration().get("sumField");
        if (sumField == null || !schema.getFieldNames().contains(sumField)) {
            // Fail fast with a readable message instead of an NPE on the first record.
            throw new IOException("sumField '" + sumField + "' is not a column of the input table; "
                    + "available columns: " + schema.getFieldNames());
        }
        logger.info("sumField={}", sumField);
    }

    /**
     * Writes one {@code (fieldValue, "1")} pair per record.
     *
     * <p>The write happens here, once per record; writing from {@code cleanup}
     * would emit only the last record each task saw.
     */
    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context)
            throws IOException, InterruptedException {
        Object fieldValue = value.get(sumField, schema);
        if (fieldValue == null) {
            context.getCounter("AggregateMapper", "NULL_SUM_FIELD").increment(1);
            return;
        }
        outKey.set(fieldValue.toString());
        context.write(outKey, outValue);
    }
}

3、Reducer

package com.kangaroo.hadoop.reducer;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import java.io.IOException; @SuppressWarnings("rawtypes")
/**
 * Sums the numeric string values received for each key and writes
 * {@code (key, total)} once per key.
 */
public class AggregateReducer extends Reducer<Text, Text, Text, Text> {
    protected static final Logger logger = LoggerFactory.getLogger(AggregateReducer.class);

    // Reused across reduce() calls; initialised in setup() so they are never null.
    Text outKey;
    Text outValue;

    /** Allocates the reusable output holders. */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        outKey = new Text();
        outValue = new Text();
    }

    /**
     * Adds up all values of one key and emits the total.
     *
     * <p>The result is written here, once per key; writing from
     * {@code cleanup} would emit only the last key the task processed.
     *
     * @throws NumberFormatException if a value is not a decimal integer
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // long rather than int: the total is not capped at Integer.MAX_VALUE.
        long sum = 0L;
        for (Text value : values) {
            sum += Long.parseLong(value.toString());
        }
        outKey.set(key);
        outValue.set(String.valueOf(sum));
        context.write(outKey, outValue);
    }
}

4、PropertiesUtil

package com.kangaroo.hadoop.utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties; import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Loads a {@code .properties} file from the classpath.
 */
public class PropertiesUtil {
    private String filePath;

    /** Uses the default resource name {@code configure.properties}. */
    public PropertiesUtil() {
        this.filePath = "configure.properties";
    }

    /**
     * @param filePath classpath-relative name of the properties resource
     */
    public PropertiesUtil(String filePath) {
        this.filePath = filePath;
    }

    /**
     * Reads the configured resource through this class's class loader.
     *
     * @return the loaded properties
     * @throws IOException if the resource is not on the classpath or cannot be read
     */
    public Properties getProperties() throws IOException {
        InputStream inStream = PropertiesUtil.class.getClassLoader()
                .getResourceAsStream(this.filePath);
        if (inStream == null) {
            // getResourceAsStream signals "not found" with null; report it as the
            // declared IOException instead of an NPE from Properties.load.
            throw new IOException("Properties resource not found on classpath: " + this.filePath);
        }
        try {
            Properties prop = new Properties();
            prop.load(inStream);
            return prop;
        } finally {
            inStream.close();
        }
    }
}

5、配置

mapred.job.queue.name=root.XXX
mapred.jar=./XXX.jar
mapred.map.tasks=300
mapred.reduce.tasks=100
#mapred.map.capacity=1
#mapred.reduce.capacity=1
mapred.job.priority=HIGH
mapred.job.name=XXX

Hadoop通过HCatalog编写Mapreduce任务访问hive库中schema数据的更多相关文章

  1. 如何将hive表中的数据导出

    近期经常将现场的数据带回公司测试,所以写下该文章,梳理一下思路. 1.首先要查询相应的hive表,比如我要将c_cons这张表导出,我先查出hive中是否有这张表. 查出数据,证明该表在hive中存在 ...

  2. 访问cv::Mat中的数据时遇到的指针类型问题

    在用Opencv的时候由于下图原本的图像尺寸是1111*1111,要进行resize,代码如下: cv::Mat img = cv::imread("//Users//apple//td3/ ...

  3. 【Hadoop测试程序】编写MapReduce测试Hadoop环境

    我们使用之前搭建好的Hadoop环境,可参见: <[Hadoop环境搭建]Centos6.8搭建hadoop伪分布模式>http://www.cnblogs.com/ssslinppp/p ...

  4. 在hadoop上进行编写mapreduce程序,统计关键词在text出现次数

    mapreduce的处理过程分为2个阶段,map阶段,和reduce阶段.在要求统计指定文件里的全部单词的出现次数时. map阶段把每一个关键词写到一行上以逗号进行分隔.并初始化数量为1(同样的单词h ...

  5. HBase结合MapReduce批量导入(HDFS中的数据导入到HBase)

    HBase结合MapReduce批量导入 package hbase; import java.text.SimpleDateFormat; import java.util.Date; import ...

  6. 批量查询hive库中所有表的count

    一.准备文件 mkdir /query_hive_table_count touch query_db_name_table touch query_table_result.txt 二.编辑文件 2 ...

  7. 小记---------spark组件与其他组件的比较 spark/mapreduce ;spark sql/hive ; spark streaming/storm

    Spark与Hadoop的对比   Scala是Spark的主要编程语言,但Spark还支持Java.Python.R作为编程语言 Hadoop的编程语言是Java    

  8. hive上传下载数据

    ------------------------------------------read me--方式1:适用于工具传输--方式2:适用于手动临时性传输---------------------- ...

  9. Hive扩展功能(三)--使用UDF函数将Hive中的数据插入MySQL中

    软件环境: linux系统: CentOS6.7 Hadoop版本: 2.6.5 zookeeper版本: 3.4.8 主机配置: 一共m1, m2, m3这五部机, 每部主机的用户名都为centos ...

随机推荐

  1. Javascript学习日志(三):闭包

    说实话,前面一节的原型和原型链在当初学的时候并没有很头疼,对着高级编程第三版撸了几遍就理解透了,闭包这一节真的挺头疼的,很惭愧,看了差不多十来遍吧,还翻看了网上的其他博客和解释文档,五花八门的表达方式 ...

  2. stable_sort()与sort

    stable_sort与sort()都是c++库函数,调用<algorithm>库,但区别是sort是不稳定的排序,而stable_sort是稳定的,有时候stable_sort比sort ...

  3. 201521123086《java程序设计》第7周

    本章学习总结 书面作业 1.ArrayList代码分析 1.1 解释ArrayList的contains源代码 以下是ArrayList的contains源代码: public boolean con ...

  4. for /r命令实现全盘搜索指定文件

    @echo off Rem :全盘搜索指定文件并输出到文本 set "fileName=Normal.dotm" set "outPutPath=C:\result.tx ...

  5. 201521123093 java 第九周学习总结

    1. 本周学习总结 1.1 以你喜欢的方式(思维导图或其他)归纳总结异常相关内容. 2. 书面作业 本次PTA作业题集异常 1.常用异常 题目5-1 1.1 截图你的提交结果(出现学号) 1.2 自己 ...

  6. Java SpringMVC小白的成长(一)

    如果你是一个小白,请跟着我走,我会让你少走弯路,如果你是大牛,那么多谢大牛可以给我提提建议. 说实话,来公司这么久,一直在做的是维护与修改bug.(我的语言是php,来公司才开始接触java). 要毕 ...

  7. oracle客户端plsql设置字符集

    感谢一个新朋友的到来,我帮他的过程中有好些东西都不怎么想的起来了,所以从现在起我需要记录下每一点一滴, 因为我觉得写下来的东西才不会丢,而且欢迎以后的朋友到来. 使用plsql查数据的时候有时候中文会 ...

  8. 基于pytorch实现HighWay Networks之Train Deep Networks

    (一)Highway Networks 与 Deep Networks 的关系 理论实践表明神经网络的深度是至关重要的,深层神经网络在很多方面都已经取得了很好的效果,例如,在1000-class Im ...

  9. 跨Storyboard调用

    在开发中我们会有这种需求从一个故事板跳到另一个故事板 modal UIStoryboard *secondStoryboard = [UIStoryboard storyboardWithName:@ ...

  10. CSS的常用属性

    刚开始学习前段的我,还处于初级阶段,一些东西还是会有搞不明白的时候,还是要大家多多理解.今说就一些关于CSS的常用属性吧! 一.CSS常用选择器 CSS选择器应该说是一个非常重要的工具吧,选择器用得好 ...