package iie.hadoop.hcatalog.spark;

import iie.udps.common.hcatalog.SerHCatInputFormat;
import iie.udps.common.hcatalog.SerHCatOutputFormat; import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID; import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.spark.Accumulator;
import org.apache.spark.SerializableWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde.serdeConstants;
//import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.thrift.TException; import scala.Tuple2; /**
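* Uses Spark + HCatalog to copy a Hive table, converting one column of the
* source table to upper case and storing the result in a new table.
* Example source table: create table test(name String, age int);
* Run with: spark-submit --master yarn-cluster --class
* iie.hadoop.hcatalog.spark.LowerUpperCaseConvert /home/xdf/test.jar -c
* /user/xdf/stdin.xml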
*
* @author xiaodongfang
*
*/
public class LowerUpperCaseConvert {

    /** Counts records read by the most recent {@link #lowerUpperCaseConvert} pipeline. */
    private static Accumulator<Integer> inputDataCount;

    /** Counts records emitted by the most recent {@link #lowerUpperCaseConvert} pipeline. */
    private static Accumulator<Integer> outputDataCount;

    /**
     * Job entry point: reads the operator parameters from the stdin.xml file on
     * the default Hadoop file system, creates a temporary output table with the
     * input table's schema, writes the upper-cased copy of the input table into
     * it and finally generates stdout.xml (or stderr.xml when the operated
     * field is not declared as a string).
     *
     * @param args {@code args[1]} is the path of stdin.xml; {@code args[0]} is
     *             the {@code -c} flag and is not inspected
     * @throws Exception if the parameters cannot be read or the job fails
     */
    @SuppressWarnings("rawtypes")
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: <-c> <stdin.xml>");
            System.exit(1);
        }
        String stdinXml = args[1];

        // Read stdin.xml and extract the parameter values.
        String xmlParams = readHdfsFile(stdinXml);
        OperatorParamXml operXML = new OperatorParamXml();
        List<Map> list = operXML.parseStdinXml(xmlParams);
        if (list == null || list.isEmpty()) {
            throw new IllegalArgumentException("No parameters found in "
                    + stdinXml);
        }
        Map params = list.get(0);
        String userName = requireParam(params, "userName");
        String dbName = requireParam(params, "dbName");
        String inputTabName = requireParam(params, "inputTabName");
        String operatorName = requireParam(params, "operatorName");
        String jobinstanceid = requireParam(params, "jobinstanceid");
        int fieldCount = Integer.parseInt(requireParam(params, "fieldCount"));
        if (fieldCount < 1) {
            // fieldNames[0] / fieldTypes[0] are required below.
            throw new IllegalArgumentException(
                    "fieldCount must be at least 1 but was " + fieldCount);
        }

        // Field names and types are stored under 1-based keys
        // (fieldName1, fieldType1, ...).
        String[] fieldNames = new String[fieldCount];
        String[] fieldTypes = new String[fieldCount];
        for (int j = 0; j < fieldCount; j++) {
            fieldNames[j] = requireParam(params, "fieldName" + (j + 1));
            fieldTypes[j] = requireParam(params, "fieldType" + (j + 1));
            System.out.println("====fieldName=====" + fieldNames[j]);
            System.out.println("====fieldType=====" + fieldTypes[j]);
        }
        System.out.println("====fieldCount=====" + fieldCount);

        // Name of the output table.
        String outputTable = "tmp_"
                + UUID.randomUUID().toString().replace('-', '_');

        // Resolve the schema and the operated column before anything is
        // created, so a bad parameter does not leave an orphan table behind.
        HCatSchema schema = getHCatSchema(dbName, inputTabName);
        if (schema == null) {
            throw new IllegalStateException("Cannot read the schema of "
                    + dbName + "." + inputTabName);
        }
        String operFieldName = fieldNames[0];
        System.out.println("====operFieldName======" + operFieldName);
        Integer position = schema.getPosition(operFieldName);
        if (position == null) {
            throw new IllegalArgumentException("Field '" + operFieldName
                    + "' does not exist in " + dbName + "." + inputTabName);
        }

        // Create the Hive output table.
        createTable(dbName, outputTable, schema);

        // Upper-case the operated column and write the result to the output table.
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("LowerUpperCaseConvert"));
        try {
            // Placeholders; lowerUpperCaseConvert installs the real counters.
            inputDataCount = jsc.accumulator(0);
            outputDataCount = jsc.accumulator(0);
            JavaRDD<SerializableWritable<HCatRecord>> rdd1 = LowerUpperCaseConvert
                    .lowerUpperCaseConvert(jsc, dbName, inputTabName,
                            position.intValue());
            LowerUpperCaseConvert.storeToTable(rdd1, dbName, outputTable);
        } finally {
            jsc.stop();
        }

        // Parameters of the output xml file.
        List<Map> listOut = new ArrayList<Map>();
        Map<String, String> mapOut = new HashMap<String, String>();
        mapOut.put("jobinstanceid", jobinstanceid);
        mapOut.put("dbName", dbName);
        mapOut.put("outputTable", outputTable);
        mapOut.put("inputDataCount", inputDataCount.value().toString());
        mapOut.put("outputDataCount", outputDataCount.value().toString());

        String outDir = "/user/" + userName + "/optasks/" + jobinstanceid
                + "/" + operatorName + "/out";
        // Type of the operated field.
        // NOTE(review): this check runs after the conversion has already been
        // executed; the order is kept as-is because the report includes the
        // output table name and the record counts.
        String operFieldType = fieldTypes[0];
        if (operFieldType.equalsIgnoreCase("String")) {
            // Regular output xml file.
            listOut.add(mapOut);
            String hdfsOutXml = outDir + "/stdout.xml";
            operXML.genStdoutXml(hdfsOutXml, listOut);
        } else {
            // Error output xml file. The "errotCode" key spelling is kept
            // unchanged because it is part of the emitted document.
            String errorMessage = "fieldType is not string!!!";
            String errotCode = "80001";
            mapOut.put("errorMessage", errorMessage);
            mapOut.put("errotCode", errotCode);
            listOut.add(mapOut);
            String hdfsErrorXml = outDir + "/stderr.xml";
            operXML.genStderrXml(hdfsErrorXml, listOut);
        }
        System.exit(0);
    }

    /**
     * Reads a text file from the default Hadoop file system as UTF-8 and
     * returns its lines joined with {@code '\n'}; an empty file yields an
     * empty string. The stream is closed even when reading fails.
     */
    private static String readHdfsFile(String hdfsPath) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(new Path(hdfsPath));
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    in, "utf-8"));
            StringBuilder content = new StringBuilder();
            boolean first = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!first) {
                    content.append('\n');
                }
                content.append(line);
                first = false;
            }
            return content.toString();
        } finally {
            in.close();
        }
    }

    /**
     * Returns the string form of a mandatory stdin.xml parameter.
     *
     * @throws IllegalArgumentException if the parameter is absent
     */
    @SuppressWarnings("rawtypes")
    private static String requireParam(Map params, String key) {
        Object value = params.get(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing parameter '" + key
                    + "' in stdin.xml");
        }
        return value.toString();
    }

    /**
     * Builds an RDD over the records of {@code dbName.inputTabName} in which
     * the column at {@code position} is replaced by the upper-cased string
     * form of its value. A {@code null} column value is passed through
     * unchanged. As a side effect the class-level input/output accumulators
     * are replaced by fresh ones that count the records processed by the
     * returned RDD.
     *
     * @param jsc          Spark context used to read the table
     * @param dbName       database of the input table
     * @param inputTabName name of the input table
     * @param position     zero-based index of the column to upper-case
     * @return the (lazily evaluated) converted records
     * @throws IOException if the HCatalog input cannot be configured
     */
    @SuppressWarnings("rawtypes")
    public static JavaRDD<SerializableWritable<HCatRecord>> lowerUpperCaseConvert(
            JavaSparkContext jsc, String dbName, String inputTabName,
            int position) throws IOException {
        Configuration inputConf = new Configuration();
        SerHCatInputFormat.setInput(inputConf, dbName, inputTabName);

        JavaPairRDD<WritableComparable, SerializableWritable> rdd = jsc
                .newAPIHadoopRDD(inputConf, SerHCatInputFormat.class,
                        WritableComparable.class, SerializableWritable.class);

        final int fieldPosition = position;
        final Accumulator<Integer> output = jsc.accumulator(0);
        final Accumulator<Integer> input = jsc.accumulator(0);
        inputDataCount = input;
        outputDataCount = output;

        return rdd
                .map(new Function<Tuple2<WritableComparable, SerializableWritable>, SerializableWritable<HCatRecord>>() {
                    private static final long serialVersionUID = -2362812254158054659L;

                    @Override
                    public SerializableWritable<HCatRecord> call(
                            Tuple2<WritableComparable, SerializableWritable> v)
                            throws Exception {
                        HCatRecord record = (HCatRecord) v._2.value();
                        input.add(1);

                        // Copy the record so the source object is not modified.
                        List<Object> newRecord = new ArrayList<Object>(
                                record.size());
                        for (int i = 0; i < record.size(); ++i) {
                            newRecord.add(record.get(i));
                        }

                        // A NULL column value stays NULL instead of failing
                        // the whole task with a NullPointerException.
                        Object value = newRecord.get(fieldPosition);
                        if (value != null) {
                            newRecord.set(fieldPosition, value.toString()
                                    .toUpperCase());
                        }
                        output.add(1);
                        return new SerializableWritable<HCatRecord>(
                                new DefaultHCatRecord(newRecord));
                    }
                });
    }

    /**
     * Writes the records of {@code rdd} into the existing table
     * {@code dbName.tblName} through {@link SerHCatOutputFormat}, using the
     * table's own schema.
     *
     * @param rdd     records to store
     * @param dbName  database of the target table
     * @param tblName name of the target table
     * @throws IllegalStateException if the output job cannot be configured
     */
    @SuppressWarnings("rawtypes")
    public static void storeToTable(
            JavaRDD<SerializableWritable<HCatRecord>> rdd, String dbName,
            String tblName) {
        Job outputJob;
        try {
            outputJob = Job.getInstance();
            outputJob.setJobName("lowerUpperCaseConvert");
            outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
            outputJob.setOutputKeyClass(WritableComparable.class);
            outputJob.setOutputValueClass(SerializableWritable.class);
            SerHCatOutputFormat.setOutput(outputJob,
                    OutputJobInfo.create(dbName, tblName, null));
            HCatSchema schema = SerHCatOutputFormat.getTableSchema(outputJob
                    .getConfiguration());
            SerHCatOutputFormat.setSchema(outputJob, schema);
        } catch (IOException e) {
            // Writing with a missing or half-configured job cannot succeed,
            // so fail here with the real cause.
            throw new IllegalStateException("Cannot configure output to "
                    + dbName + "." + tblName, e);
        }

        // Store the RDD in the target table; the key is not used.
        rdd.mapToPair(
                new PairFunction<SerializableWritable<HCatRecord>, WritableComparable, SerializableWritable<HCatRecord>>() {
                    private static final long serialVersionUID = -4658431554556766962L;

                    @Override
                    public Tuple2<WritableComparable, SerializableWritable<HCatRecord>> call(
                            SerializableWritable<HCatRecord> record)
                            throws Exception {
                        return new Tuple2<WritableComparable, SerializableWritable<HCatRecord>>(
                                NullWritable.get(), record);
                    }
                }).saveAsNewAPIHadoopDataset(outputJob.getConfiguration());
    }

    /**
     * Creates the RCFile-backed table {@code dbName.tblName} with the columns
     * of {@code schema}, dropping an existing table of the same name first.
     * Failures of the drop and create calls are printed to stderr and not
     * propagated.
     *
     * @param dbName  database of the table to create
     * @param tblName name of the table to create
     * @param schema  column definitions for the new table
     * @throws IllegalStateException if no metastore client can be obtained
     */
    public static void createTable(String dbName, String tblName,
            HCatSchema schema) {
        HiveMetaStoreClient client;
        try {
            HiveConf hiveConf = HCatUtil.getHiveConf(new Configuration());
            client = HCatUtil.getHiveClient(hiveConf);
        } catch (MetaException e) {
            throw new IllegalStateException(
                    "Cannot connect to the Hive metastore", e);
        } catch (IOException e) {
            throw new IllegalStateException(
                    "Cannot connect to the Hive metastore", e);
        }

        try {
            try {
                if (client.tableExists(dbName, tblName)) {
                    client.dropTable(dbName, tblName);
                }
            } catch (TException e) {
                e.printStackTrace();
            }

            List<FieldSchema> fields = HCatUtil.getFieldSchemaList(schema
                    .getFields());
            System.out.println(fields);

            Table table = new Table();
            table.setDbName(dbName);
            table.setTableName(tblName);

            StorageDescriptor sd = new StorageDescriptor();
            sd.setCols(fields);
            table.setSd(sd);
            sd.setInputFormat(RCFileInputFormat.class.getName());
            sd.setOutputFormat(RCFileOutputFormat.class.getName());
            sd.setParameters(new HashMap<String, String>());
            sd.setSerdeInfo(new SerDeInfo());
            sd.getSerdeInfo().setName(table.getTableName());
            sd.getSerdeInfo().setParameters(new HashMap<String, String>());
            sd.getSerdeInfo().getParameters()
                    .put(serdeConstants.SERIALIZATION_FORMAT, "1");
            sd.getSerdeInfo().setSerializationLib(
                    org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class
                            .getName());
            Map<String, String> tableParams = new HashMap<String, String>();
            table.setParameters(tableParams);

            try {
                client.createTable(table);
                System.out.println("Create table successfully!");
            } catch (TException e) {
                e.printStackTrace();
            }
        } finally {
            // Release the metastore connection on every path.
            client.close();
        }
    }

    /**
     * Looks up the HCatalog schema of {@code dbName.tblName} by configuring a
     * throw-away output job for that table.
     *
     * @param dbName  database of the table
     * @param tblName name of the table
     * @return the table schema, or {@code null} if an {@link IOException}
     *         occurred (the stack trace is printed to stderr)
     */
    public static HCatSchema getHCatSchema(String dbName, String tblName) {
        HCatSchema schema = null;
        try {
            Job outputJob = Job.getInstance();
            outputJob.setJobName("getHCatSchema");
            outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
            outputJob.setOutputKeyClass(WritableComparable.class);
            outputJob.setOutputValueClass(SerializableWritable.class);
            SerHCatOutputFormat.setOutput(outputJob,
                    OutputJobInfo.create(dbName, tblName, null));
            schema = SerHCatOutputFormat.getTableSchema(outputJob
                    .getConfiguration());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return schema;
    }
}

  

spark+hcatalog操作hive表及其数据的更多相关文章

  1. spark2.3.0 配置spark sql 操作hive

    spark可以通过读取hive的元数据来兼容hive,读取hive的表数据,然后在spark引擎中进行sql统计分析,从而,通过spark sql与hive结合实现数据分析将成为一种最佳实践.配置步骤 ...

  2. 通过 Spark R 操作 Hive

    作为数据工程师,我日常用的主力语言是R,HiveQL,Java与Scala.R是非常适合做数据清洗的脚本语言,并且有非常好用的服务端IDE——RStudio Server:而用户日志主要储存在hive ...

  3. luigi操作hive表

    关于luigi框架下查询hive表的操作 class JoinQuery(HiveQueryTask): date=luigi.DateParameter() def hiveconfs(self): ...

  4. 22.把hive表中数据导入到mysql中

    先通过可视化工具链接mysql,在链接的时候用sqoop 用户登录 在数据库userdb下新建表 保存,输入表名upflow 现在我们需要把hive里面的数据通过sqoop导入到mysql里面 sqo ...

  5. Spark SQL 操作Hive 数据

    Spark 2.0以前版本:val sparkConf = new SparkConf().setAppName("soyo")    val spark = new SparkC ...

  6. 用java代码调用shell脚本执行sqoop将hive表中数据导出到mysql

    1:创建shell脚本 touch sqoop_options.sh chmod 777 sqoop_options.sh 编辑文件  特地将执行map的个数设置为变量  测试 可以java代码传参数 ...

  7. Spark:spark df插入hive表后小文件数量多,如何合并?

    在做spark开发过程中,时不时的就有可能遇到租户的hive库目录下的文件个数超出了最大限制问题. 一般情况下通过hive的参数设置: val conf = new SparkConf().setAp ...

  8. spark sql 查询hive表并写入到PG中

    import java.sql.DriverManager import java.util.Properties import com.zhaopin.tools.{DateUtils, TextU ...

  9. 大数据学习day25------spark08-----1. 读取数据库的形式创建DataFrame 2. Parquet格式的数据源 3. Orc格式的数据源 4.spark_sql整合hive 5.在IDEA中编写spark程序(用来操作hive) 6. SQL风格和DSL风格以及RDD的形式计算连续登陆三天的用户

    1. 读取数据库的形式创建DataFrame DataFrameFromJDBC object DataFrameFromJDBC { def main(args: Array[String]): U ...

随机推荐

  1. [Js]焦点图轮播效果

    一.所用到的知识点 1.DOM操作 2.定时器 3.事件运用 4.Js动画 5.函数递归 6.无限滚动大法 二.结构和样式 <div id="banner" class=&q ...

  2. UI UIView

    课程内容:   一.iOS概述 2007年1月9日Macworld大会上公布iPhone OS系统,2010WWDC大会上改名为iOS   二. UI编程概述 UI的本意是用户界面,是英文User和 ...

  3. 【个人使用.Net类库】(1)INI配置文件操作类

    开发接口程序时,对于接口程序配置的IP地址.端口等都需要是可配置的,而在Win Api原生实现了INI文件的读写操作,因此只需要调用Win Api中的方法即可操作INI配置文件,关键代码就是如何调用W ...

  4. java枚举类

    enum关键字用于定义枚举类,若枚举只有一个成员, 则可以作为一种单例模式的实现方式.   枚举类对象的属性不应允许被改动, 所以应该使用 private final 修饰. 枚举类的使用 priva ...

  5. Jquery easyui datagrid 导出Excel

    From:http://www.cnblogs.com/weiqt/articles/4022399.html datagrid的扩展方法,用于将当前的数据生成excel需要的内容. 1 <sc ...

  6. NABCD分析

    NABCD——今日事 N(Need):开创的成就系统可以在一定程度上督促用户坚持下来. A(Approach):做一个APP软件,是在android平台构建. B(Benefit):可以逐步改变用户的 ...

  7. SharePoint安全 - 攻破SharePoint(黑客工具介绍)

    博客地址 http://blog.csdn.net/foxdave SharePoint的安全性很高,这是我们潜意识里的第一印象,所以具体的安全性体现在哪并没仔细研究过.但是事实上确实没有绝对安全的东 ...

  8. JVM-class文件完全解析-方法表集合

    方法表集合 前面的魔数,次版本号,主板本号,常量池入口,常量池,访问标志,类索引,父类索引,接口索引集合,字段表集合,那么再接下来就是方法表了.   方法表的构造如同字段表一样,依次包括了访问标志(a ...

  9. postgreSQL初步使用总结

    一.安装 postgreSQL安装完成后会默认生成一个名为postgres的用户和一个名为postgres的数据库.可以使用自带的psql.exe工具来登录.其帮助信息如下 连接到本地的postgre ...

  10. UITableView详解(1)

    一,UITableView控件使用必备,红色部分是易错点和难点 首先创建一个项目,要使用UITableView就需要实现协议<UITableViewDataSource>,数据源协议主要完 ...