Working with Hive tables and their data using Spark + HCatalog
package iie.hadoop.hcatalog.spark;

import iie.udps.common.hcatalog.SerHCatInputFormat;
import iie.udps.common.hcatalog.SerHCatOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.spark.Accumulator;
import org.apache.spark.SerializableWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde.serdeConstants;
//import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.thrift.TException;
import scala.Tuple2;

/**
 * Spark + HCatalog job that copies a Hive table, converting one column of the
 * source table to upper case before writing it into the new table.
 * Example source table: create table test(name String, age int);
 * Run with: spark-submit --master yarn-cluster --class
 * iie.hadoop.hcatalog.spark.LowerUpperCaseConvert /home/xdf/test.jar -c
 * /user/xdf/stdin.xml
 *
 * @author xiaodongfang
 */
public class LowerUpperCaseConvert {

private static Accumulator<Integer> inputDataCount;
private static Accumulator<Integer> outputDataCount;

@SuppressWarnings("rawtypes")
public static void main(String[] args) throws Exception {
if (args.length < 2) {
System.err.println("Usage: <-c> <stdin.xml>");
System.exit(1);
}

String stdinXml = args[1];
String userName = null;
String jobinstanceid = null;
String operatorName = null;
String dbName = null;
String inputTabName = null;
String operFieldName = null;
int fieldCount = 0;

// Read the stdin.xml parameter file from HDFS
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
FSDataInputStream dis = fs.open(new Path(stdinXml));
InputStreamReader isr = new InputStreamReader(dis, "utf-8");
BufferedReader read = new BufferedReader(isr);
String tempString = "";
String xmlParams = "";
while ((tempString = read.readLine()) != null) {
xmlParams += "\n" + tempString;
}
read.close();
xmlParams = xmlParams.substring(1);

// Extract the parameter values from the xml
OperatorParamXml operXML = new OperatorParamXml();
List<Map> list = operXML.parseStdinXml(xmlParams);
userName = list.get(0).get("userName").toString();
dbName = list.get(0).get("dbName").toString();
inputTabName = list.get(0).get("inputTabName").toString();
operatorName = list.get(0).get("operatorName").toString();
jobinstanceid = list.get(0).get("jobinstanceid").toString();
fieldCount = Integer.parseInt(list.get(0).get("fieldCount").toString());

// Collect the field names and types for the output table
ArrayList<String> fieldName = new ArrayList<String>();
ArrayList<String> fieldType = new ArrayList<String>();
for (int i = 1; i <= fieldCount; i++) {
fieldName.add(list.get(0).get("fieldName" + i).toString());
fieldType.add(list.get(0).get("fieldType" + i).toString());
}
String[] fieldNames = new String[fieldCount];
String[] fieldTypes = new String[fieldCount];

// Name of the output table
String outputTable = "tmp_" + UUID.randomUUID().toString().replace('-', '_');

// Copy the field names and types into arrays
for (int j = 0; j < fieldCount; j++) {
fieldNames[j] = fieldName.get(j);
fieldTypes[j] = fieldType.get(j);
System.out.println("====fieldName=====" + fieldNames[j]);
System.out.println("====fieldType=====" + fieldTypes[j]);
}
System.out.println("====fieldCount=====" + fieldCount);

// Create the output Hive table with the same schema as the input table
HCatSchema schema = getHCatSchema(dbName, inputTabName);
createTable(dbName, outputTable, schema);

// Convert the selected input column to upper case and write the result to the output table
JavaSparkContext jsc = new JavaSparkContext(
new SparkConf().setAppName("LowerUpperCaseConvert"));
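// Accumulators count the records read from the input table and written to the output table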
inputDataCount = jsc.accumulator(0);
outputDataCount = jsc.accumulator(0);

// Name and position of the field to convert (here the first field)
operFieldName = fieldNames[0];
System.out.println("====operFieldName======" + operFieldName);
int position = schema.getPosition(operFieldName);

JavaRDD<SerializableWritable<HCatRecord>> rdd1 = LowerUpperCaseConvert
.lowerUpperCaseConvert(jsc, dbName, inputTabName, position);
LowerUpperCaseConvert.storeToTable(rdd1, dbName, outputTable);
jsc.stop();

// Build the parameters for the output xml file
List<Map> listOut = new ArrayList<Map>();
Map<String, String> mapOut = new HashMap<String, String>();
mapOut.put("jobinstanceid", jobinstanceid);
mapOut.put("dbName", dbName);
mapOut.put("outputTable", outputTable);
mapOut.put("inputDataCount", inputDataCount.value().toString());
mapOut.put("outputDataCount", outputDataCount.value().toString());

String operFieldType = fieldTypes[0]; // type of the field being converted
if (operFieldType.equalsIgnoreCase("String")) {
// Write the normal stdout.xml file
listOut.add(mapOut);
String hdfsOutXml = "/user/" + userName + "/optasks/"
+ jobinstanceid + "/" + operatorName + "/out"
+ "/stdout.xml";
operXML.genStdoutXml(hdfsOutXml, listOut);
} else {
// Write the error stderr.xml file
String errorMessage = "fieldType is not string!!!";
String errorCode = "80001";
mapOut.put("errorMessage", errorMessage);
mapOut.put("errorCode", errorCode);
listOut.add(mapOut);
String hdfsErrorXml = "/user/" + userName + "/optasks/"
+ jobinstanceid + "/" + operatorName + "/out"
+ "/stderr.xml";
operXML.genStderrXml(hdfsErrorXml, listOut);
}
System.exit(0);
}

@SuppressWarnings("rawtypes")
public static JavaRDD<SerializableWritable<HCatRecord>> lowerUpperCaseConvert(
JavaSparkContext jsc, String dbName, String inputTabName,
int position) throws IOException {

Configuration inputConf = new Configuration();
SerHCatInputFormat.setInput(inputConf, dbName, inputTabName);

JavaPairRDD<WritableComparable, SerializableWritable> rdd = jsc
.newAPIHadoopRDD(inputConf, SerHCatInputFormat.class,
WritableComparable.class, SerializableWritable.class);

final Broadcast<Integer> posBc = jsc.broadcast(position);
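// posBc is captured by the map function below, so each executor reads the column position from the broadcast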
// Fetch the table record set
JavaRDD<SerializableWritable<HCatRecord>> result = null;
final Accumulator<Integer> output = jsc.accumulator(0);
final Accumulator<Integer> input = jsc.accumulator(0);

result = rdd
.map(new Function<Tuple2<WritableComparable, SerializableWritable>, SerializableWritable<HCatRecord>>() {

private static final long serialVersionUID = -2362812254158054659L;
private final int position = posBc.getValue().intValue();

public SerializableWritable<HCatRecord> call(
Tuple2<WritableComparable, SerializableWritable> v)
throws Exception {
HCatRecord record = (HCatRecord) v._2.value();
// count one record read from the input port
input.add(1);
List<Object> newRecord = new ArrayList<Object>(record
.size());
for (int i = 0; i < record.size(); ++i) {
newRecord.add(record.get(i));
}
/*
 * On success the record is counted on the output port; otherwise it
 * would be counted on the error port.
 */
newRecord.set(position, newRecord.get(position)
.toString().toUpperCase());
output.add(1);
return new SerializableWritable<HCatRecord>(
new DefaultHCatRecord(newRecord)); // return the converted record
}
});
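// Expose the per-job accumulators so main() can report record counts in the output xml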
inputDataCount = input;
outputDataCount = output;
return result;
}

@SuppressWarnings("rawtypes")
public static void storeToTable(
JavaRDD<SerializableWritable<HCatRecord>> rdd, String dbName,
String tblName) {
Job outputJob = null;
try {
outputJob = Job.getInstance();
outputJob.setJobName("lowerUpperCaseConvert");
outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
outputJob.setOutputKeyClass(WritableComparable.class);
outputJob.setOutputValueClass(SerializableWritable.class);
SerHCatOutputFormat.setOutput(outputJob,
OutputJobInfo.create(dbName, tblName, null));
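// Copy the target table's schema from the metastore into the output job configuration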
HCatSchema schema = SerHCatOutputFormat.getTableSchema(outputJob
.getConfiguration());
SerHCatOutputFormat.setSchema(outputJob, schema);
} catch (IOException e) {
e.printStackTrace();
}

// Write the RDD into the target table
rdd.mapToPair(
new PairFunction<SerializableWritable<HCatRecord>, WritableComparable, SerializableWritable<HCatRecord>>() {

private static final long serialVersionUID = -4658431554556766962L;

@Override
public Tuple2<WritableComparable, SerializableWritable<HCatRecord>> call(
SerializableWritable<HCatRecord> record)
throws Exception {
return new Tuple2<WritableComparable, SerializableWritable<HCatRecord>>(
NullWritable.get(), record);
}
}).saveAsNewAPIHadoopDataset(outputJob.getConfiguration());
}

// Create the output table in the Hive metastore using the given schema
public static void createTable(String dbName, String tblName,
HCatSchema schema) {
HiveMetaStoreClient client = null;
try {
HiveConf hiveConf = HCatUtil.getHiveConf(new Configuration());
try {
client = HCatUtil.getHiveClient(hiveConf);
} catch (MetaException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} catch (IOException e) {
e.printStackTrace();
}
try {
if (client.tableExists(dbName, tblName)) {
client.dropTable(dbName, tblName);
}
} catch (TException e) {
e.printStackTrace();
}

List<FieldSchema> fields = HCatUtil.getFieldSchemaList(schema
.getFields());
System.out.println(fields);
Table table = new Table();
table.setDbName(dbName);
table.setTableName(tblName);

StorageDescriptor sd = new StorageDescriptor();
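// Storage descriptor: RCFile input/output formats with ColumnarSerDe, i.e. the equivalent of STORED AS RCFILE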
sd.setCols(fields);
table.setSd(sd);
sd.setInputFormat(RCFileInputFormat.class.getName());
sd.setOutputFormat(RCFileOutputFormat.class.getName());
sd.setParameters(new HashMap<String, String>());
sd.setSerdeInfo(new SerDeInfo());
sd.getSerdeInfo().setName(table.getTableName());
sd.getSerdeInfo().setParameters(new HashMap<String, String>());
sd.getSerdeInfo().getParameters()
.put(serdeConstants.SERIALIZATION_FORMAT, "1");
sd.getSerdeInfo().setSerializationLib(
org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class
.getName());
Map<String, String> tableParams = new HashMap<String, String>();
table.setParameters(tableParams);
try {
client.createTable(table);
System.out.println("Create table successfully!");
} catch (TException e) {
e.printStackTrace();
return;
} finally {
client.close();
}
}

// Obtain the HCatSchema of an existing table via the output job configuration
public static HCatSchema getHCatSchema(String dbName, String tblName) {
Job outputJob = null;
HCatSchema schema = null;
try {
outputJob = Job.getInstance();
outputJob.setJobName("getHCatSchema");
outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
outputJob.setOutputKeyClass(WritableComparable.class);
outputJob.setOutputValueClass(SerializableWritable.class);
SerHCatOutputFormat.setOutput(outputJob,
OutputJobInfo.create(dbName, tblName, null));
schema = SerHCatOutputFormat.getTableSchema(outputJob
.getConfiguration());
} catch (IOException e) {
e.printStackTrace();
}
return schema;
}
}
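The SerHCatInputFormat / SerHCatOutputFormat classes used above come from an in-house library (iie.udps.common.hcatalog) that wraps HCatalog records in Spark's SerializableWritable. For readers without that library, the sketch below shows roughly the same read-transform-write pattern with the stock org.apache.hive.hcatalog.mapreduce classes. It is a minimal, untested sketch: the database and table names ("default", "test", "test_copy") are placeholders, it assumes HCatalog 0.13+ and Spark 1.x style APIs, and unlike the full listing it expects the target table to exist already rather than creating it through HiveMetaStoreClient. Note that the stock HCatRecord is not java.io.Serializable, which is presumably what the Ser* wrappers address; a plain map-and-save pipeline like this one avoids shuffling or caching the records, so it should not need Java serialization of the values.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

@SuppressWarnings("rawtypes")
public class HCatUpperCaseSketch {
    public static void main(String[] args) throws Exception {
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("HCatUpperCaseSketch"));

        // Read the source table through the stock HCatInputFormat.
        Configuration inputConf = new Configuration();
        HCatInputFormat.setInput(inputConf, "default", "test");
        JavaPairRDD<WritableComparable, HCatRecord> input = jsc.newAPIHadoopRDD(
                inputConf, HCatInputFormat.class, WritableComparable.class,
                HCatRecord.class);

        // Prepare the output job; the target table must already exist.
        Job outputJob = Job.getInstance();
        outputJob.setOutputFormatClass(HCatOutputFormat.class);
        HCatOutputFormat.setOutput(outputJob,
                OutputJobInfo.create("default", "test_copy", null));
        HCatOutputFormat.setSchema(outputJob,
                HCatOutputFormat.getTableSchema(outputJob.getConfiguration()));

        // Upper-case column 0 of every record and write it to the target table.
        input.mapToPair(
                new PairFunction<Tuple2<WritableComparable, HCatRecord>, WritableComparable, HCatRecord>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<WritableComparable, HCatRecord> call(
                            Tuple2<WritableComparable, HCatRecord> t) throws Exception {
                        HCatRecord record = t._2();
                        List<Object> copy = new ArrayList<Object>(record.size());
                        for (int i = 0; i < record.size(); ++i) {
                            copy.add(record.get(i));
                        }
                        copy.set(0, copy.get(0).toString().toUpperCase());
                        return new Tuple2<WritableComparable, HCatRecord>(
                                NullWritable.get(), new DefaultHCatRecord(copy));
                    }
                }).saveAsNewAPIHadoopDataset(outputJob.getConfiguration());

        jsc.stop();
    }
}

Counting records with accumulators, creating the target table through HiveMetaStoreClient, and reading the job parameters from stdin.xml are left out of this sketch; those parts are what the full listing above adds on top of the basic pattern.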