spark+hcatalog操作hive表及其数据

package iie.hadoop.hcatalog.spark;

import iie.udps.common.hcatalog.SerHCatInputFormat;

import iie.udps.common.hcatalog.SerHCatOutputFormat;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.UUID;

import org.apache.hive.hcatalog.common.HCatUtil;

import org.apache.hive.hcatalog.data.DefaultHCatRecord;

import org.apache.hive.hcatalog.data.HCatRecord;

import org.apache.hive.hcatalog.data.schema.HCatSchema;

import org.apache.spark.Accumulator;

import org.apache.spark.SerializableWritable;

import org.apache.spark.SparkConf;

import org.apache.spark.api.java.JavaPairRDD;

import org.apache.spark.api.java.JavaRDD;

import org.apache.spark.api.java.JavaSparkContext;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.hive.conf.HiveConf;

import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;

import org.apache.hadoop.hive.metastore.api.FieldSchema;

import org.apache.hadoop.hive.metastore.api.MetaException;

import org.apache.hadoop.hive.metastore.api.SerDeInfo;

import org.apache.hadoop.hive.metastore.api.StorageDescriptor;

import org.apache.hadoop.hive.metastore.api.Table;

import org.apache.hadoop.hive.ql.io.RCFileInputFormat;

import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;

import org.apache.hadoop.hive.serde.serdeConstants;

//import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;

import org.apache.hadoop.io.WritableComparable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;

import org.apache.spark.api.java.function.Function;

import org.apache.spark.api.java.function.PairFunction;

import org.apache.spark.broadcast.Broadcast;

import org.apache.thrift.TException;

import scala.Tuple2;

/**
 * Spark + HCatalog job that copies a Hive table into a newly created table,
 * converting the values of one column to upper case on the way.
 *
 * <p>Example source table: {@code create table test(name String, age int);}
 *
 * <p>Example invocation: {@code spark-submit --master yarn-cluster --class
 * iie.hadoop.hcatalog.spark.LowerUpperCaseConvert /home/xdf/test.jar -c
 * /user/xdf/stdin.xml}
 *
 * @author xiaodongfang
 */
public class LowerUpperCaseConvert {

	/**
	 * Number of records read from the input table. Assigned by
	 * {@link #lowerUpperCaseConvert}; only meaningful after the job has run.
	 */
	private static Accumulator<Integer> inputDataCount;

	/**
	 * Number of records emitted for the output table. Assigned by
	 * {@link #lowerUpperCaseConvert}; only meaningful after the job has run.
	 */
	private static Accumulator<Integer> outputDataCount;

	/**
	 * Entry point. Reads the operator parameters from the stdin.xml file on the
	 * Hadoop file system, then either runs the conversion job and writes
	 * stdout.xml, or writes stderr.xml when the configured field type is not
	 * {@code String}.
	 *
	 * <p>The field type is validated <em>before</em> the output table is
	 * created and the Spark job is started, so a rejected request leaves no
	 * table behind and does not rewrite a non-string column as strings.
	 *
	 * @param args {@code args[1]} is the path of stdin.xml ({@code args[0]} is
	 *            the {@code -c} flag and is not inspected)
	 * @throws Exception if the parameters are missing or invalid, or if any
	 *             step of the job fails
	 */
	@SuppressWarnings("rawtypes")
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: <-c> <stdin.xml>");
			System.exit(1);
		}
		String stdinXml = args[1];

		// Read stdin.xml and extract the operator parameters.
		OperatorParamXml operXML = new OperatorParamXml();
		List<Map> list = operXML.parseStdinXml(readHdfsText(stdinXml));
		if (list == null || list.isEmpty()) {
			throw new IllegalArgumentException(
					"No parameters found in " + stdinXml);
		}
		Map params = list.get(0);
		String userName = requireParam(params, "userName");
		String dbName = requireParam(params, "dbName");
		String inputTabName = requireParam(params, "inputTabName");
		String operatorName = requireParam(params, "operatorName");
		String jobinstanceid = requireParam(params, "jobinstanceid");
		int fieldCount = Integer.parseInt(requireParam(params, "fieldCount"));
		if (fieldCount < 1) {
			// The first configured field is the one that gets converted.
			throw new IllegalArgumentException(
					"fieldCount must be at least 1 but was " + fieldCount);
		}

		// Field names and types are stored under 1-based keys.
		String[] fieldNames = new String[fieldCount];
		String[] fieldTypes = new String[fieldCount];
		for (int i = 0; i < fieldCount; i++) {
			fieldNames[i] = requireParam(params, "fieldName" + (i + 1));
			fieldTypes[i] = requireParam(params, "fieldType" + (i + 1));
		}
		for (int j = 0; j < fieldCount; j++) {
			System.out.println("====fieldName=====" + fieldNames[j]);
			System.out.println("====fieldType=====" + fieldTypes[j]);
		}
		System.out.println("====fieldCount=====" + fieldCount);

		// Name of the output table.
		String outputTable = "tmp_"
				+ UUID.randomUUID().toString().replace('-', '_');

		// Name and declared type of the field to convert.
		String operFieldName = fieldNames[0];
		String operFieldType = fieldTypes[0];
		System.out.println("====operFieldName======" + operFieldName);

		String outDir = "/user/" + userName + "/optasks/" + jobinstanceid
				+ "/" + operatorName + "/out";

		// Parameters reported in the output xml file.
		List<Map> listOut = new ArrayList<Map>();
		Map<String, String> mapOut = new HashMap<String, String>();
		mapOut.put("jobinstanceid", jobinstanceid);
		mapOut.put("dbName", dbName);
		mapOut.put("outputTable", outputTable);

		if (operFieldType.equalsIgnoreCase("String")) {
			// Create the output table with the same columns as the input.
			HCatSchema schema = getHCatSchema(dbName, inputTabName);
			Integer position = schema.getPosition(operFieldName);
			if (position == null) {
				throw new IllegalArgumentException("Field '" + operFieldName
						+ "' does not exist in table " + dbName + "."
						+ inputTabName);
			}
			createTable(dbName, outputTable, schema);

			// Upper-case the field and write the records to the output table.
			JavaSparkContext jsc = new JavaSparkContext(
					new SparkConf().setAppName("LowerUpperCaseConvert"));
			try {
				JavaRDD<SerializableWritable<HCatRecord>> rdd1 = LowerUpperCaseConvert
						.lowerUpperCaseConvert(jsc, dbName, inputTabName,
								position.intValue());
				LowerUpperCaseConvert.storeToTable(rdd1, dbName, outputTable);
			} finally {
				// Release the context even when the job fails.
				jsc.stop();
			}

			// Write the normal output xml file.
			mapOut.put("inputDataCount", inputDataCount.value().toString());
			mapOut.put("outputDataCount", outputDataCount.value().toString());
			listOut.add(mapOut);
			operXML.genStdoutXml(outDir + "/stdout.xml", listOut);
		} else {
			// Write the error output xml file. No job ran, so both counts are
			// zero and the table named by "outputTable" was never created; the
			// key set is kept unchanged so genStderrXml receives the same
			// shape of map as before.
			mapOut.put("inputDataCount", "0");
			mapOut.put("outputDataCount", "0");
			mapOut.put("errorMessage", "fieldType is not string!!!");
			// The key spelling "errotCode" is kept as-is: it is a runtime key
			// that consumers of the xml may depend on.
			mapOut.put("errotCode", "80001");
			listOut.add(mapOut);
			operXML.genStderrXml(outDir + "/stderr.xml", listOut);
		}
		System.exit(0);
	}

	/**
	 * Reads a UTF-8 text file from the default Hadoop file system and returns
	 * its lines joined with {@code '\n'} (no trailing newline). An empty file
	 * yields an empty string.
	 */
	private static String readHdfsText(String path) throws IOException {
		FileSystem fs = FileSystem.get(new Configuration());
		FSDataInputStream in = fs.open(new Path(path));
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					in, "utf-8"));
			StringBuilder text = new StringBuilder();
			boolean first = true;
			String line;
			while ((line = reader.readLine()) != null) {
				if (!first) {
					text.append('\n');
				}
				text.append(line);
				first = false;
			}
			return text.toString();
		} finally {
			in.close();
		}
	}

	/**
	 * Returns the string form of a mandatory parameter, failing with a message
	 * that names the missing key instead of a bare NullPointerException.
	 */
	@SuppressWarnings("rawtypes")
	private static String requireParam(Map params, String key) {
		Object value = params.get(key);
		if (value == null) {
			throw new IllegalArgumentException("Missing parameter '" + key
					+ "' in stdin.xml");
		}
		return value.toString();
	}

	/**
	 * Builds an RDD over the input table in which the column at
	 * {@code position} has been converted to upper case. NULL column values
	 * are passed through unchanged.
	 *
	 * <p>As a side effect this assigns the class-level record counters. The
	 * counters are updated inside a {@code map} transformation, so they are
	 * only populated once an action has run on the returned RDD.
	 * NOTE(review): Spark may apply such updates more than once if a task is
	 * re-executed, so treat the counts as approximate on retries.
	 *
	 * @param jsc          Spark context used to read the table
	 * @param dbName       database of the input table
	 * @param inputTabName name of the input table
	 * @param position     zero-based index of the column to upper-case
	 * @return the transformed records
	 * @throws IOException if the HCatalog input cannot be configured
	 */
	@SuppressWarnings("rawtypes")
	public static JavaRDD<SerializableWritable<HCatRecord>> lowerUpperCaseConvert(
			JavaSparkContext jsc, String dbName, String inputTabName,
			int position) throws IOException {
		Configuration inputConf = new Configuration();
		SerHCatInputFormat.setInput(inputConf, dbName, inputTabName);

		JavaPairRDD<WritableComparable, SerializableWritable> rdd = jsc
				.newAPIHadoopRDD(inputConf, SerHCatInputFormat.class,
						WritableComparable.class, SerializableWritable.class);

		// Captured by the function below and shipped with it to the executors.
		final int columnIndex = position;
		final Accumulator<Integer> output = jsc.accumulator(0);
		final Accumulator<Integer> input = jsc.accumulator(0);

		JavaRDD<SerializableWritable<HCatRecord>> result = rdd
				.map(new Function<Tuple2<WritableComparable, SerializableWritable>, SerializableWritable<HCatRecord>>() {
					private static final long serialVersionUID = -2362812254158054659L;

					public SerializableWritable<HCatRecord> call(
							Tuple2<WritableComparable, SerializableWritable> v)
							throws Exception {
						HCatRecord record = (HCatRecord) v._2.value();
						input.add(1);

						// Copy the record so the source object is not mutated.
						List<Object> newRecord = new ArrayList<Object>(record
								.size());
						for (int i = 0; i < record.size(); ++i) {
							newRecord.add(record.get(i));
						}

						// Hive columns may hold NULL; leave those untouched
						// instead of failing the task with an NPE.
						Object value = newRecord.get(columnIndex);
						if (value != null) {
							newRecord.set(columnIndex, value.toString()
									.toUpperCase());
						}
						output.add(1);
						return new SerializableWritable<HCatRecord>(
								new DefaultHCatRecord(newRecord));
					}
				});

		inputDataCount = input;
		outputDataCount = output;
		return result;
	}

	/**
	 * Writes the records of {@code rdd} into an existing Hive table through
	 * the HCatalog output format.
	 *
	 * @param rdd     records to store
	 * @param dbName  database of the target table
	 * @param tblName name of the target table
	 * @throws IllegalStateException if the HCatalog output cannot be
	 *             configured (the original IOException is the cause)
	 */
	@SuppressWarnings("rawtypes")
	public static void storeToTable(
			JavaRDD<SerializableWritable<HCatRecord>> rdd, String dbName,
			String tblName) {
		Job outputJob;
		try {
			outputJob = Job.getInstance();
			outputJob.setJobName("lowerUpperCaseConvert");
			outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
			outputJob.setOutputKeyClass(WritableComparable.class);
			outputJob.setOutputValueClass(SerializableWritable.class);
			SerHCatOutputFormat.setOutput(outputJob,
					OutputJobInfo.create(dbName, tblName, null));
			HCatSchema schema = SerHCatOutputFormat.getTableSchema(outputJob
					.getConfiguration());
			SerHCatOutputFormat.setSchema(outputJob, schema);
		} catch (IOException e) {
			// Saving with a half-configured job would fail later with a far
			// less useful error, so stop here and keep the cause.
			throw new IllegalStateException(
					"Failed to configure HCatalog output for " + dbName + "."
							+ tblName, e);
		}

		// Store the RDD in the target table; the key is not used.
		rdd.mapToPair(
				new PairFunction<SerializableWritable<HCatRecord>, WritableComparable, SerializableWritable<HCatRecord>>() {
					private static final long serialVersionUID = -4658431554556766962L;

					@Override
					public Tuple2<WritableComparable, SerializableWritable<HCatRecord>> call(
							SerializableWritable<HCatRecord> record)
							throws Exception {
						return new Tuple2<WritableComparable, SerializableWritable<HCatRecord>>(
								NullWritable.get(), record);
					}
				}).saveAsNewAPIHadoopDataset(outputJob.getConfiguration());
	}

	/**
	 * Creates an RCFile-backed Hive table with the columns of {@code schema},
	 * dropping any existing table of the same name first.
	 *
	 * @param dbName  database in which to create the table
	 * @param tblName name of the table to create
	 * @param schema  schema whose fields become the table columns
	 * @throws IllegalStateException if the metastore cannot be reached or the
	 *             table cannot be dropped/created (the original exception is
	 *             the cause)
	 */
	public static void createTable(String dbName, String tblName,
			HCatSchema schema) {
		HiveMetaStoreClient client;
		try {
			HiveConf hiveConf = HCatUtil.getHiveConf(new Configuration());
			client = HCatUtil.getHiveClient(hiveConf);
		} catch (MetaException e) {
			throw new IllegalStateException(
					"Failed to connect to the Hive metastore", e);
		} catch (IOException e) {
			throw new IllegalStateException(
					"Failed to connect to the Hive metastore", e);
		}

		try {
			if (client.tableExists(dbName, tblName)) {
				client.dropTable(dbName, tblName);
			}

			List<FieldSchema> fields = HCatUtil.getFieldSchemaList(schema
					.getFields());
			System.out.println(fields);

			Table table = new Table();
			table.setDbName(dbName);
			table.setTableName(tblName);

			SerDeInfo serdeInfo = new SerDeInfo();
			serdeInfo.setName(table.getTableName());
			Map<String, String> serdeParams = new HashMap<String, String>();
			serdeParams.put(serdeConstants.SERIALIZATION_FORMAT, "1");
			serdeInfo.setParameters(serdeParams);
			serdeInfo
					.setSerializationLib(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class
							.getName());

			StorageDescriptor sd = new StorageDescriptor();
			sd.setCols(fields);
			sd.setInputFormat(RCFileInputFormat.class.getName());
			sd.setOutputFormat(RCFileOutputFormat.class.getName());
			sd.setParameters(new HashMap<String, String>());
			sd.setSerdeInfo(serdeInfo);
			table.setSd(sd);

			table.setParameters(new HashMap<String, String>());

			client.createTable(table);
			System.out.println("Create table successfully!");
		} catch (TException e) {
			// The caller writes into this table next, so a failure here must
			// not be swallowed.
			throw new IllegalStateException("Failed to create table "
					+ dbName + "." + tblName, e);
		} finally {
			client.close();
		}
	}

	/**
	 * Looks up the HCatalog schema of a table by configuring a throw-away
	 * output job against it and reading the table schema back.
	 *
	 * @param dbName  database of the table
	 * @param tblName name of the table
	 * @return the table schema, never {@code null}
	 * @throws IllegalStateException if the schema cannot be read (the original
	 *             IOException is the cause)
	 */
	public static HCatSchema getHCatSchema(String dbName, String tblName) {
		try {
			Job outputJob = Job.getInstance();
			outputJob.setJobName("getHCatSchema");
			outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
			outputJob.setOutputKeyClass(WritableComparable.class);
			outputJob.setOutputValueClass(SerializableWritable.class);
			SerHCatOutputFormat.setOutput(outputJob,
					OutputJobInfo.create(dbName, tblName, null));
			return SerHCatOutputFormat.getTableSchema(outputJob
					.getConfiguration());
		} catch (IOException e) {
			throw new IllegalStateException("Failed to read schema of "
					+ dbName + "." + tblName, e);
		}
	}

}

spark+hcatalog操作hive表及其数据的更多相关文章

spark2.3.0 配置spark sql 操作hive
spark可以通过读取hive的元数据来兼容hive,读取hive的表数据,然后在spark引擎中进行sql统计分析,从而,通过spark sql与hive结合实现数据分析将成为一种最佳实践.配置步骤 ...
通过 Spark R 操作 Hive
作为数据工程师,我日常用的主力语言是R,HiveQL,Java与Scala.R是非常适合做数据清洗的脚本语言,并且有非常好用的服务端IDE——RStudio Server:而用户日志主要储存在hive ...
luigi操作hive表
关于luigi框架下查询hive表的操作 class JoinQuery(HiveQueryTask): date=luigi.DateParameter() def hiveconfs(self): ...
22.把hive表中数据导入到mysql中
先通过可视化工具链接mysql,在链接的时候用sqoop 用户登录在数据库userdb下新建表保存,输入表名upflow 现在我们需要把hive里面的数据通过sqoop导入到mysql里面 sqo ...
Spark SQL 操作Hive 数据
Spark 2.0以前版本:val sparkConf = new SparkConf().setAppName("soyo") val spark = new SparkC ...
用java代码调用shell脚本执行sqoop将hive表中数据导出到mysql
1:创建shell脚本 touch sqoop_options.sh chmod 777 sqoop_options.sh 编辑文件特地将执行map的个数设置为变量测试可以java代码传参数 ...
Spark:spark df插入hive表后小文件数量多，如何合并？
在做spark开发过程中,时不时的就有可能遇到租户的hive库目录下的文件个数超出了最大限制问题. 一般情况下通过hive的参数设置: val conf = new SparkConf().setAp ...
spark sql 查询hive表并写入到PG中
import java.sql.DriverManager import java.util.Properties import com.zhaopin.tools.{DateUtils, TextU ...
大数据学习day25------spark08-----1. 读取数据库的形式创建DataFrame 2. Parquet格式的数据源 3. Orc格式的数据源 4.spark_sql整合hive 5.在IDEA中编写spark程序（用来操作hive） 6. SQL风格和DSL风格以及RDD的形式计算连续登陆三天的用户
1. 读取数据库的形式创建DataFrame DataFrameFromJDBC object DataFrameFromJDBC { def main(args: Array[String]): U ...

随机推荐

mysql 游标取值为空的问题
DELIMITER $$ DROP PROCEDURE IF EXISTS updatePic $$ CREATE PROCEDURE updatePic() BEGIN DECLARE cover_ ...
Java: constructor 构造代码块
构造器与类同名每个类可以有一个以上的构造器构造器可以有0个,1个或者多个参数构造器没有返回值,不可以写return 构造器总是伴随着new操作一起调用构造函数的作用:可以用于给对象进行初始化 ...
使用SMSManager短信管理器实现短信群发
import java.util.ArrayList; import android.os.Bundle;import android.provider.ContactsContract;import ...
iOS 登陆的实现四种方式
iOS 登陆的实现四种方式一. 网页加载: http://www.cnblogs.com/tekkaman/archive/2013/02/21/2920218.html [iOS登陆的实现] A ...
wp8.1 Study9：针对不同的屏幕和手机方向调整UI
一.预备知识现在不同屏幕大小WP8.1手机越来越多,那么在设计UI时,这需要我们考虑这个问题.在WP中,比例因子(a scale factor)能很好的解决问题,而且在微软系统的PC/平板/手机都是 ...
配置Java环境-20160613
http://jingyan.baidu.com/article/870c6fc33e62bcb03fe4be90.html 1.安装JDK,参照目录在D:\Program Files\ec ...
C++指针（部分有误需修改）
一.取地址运算符&(内存地址) C++编译的程序占用的内存分为以下几个部分: 1．栈区:由编译器自动分配释放 ,存放函数的参数值,局部变量的值等.其操作方式类似于数据结构中的栈.与其它分区不同 ...
iOS应用崩溃日志分析
转自raywenderlich 作为一名应用开发者,你是否有过如下经历? 为确保你的应用正确无误,在将其提交到应用商店之前,你必定进行了大量的测试工作.它在你的设备上也运行得很好,但是,上了应 ...
Xrun 将 app 转化为 IPA
xcodebuild命令行打包,在使用xcodebuild编译后发现有些东西有些临时性质的东西,依然存在,搜索了一些资料,找到有clean的命令:在之前打包都是生成app文件,将app打包成ipa文件 ...
未来WEB程序员
作为一名程序员,如果你想在这个领域内继续向前进步或者在当前的经济形势下保持不被炒鱿鱼,那么你就决不应当自满自足,你需要继续学习.近日,著名IT评论员Justin James在他的博客中列出了未来五年程 ...

spark+hcatalog操作hive表及其数据

spark+hcatalog操作hive表及其数据的更多相关文章

随机推荐

热门专题