Working with Hive tables and their data using Spark + HCatalog
package iie.hadoop.hcatalog.spark;

import iie.udps.common.hcatalog.SerHCatInputFormat;
import iie.udps.common.hcatalog.SerHCatOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.spark.Accumulator;
import org.apache.spark.SerializableWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.RCFileOutputFormat;
import org.apache.hadoop.hive.serde.serdeConstants;
//import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.thrift.TException;
import scala.Tuple2;

/**
 * Spark + HCatalog job that copies a Hive table, converting one column of the
 * source table to upper case before writing it into the new table.
 * Example source table: create table test(name String, age int);
 * Run with: spark-submit --master yarn-cluster --class
 * iie.hadoop.hcatalog.spark.LowerUpperCaseConvert /home/xdf/test.jar -c
 * /user/xdf/stdin.xml
 *
 * @author xiaodongfang
 */
public class LowerUpperCaseConvert {

private static Accumulator<Integer> inputDataCount;
private static Accumulator<Integer> outputDataCount;

@SuppressWarnings("rawtypes")
public static void main(String[] args) throws Exception {
if (args.length < 2) {
System.err.println("Usage: <-c> <stdin.xml>");
System.exit(1);
}

String stdinXml = args[1];
String userName = null;
String jobinstanceid = null;
String operatorName = null;
String dbName = null;
String inputTabName = null;
String operFieldName = null;
int fieldCount = 0;

// Read the stdin.xml parameter file from HDFS
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
FSDataInputStream dis = fs.open(new Path(stdinXml));
InputStreamReader isr = new InputStreamReader(dis, "utf-8");
BufferedReader read = new BufferedReader(isr);
String tempString = "";
String xmlParams = "";
while ((tempString = read.readLine()) != null) {
xmlParams += "\n" + tempString;
}
read.close();
xmlParams = xmlParams.substring(1);

// Extract the parameter values from the xml
OperatorParamXml operXML = new OperatorParamXml();
List<Map> list = operXML.parseStdinXml(xmlParams);
userName = list.get(0).get("userName").toString();
dbName = list.get(0).get("dbName").toString();
inputTabName = list.get(0).get("inputTabName").toString();
operatorName = list.get(0).get("operatorName").toString();
jobinstanceid = list.get(0).get("jobinstanceid").toString();
fieldCount = Integer.parseInt(list.get(0).get("fieldCount").toString());

// Collect the field names and types for the output table
ArrayList<String> fieldName = new ArrayList<String>();
ArrayList<String> fieldType = new ArrayList<String>();
for (int i = 1; i <= fieldCount; i++) {
fieldName.add(list.get(0).get("fieldName" + i).toString());
fieldType.add(list.get(0).get("fieldType" + i).toString());
}
String[] fieldNames = new String[fieldCount];
String[] fieldTypes = new String[fieldCount];

// Name of the output table
String outputTable = "tmp_" + UUID.randomUUID().toString().replace('-', '_');

// Copy the field names and types into arrays
for (int j = 0; j < fieldCount; j++) {
fieldNames[j] = fieldName.get(j);
fieldTypes[j] = fieldType.get(j);
System.out.println("====fieldName=====" + fieldNames[j]);
System.out.println("====fieldType=====" + fieldTypes[j]);
}
System.out.println("====fieldCount=====" + fieldCount);

// Create the output Hive table with the same schema as the input table
HCatSchema schema = getHCatSchema(dbName, inputTabName);
createTable(dbName, outputTable, schema);

// Convert the selected input column to upper case and write the result to the output table
JavaSparkContext jsc = new JavaSparkContext(
new SparkConf().setAppName("LowerUpperCaseConvert"));
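// Accumulators count the records read from the input table and written to the output table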
inputDataCount = jsc.accumulator(0);
outputDataCount = jsc.accumulator(0);

// Name and position of the field to convert (here the first field)
operFieldName = fieldNames[0];
System.out.println("====operFieldName======" + operFieldName);
int position = schema.getPosition(operFieldName);

JavaRDD<SerializableWritable<HCatRecord>> rdd1 = LowerUpperCaseConvert
.lowerUpperCaseConvert(jsc, dbName, inputTabName, position);
LowerUpperCaseConvert.storeToTable(rdd1, dbName, outputTable);
jsc.stop();

// Build the parameters for the output xml file
List<Map> listOut = new ArrayList<Map>();
Map<String, String> mapOut = new HashMap<String, String>();
mapOut.put("jobinstanceid", jobinstanceid);
mapOut.put("dbName", dbName);
mapOut.put("outputTable", outputTable);
mapOut.put("inputDataCount", inputDataCount.value().toString());
mapOut.put("outputDataCount", outputDataCount.value().toString());

String operFieldType = fieldTypes[0]; // type of the field being converted
if (operFieldType.equalsIgnoreCase("String")) {
// Write the normal stdout.xml file
listOut.add(mapOut);
String hdfsOutXml = "/user/" + userName + "/optasks/"
+ jobinstanceid + "/" + operatorName + "/out"
+ "/stdout.xml";
operXML.genStdoutXml(hdfsOutXml, listOut);
} else {
// Write the error stderr.xml file
String errorMessage = "fieldType is not string!!!";
String errorCode = "80001";
mapOut.put("errorMessage", errorMessage);
mapOut.put("errorCode", errorCode);
listOut.add(mapOut);
String hdfsErrorXml = "/user/" + userName + "/optasks/"
+ jobinstanceid + "/" + operatorName + "/out"
+ "/stderr.xml";
operXML.genStderrXml(hdfsErrorXml, listOut);
}
System.exit(0);
}

@SuppressWarnings("rawtypes")
public static JavaRDD<SerializableWritable<HCatRecord>> lowerUpperCaseConvert(
JavaSparkContext jsc, String dbName, String inputTabName,
int position) throws IOException {

Configuration inputConf = new Configuration();
SerHCatInputFormat.setInput(inputConf, dbName, inputTabName);

JavaPairRDD<WritableComparable, SerializableWritable> rdd = jsc
.newAPIHadoopRDD(inputConf, SerHCatInputFormat.class,
WritableComparable.class, SerializableWritable.class);

final Broadcast<Integer> posBc = jsc.broadcast(position);
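// posBc is captured by the map function below, so each executor reads the column position from the broadcast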
// Fetch the table record set
JavaRDD<SerializableWritable<HCatRecord>> result = null;
final Accumulator<Integer> output = jsc.accumulator(0);
final Accumulator<Integer> input = jsc.accumulator(0);

result = rdd
.map(new Function<Tuple2<WritableComparable, SerializableWritable>, SerializableWritable<HCatRecord>>() {

private static final long serialVersionUID = -2362812254158054659L;
private final int position = posBc.getValue().intValue();

public SerializableWritable<HCatRecord> call(
Tuple2<WritableComparable, SerializableWritable> v)
throws Exception {
HCatRecord record = (HCatRecord) v._2.value();
// count one record read from the input port
input.add(1);
List<Object> newRecord = new ArrayList<Object>(record
.size());
for (int i = 0; i < record.size(); ++i) {
newRecord.add(record.get(i));
}
/*
 * On success the record is counted on the output port; otherwise it
 * would be counted on the error port.
 */
newRecord.set(position, newRecord.get(position)
.toString().toUpperCase());
output.add(1);
return new SerializableWritable<HCatRecord>(
new DefaultHCatRecord(newRecord)); // return the converted record
}
});
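// Expose the per-job accumulators so main() can report record counts in the output xml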
inputDataCount = input;
outputDataCount = output;
return result;
}

@SuppressWarnings("rawtypes")
public static void storeToTable(
JavaRDD<SerializableWritable<HCatRecord>> rdd, String dbName,
String tblName) {
Job outputJob = null;
try {
outputJob = Job.getInstance();
outputJob.setJobName("lowerUpperCaseConvert");
outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
outputJob.setOutputKeyClass(WritableComparable.class);
outputJob.setOutputValueClass(SerializableWritable.class);
SerHCatOutputFormat.setOutput(outputJob,
OutputJobInfo.create(dbName, tblName, null));
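// Copy the target table's schema from the metastore into the output job configuration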
HCatSchema schema = SerHCatOutputFormat.getTableSchema(outputJob
.getConfiguration());
SerHCatOutputFormat.setSchema(outputJob, schema);
} catch (IOException e) {
e.printStackTrace();
}

// Write the RDD into the target table
rdd.mapToPair(
new PairFunction<SerializableWritable<HCatRecord>, WritableComparable, SerializableWritable<HCatRecord>>() {

private static final long serialVersionUID = -4658431554556766962L;

@Override
public Tuple2<WritableComparable, SerializableWritable<HCatRecord>> call(
SerializableWritable<HCatRecord> record)
throws Exception {
return new Tuple2<WritableComparable, SerializableWritable<HCatRecord>>(
NullWritable.get(), record);
}
}).saveAsNewAPIHadoopDataset(outputJob.getConfiguration());
}

// Create the output table in the Hive metastore using the given schema
public static void createTable(String dbName, String tblName,
HCatSchema schema) {
HiveMetaStoreClient client = null;
try {
HiveConf hiveConf = HCatUtil.getHiveConf(new Configuration());
try {
client = HCatUtil.getHiveClient(hiveConf);
} catch (MetaException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} catch (IOException e) {
e.printStackTrace();
}
try {
if (client.tableExists(dbName, tblName)) {
client.dropTable(dbName, tblName);
}
} catch (TException e) {
e.printStackTrace();
}

List<FieldSchema> fields = HCatUtil.getFieldSchemaList(schema
.getFields());
System.out.println(fields);
Table table = new Table();
table.setDbName(dbName);
table.setTableName(tblName);

StorageDescriptor sd = new StorageDescriptor();
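// Storage descriptor: RCFile input/output formats with ColumnarSerDe, i.e. the equivalent of STORED AS RCFILE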
sd.setCols(fields);
table.setSd(sd);
sd.setInputFormat(RCFileInputFormat.class.getName());
sd.setOutputFormat(RCFileOutputFormat.class.getName());
sd.setParameters(new HashMap<String, String>());
sd.setSerdeInfo(new SerDeInfo());
sd.getSerdeInfo().setName(table.getTableName());
sd.getSerdeInfo().setParameters(new HashMap<String, String>());
sd.getSerdeInfo().getParameters()
.put(serdeConstants.SERIALIZATION_FORMAT, "1");
sd.getSerdeInfo().setSerializationLib(
org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class
.getName());
Map<String, String> tableParams = new HashMap<String, String>();
table.setParameters(tableParams);
try {
client.createTable(table);
System.out.println("Create table successfully!");
} catch (TException e) {
e.printStackTrace();
return;
} finally {
client.close();
}
}

// Obtain the HCatSchema of an existing table via the output job configuration
public static HCatSchema getHCatSchema(String dbName, String tblName) {
Job outputJob = null;
HCatSchema schema = null;
try {
outputJob = Job.getInstance();
outputJob.setJobName("getHCatSchema");
outputJob.setOutputFormatClass(SerHCatOutputFormat.class);
outputJob.setOutputKeyClass(WritableComparable.class);
outputJob.setOutputValueClass(SerializableWritable.class);
SerHCatOutputFormat.setOutput(outputJob,
OutputJobInfo.create(dbName, tblName, null));
schema = SerHCatOutputFormat.getTableSchema(outputJob
.getConfiguration());
} catch (IOException e) {
e.printStackTrace();
}
return schema;
}
}
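The SerHCatInputFormat / SerHCatOutputFormat classes used above come from an in-house library (iie.udps.common.hcatalog) that wraps HCatalog records in Spark's SerializableWritable. For readers without that library, the sketch below shows roughly the same read-transform-write pattern with the stock org.apache.hive.hcatalog.mapreduce classes. It is a minimal, untested sketch: the database and table names ("default", "test", "test_copy") are placeholders, it assumes HCatalog 0.13+ and Spark 1.x style APIs, and unlike the full listing it expects the target table to exist already rather than creating it through HiveMetaStoreClient. Note that the stock HCatRecord is not java.io.Serializable, which is presumably what the Ser* wrappers address; a plain map-and-save pipeline like this one avoids shuffling or caching the records, so it should not need Java serialization of the values.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

@SuppressWarnings("rawtypes")
public class HCatUpperCaseSketch {
    public static void main(String[] args) throws Exception {
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("HCatUpperCaseSketch"));

        // Read the source table through the stock HCatInputFormat.
        Configuration inputConf = new Configuration();
        HCatInputFormat.setInput(inputConf, "default", "test");
        JavaPairRDD<WritableComparable, HCatRecord> input = jsc.newAPIHadoopRDD(
                inputConf, HCatInputFormat.class, WritableComparable.class,
                HCatRecord.class);

        // Prepare the output job; the target table must already exist.
        Job outputJob = Job.getInstance();
        outputJob.setOutputFormatClass(HCatOutputFormat.class);
        HCatOutputFormat.setOutput(outputJob,
                OutputJobInfo.create("default", "test_copy", null));
        HCatOutputFormat.setSchema(outputJob,
                HCatOutputFormat.getTableSchema(outputJob.getConfiguration()));

        // Upper-case column 0 of every record and write it to the target table.
        input.mapToPair(
                new PairFunction<Tuple2<WritableComparable, HCatRecord>, WritableComparable, HCatRecord>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<WritableComparable, HCatRecord> call(
                            Tuple2<WritableComparable, HCatRecord> t) throws Exception {
                        HCatRecord record = t._2();
                        List<Object> copy = new ArrayList<Object>(record.size());
                        for (int i = 0; i < record.size(); ++i) {
                            copy.add(record.get(i));
                        }
                        copy.set(0, copy.get(0).toString().toUpperCase());
                        return new Tuple2<WritableComparable, HCatRecord>(
                                NullWritable.get(), new DefaultHCatRecord(copy));
                    }
                }).saveAsNewAPIHadoopDataset(outputJob.getConfiguration());

        jsc.stop();
    }
}

Counting records with accumulators, creating the target table through HiveMetaStoreClient, and reading the job parameters from stdin.xml are left out of this sketch; those parts are what the full listing above adds on top of the basic pattern.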