Real-time data ingestion into Hive with Kafka + HBase + Hive
Overall architecture:
The goal of the project is that, once the configuration is maintained in MySQL, data is ingested into Hive automatically; landing the data in Hive makes later analysis straightforward.
First, the Kafka topic, bootstrap servers, target ingestion tables and related settings are configured in MySQL, and the Java program loads this configuration when it starts.
Next, the consumer reads the configured topic, parses the table fields defined for that topic, and puts each record into HBase.
Finally, a mapping table is created in Hive; after refreshing the table metadata, the data can be queried quickly through Impala.
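To make the first step concrete: registering a topic for ingestion amounts to inserting a row into the MySQL configuration table read by DBUtils further below. A minimal, hypothetical sketch — the table and column names (topic_info, topic_id, servers, topic, type) come from the queries in DBUtils, while the JDBC URL, credentials, topic name and values are placeholders:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class RegisterTopicExample {
    public static void main(String[] args) throws Exception {
        // Placeholder MySQL URL and credentials -- replace with the real configuration database.
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://mysql-host:3306/etl_config", "etl", "secret");
             PreparedStatement ps = conn.prepareStatement(
                     "INSERT INTO topic_info (topic_id, servers, topic, type) VALUES (?, ?, ?, ?)")) {
            ps.setString(1, "1");                                      // id the consumer is started with
            ps.setString(2, "kafka01:9092,kafka02:9092,kafka03:9092"); // Kafka bootstrap servers
            ps.setString(3, "iotavrodata");                            // topic to consume (made-up name)
            ps.setString(4, "avro");                                   // payload type
            ps.executeUpdate();
        }
    }
}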
1. Integrating HBase with Hive
-- Run the following statement in Hive to create the Hive-to-HBase mapping table
CREATE EXTERNAL TABLE dl.tg_iotavrodata_hh_test(
rk string -- maps to the HBase rowkey
,PID string
,RESOURCE_NO string
,CREATE_USER string
,CREATE_DATE string
,PP_01_SENSORNO string
,PP_01_VALUE string
,PP_02_SENSORNO string
,PP_02_VALUE string
,PP_03_SENSORNO string
,PP_03_VALUE string
,PP_04_SENSORNO string
,PP_04_VALUE string
,PP_05_SENSORNO string
,PP_05_VALUE string
,PP_06_SENSORNO string
,PP_06_VALUE string
,PP_07_SENSORNO string
,PP_07_VALUE string
,PP_08_SENSORNO string
,PP_08_VALUE string
,PP_09_SENSORNO string
,PP_09_VALUE string
,PP_10_SENSORNO string
,PP_10_VALUE string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
'hbase.columns.mapping' = ':key,baseInfo:PID,baseInfo:RESOURCE_NO,baseInfo:CREATE_USER,baseInfo:CREATE_DATE,baseInfo:PP_01_SENSORNO,baseInfo:PP_01_VALUE,baseInfo:PP_02_SENSORNO,baseInfo:PP_02_VALUE,baseInfo:PP_03_SENSORNO,baseInfo:PP_03_VALUE,baseInfo:PP_04_SENSORNO,baseInfo:PP_04_VALUE,baseInfo:PP_05_SENSORNO,baseInfo:PP_05_VALUE,baseInfo:PP_06_SENSORNO,baseInfo:PP_06_VALUE,baseInfo:PP_07_SENSORNO,baseInfo:PP_07_VALUE,baseInfo:PP_08_SENSORNO,baseInfo:PP_08_VALUE,baseInfo:PP_09_SENSORNO,baseInfo:PP_09_VALUE,baseInfo:PP_10_SENSORNO,baseInfo:PP_10_VALUE'
)
TBLPROPERTIES ('hbase.table.name'='dl.tg_test');
Note: 'hbase.table.name'='dl.tg_test' is the underlying HBase table. In the column mapping, entries such as baseInfo:PID use baseInfo as the column family defined on the HBase side, and :key maps the rk column to the HBase rowkey.
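Once the consumer (shown later) is writing rows into the HBase table, the mapped Hive table can be queried from Impala after a metadata refresh. A minimal, hedged sketch of that step over JDBC — the driver class name, host and port are assumptions based on Cloudera's Impala JDBC connector; adjust them for your driver version and add Kerberos parameters on a secured cluster:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class ImpalaQueryExample {
    public static void main(String[] args) throws Exception {
        // Assumed Impala JDBC driver class and default Impala JDBC port (21050).
        Class.forName("com.cloudera.impala.jdbc41.Driver");
        try (Connection conn = DriverManager.getConnection("jdbc:impala://impala-host:21050/dl");
             Statement stmt = conn.createStatement()) {
            // Let Impala pick up the newly created Hive-on-HBase mapping table
            stmt.execute("INVALIDATE METADATA dl.tg_iotavrodata_hh_test");
            try (ResultSet rs = stmt.executeQuery(
                    "SELECT rk, PID, PP_01_SENSORNO, PP_01_VALUE FROM dl.tg_iotavrodata_hh_test LIMIT 10")) {
                while (rs.next()) {
                    System.out.println(rs.getString("rk") + " " + rs.getString("PID"));
                }
            }
        }
    }
}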
2. HBase utility class
Implements HBase API operations over a Kerberos-authenticated connection.
package cn.lin.utils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.security.UserGroupInformation;
import java.io.IOException;
public class HBaseUtils {
    private static Configuration conf = null;

    // The HBase connection settings below can be taken directly from hbase-site.xml.
    static {
        // krb5.conf holds the Kerberos client configuration (KDC address, default realm, etc.).
        // Without it the client cannot locate the KDC during authentication.
        // This file is copied down from the remote server.
        System.setProperty("java.security.krb5.conf", "E:/krb5.conf");
        conf = HBaseConfiguration.create();
        conf.set("hadoop.security.authentication", "Kerberos");
        // The keytab file is also copied from the remote server; it stores the credential,
        // so no interactive password entry is needed.
        conf.set("keytab.file", "E:/etl.keytab");
        // This is effectively the user name, i.e. the Kerberos principal.
        conf.set("kerberos.principal", "hive@XXXXXXX");
        conf.set("hbase.master.kerberos.principal", "hbase/_HOST@XXXXXXX");
        conf.set("hbase.regionserver.kerberos.principal", "hbase/_HOST@XXXXXXX");
        conf.set("hbase.zookeeper.quorum", "qfwa0466,qfwa0467,qfwa0468");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.security.authentication", "kerberos");
        UserGroupInformation.setConfiguration(conf);
        try {
            UserGroupInformation.loginUserFromKeytab("hive@XXXXXXX", "E:/etl.keytab");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Create a table with the given column families.
    public static void createTable(String tableName, String[] cols) {
        HBaseAdmin admin = null;
        try {
            admin = new HBaseAdmin(conf);
            if (admin.tableExists(tableName)) {
                System.out.println("table already exists!");
            } else {
                HTableDescriptor hTableDescriptor = new HTableDescriptor(tableName);
                for (String col : cols) {
                    HColumnDescriptor hColumnDescriptor = new HColumnDescriptor(col);
                    hTableDescriptor.addFamily(hColumnDescriptor);
                }
                admin.createTable(hTableDescriptor);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Full-table scan that prints every cell.
    public static void scanSpan(final String tableName) {
        HTable table = null;
        try {
            table = new HTable(conf, tableName);
            System.out.println("tablename:" + new String(table.getTableName()));
            Scan s = new Scan();
            ResultScanner rs = table.getScanner(s);
            for (Result r : rs) {
                System.out.println(r.toString());
                KeyValue[] kv = r.raw();
                for (int i = 0; i < kv.length; i++) {
                    System.out.print(new String(kv[i].getRow()) + " ");
                    System.out.print(new String(kv[i].getFamily()) + ":");
                    System.out.print(new String(kv[i].getQualifier()) + " ");
                    System.out.print(kv[i].getTimestamp() + " ");
                    System.out.println(new String(kv[i].getValue()));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Write a single cell: (rowkey, column family, column) -> value.
    public static void put(String tableName, String rowkey, String cf, String column, String value) {
        try {
            HTable table = new HTable(conf, tableName);
            Put put = new Put(Bytes.toBytes(rowkey));
            put.add(Bytes.toBytes(cf), Bytes.toBytes(column), Bytes.toBytes(value));
            table.put(put);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Read a single cell for the given rowkey and column.
    public static Result get(String tableName, String rowkey, String cf, String column) {
        Result result = null;
        try {
            HTable table = new HTable(conf, tableName);
            Get get = new Get(Bytes.toBytes(rowkey));
            get.addColumn(Bytes.toBytes(cf), Bytes.toBytes(column));
            result = table.get(get);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result;
    }
    /**
     * @param args
     */
    public static void main(String[] args) {
        createTable("test", new String[] { "info" });
        // rowkey: date + primary key
        // createTable("dl.tg_iotavrodata_test", new String[] { "baseInfo" });
        // put("dl.tg_iotavrodata_test","20191009100001","baseInfo","pid","100001");
        // put("userInfo","3","baseInfo","name","rose03");
        /*try {
            scanSpan("userInfo");
            Result rs = get("userInfo","01","baseInfo","name");
            for (Cell cell : rs.rawCells()) {
                System.out.println("column Family:" + new String(CellUtil.cloneFamily(cell)) + " ");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }*/
    }
}
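Note that HBaseAdmin and the HTable constructor used above are deprecated in newer HBase client releases (1.x and later). If the cluster's client library supports it, the same single-cell put can be written against the Connection/Table API; a minimal sketch, reusing a Configuration prepared the same way (Kerberos login included) as in HBaseUtils:
package cn.lin.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;

public class HBasePutExample {
    // Same semantics as HBaseUtils.put(), but with the non-deprecated Connection/Table API.
    public static void put(Configuration conf, String tableName, String rowkey,
                           String cf, String column, String value) throws IOException {
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            Put put = new Put(Bytes.toBytes(rowkey));
            put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(column), Bytes.toBytes(value));
            table.put(put);
        }
    }
}
Opening a Connection per put is fine for a sketch, but in the consumer it should be created once and shared, since ConnectionFactory.createConnection() is heavyweight.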
Main program: receives and parses Avro-encoded data:
package cn.lin.app;
import cn.lin.domain.TopicInfo;
import cn.lin.domain.TopicTask;
import cn.lin.utils.DBUtils;
import cn.lin.utils.FileUtils;
import cn.lin.utils.HBaseUtils;
import cn.lin.utils.RemoteShellUtils;
import com.twitter.bijection.Injection;
import com.twitter.bijection.avro.GenericAvroCodecs;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.json.JSONArray;
import org.json.JSONObject;
import java.io.File;
import java.io.FileInputStream;
import java.text.SimpleDateFormat;
import java.util.*;
public class IotKafkaConsumer {
    /**
     * Build the Avro schema used for this topic.
     *
     * @param topic
     * @return Schema
     */
    public static Schema getSchema(String topic) {
        String schemaStr = "{\"namespace\": \"org.mes.fn.action.quartzJob.KafkaTask\"," +
                " \"type\": \"record\"," +
                " \"name\": \"" + topic + "\"," +
                " \"fields\": [" +
                " {\"name\": \"version\", \"type\": [\"string\", \"null\"]}," +
                " {\"name\": \"server\", \"type\": \"string\"}," +
                " {\"name\": \"service\", \"type\": [\"string\", \"null\"]}," +
                " {\"name\": \"domain\", \"type\": [\"string\", \"null\"]}," +
                " {\"name\": \"operation_type\", \"type\": [\"string\", \"null\"]}," +
                " {\"name\": \"loadtime\", \"type\": [\"string\", \"null\"]}," +
                " {\"name\": \"guid\", \"type\": [\"string\", \"null\"]}," +
                " {\"name\": \"data\", \"type\": [\"null\",\"string\"]}" +
                " ]" +
                "}";
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(schemaStr);
        return schema;
    }
    /**
     * Build the Kafka consumer from the configuration and run the consume loop.
     *
     * @return
     */
    private KafkaConsumer<String, byte[]> getConsumer(TopicInfo info) {
        Properties props = new Properties();
        props.put("bootstrap.servers", info.getServers());
        // Each consumer instance gets its own consumer group id
        props.put("group.id", "group-1");
        // Commit offsets automatically
        props.put("enable.auto.commit", "true");
        // How often the consumed offsets are committed
        props.put("auto.commit.interval.ms", "1000");
        // Session timeout: past this, Kafka considers the consumer dead and rebalances
        props.put("session.timeout.ms", "30000");
        // Maximum number of records returned by a single poll
        props.put("max.poll.records", 100);
        // Offset reset policy: resume from the committed offset per partition when one exists,
        // otherwise start from the earliest available offset
        props.put("auto.offset.reset", "earliest");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
        /*props.put("deserialzer.encoding","UTF-8");*/
        KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<String, byte[]>(props);
        consumer.subscribe(Arrays.asList(info.getTopic()));
        Schema schema = getSchema(info.getTopic());
        Injection<GenericRecord, byte[]> recordInjection = GenericAvroCodecs.toBinary(schema);
        TopicTask task = DBUtils.getTopicTask(info.getTopic_id());
        List<String> list = task.getDomain();
        Map<String, List> map = new HashMap();
        SimpleDateFormat df = new SimpleDateFormat("yyyyMMdd"); // date format used as the rowkey prefix
        for (int i = 0; i < list.size(); i++) {
            String domain = list.get(i);
            List fields = DBUtils.getFieldsInfo(domain);
            map.put(domain, fields);
        }
        try {
            while (true) {
                ConsumerRecords<String, byte[]> records = consumer.poll(1000);
                for (ConsumerRecord<String, byte[]> record : records) {
                    GenericRecord genericRecord = recordInjection.invert(record.value()).get();
                    String topic = record.topic();
                    String domain = genericRecord.get("domain") == null ? "" : genericRecord.get("domain").toString();
                    if (!map.containsKey(domain)) {
                        // Unknown domain: reload the task configuration and field lists from MySQL
                        task = DBUtils.getTopicTask(info.getTopic_id());
                        list = task.getDomain();
                        map = new HashMap();
                        for (int i = 0; i < list.size(); i++) {
                            String domainTmp = list.get(i);
                            List fields = DBUtils.getFieldsInfo(domainTmp);
                            map.put(domainTmp, fields);
                        }
                    }
                    String operation_type = genericRecord.get("operation_type") == null ? "" : genericRecord.get("operation_type").toString();
                    String loadtime = genericRecord.get("loadtime") == null ? "" : genericRecord.get("loadtime").toString();
                    String data = genericRecord.get("data") == null ? "" : genericRecord.get("data").toString();
                    String acc_mode = task.getAcc_mode();
                    if (!"".equals(data) && !"null".equals(data)) {
                        if ("hive".equals(acc_mode)) {
                            JSONArray array = new JSONArray(data);
                            List<List> pList = new ArrayList<List>();
                            for (int i = 0; i < array.length(); i++) {
                                JSONObject jsonObj = array.getJSONObject(i);
                                List<String> pump = new ArrayList<String>();
                                List<String> fieldsList = map.get(domain);
                                for (int j = 0; j < fieldsList.size(); j++) {
                                    if (jsonObj.has(fieldsList.get(j))) {
                                        pump.add(jsonObj.get(fieldsList.get(j)).toString());
                                    } else {
                                        pump.add(""); // payload has no value for this configured field, leave it empty
                                    }
                                }
                                pump.add(operation_type);
                                pump.add(loadtime);
                                pList.add(pump);
                            }
                            // Build the file name from topic + domain so each table's data lands in its own file
                            String path = task.getFile_path() + topic;
                            FileUtils.isChartPathExist(path);
                            String fileName = path + "/" + task.getSrc_sys_tbl().get(domain) + ".txt";
                            int fileSize = (int) new File(fileName).length() / 1024; // size in KB
                            int fileLength = Integer.parseInt(task.getFile_length().get(domain));
                            if (fileSize > fileLength) {
                                FileUtils.renameFileAndCreateFile(fileName);
                                // Run the load shell script on the remote server
                                RemoteShellUtils tool = new RemoteShellUtils("xxxx", "xxxx",
                                        "xxxx", "utf-8");
                                tool.exec("sh /home/etl/lin/test/run.sh");
                            }
                            FileUtils.writeFile(fileName, pList);
                        } else {
                            // HBase mode: write every field of every record as a cell under the baseInfo column family
                            JSONArray array = new JSONArray(data);
                            for (int i = 0; i < array.length(); i++) {
                                JSONObject jsonObj = array.getJSONObject(i);
                                List<String> fieldsList = map.get(domain);
                                String time = df.format(new Date()); // rowkey = date + primary key (first configured field)
                                for (int j = 0; j < fieldsList.size(); j++) {
                                    HBaseUtils.put(task.getTbl_name().get(domain),
                                            time + jsonObj.get(fieldsList.get(0)).toString(),
                                            "baseInfo",
                                            fieldsList.get(j),
                                            jsonObj.get(fieldsList.get(j)).toString());
                                }
                            }
                        }
                    }
                }
            }
        } finally {
            consumer.close();
        }
    }
    public static void main(String[] args) {
        /*if (args.length != 1)
            System.exit(0);
        String task_id = args[0];*/
        String task_id = "1"; // temporary hard-coded task id
        IotKafkaConsumer iot = new IotKafkaConsumer();
        TopicInfo info = DBUtils.getTopicInfo(task_id);
        iot.getConsumer(info);
    }
}
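To exercise the consumer end to end, a message can be encoded with the same schema and Injection and published to the topic. A minimal, hedged sketch of such a test producer — the broker address, topic name and sample payload are made up for illustration:
package cn.lin.app;

import com.twitter.bijection.Injection;
import com.twitter.bijection.avro.GenericAvroCodecs;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class IotKafkaTestProducer {
    public static void main(String[] args) {
        String topic = "iotavrodata"; // made-up topic name for illustration
        Schema schema = IotKafkaConsumer.getSchema(topic);
        Injection<GenericRecord, byte[]> injection = GenericAvroCodecs.toBinary(schema);

        GenericRecord record = new GenericData.Record(schema);
        record.put("version", "1.0");
        record.put("server", "test-server");
        record.put("service", "iot");
        record.put("domain", "WATER");
        record.put("operation_type", "insert");
        record.put("loadtime", "2019-10-09 10:00:01");
        record.put("guid", "0001");
        // "data" carries a JSON array of rows, which is what the consumer parses
        record.put("data", "[{\"PID\":\"100001\",\"RESOURCE_NO\":\"R01\"}]");

        Properties props = new Properties();
        props.put("bootstrap.servers", "kafka01:9092"); // placeholder broker address
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
        try (KafkaProducer<String, byte[]> producer = new KafkaProducer<String, byte[]>(props)) {
            producer.send(new ProducerRecord<String, byte[]>(topic, injection.apply(record)));
        }
    }
}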
Utility class for running shell scripts on a remote server:
package cn.lin.utils;
import ch.ethz.ssh2.Connection;
import ch.ethz.ssh2.Session;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
public class RemoteShellUtils {
    private Connection conn;
    private String ipAddr;
    private String charset = Charset.defaultCharset().toString();
    private String userName;
    private String password;

    public RemoteShellUtils(String ipAddr, String userName, String password,
                            String charset) {
        this.ipAddr = ipAddr;
        this.userName = userName;
        this.password = password;
        if (charset != null) {
            this.charset = charset;
        }
    }

    public boolean login() throws IOException {
        conn = new Connection(ipAddr);
        conn.connect(); // connect
        return conn.authenticateWithPassword(userName, password); // authenticate
    }
    public String exec(String cmds) {
        InputStream in = null;
        String result = "";
        try {
            if (this.login()) {
                Session session = conn.openSession(); // open a session
                session.execCommand(cmds);
                in = session.getStdout();
                result = this.processStdout(in, this.charset);
                session.close();
                conn.close();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        return result;
    }
    public String processStdout(InputStream in, String charset) {
        byte[] buf = new byte[1024];
        StringBuffer sb = new StringBuffer();
        try {
            int len;
            // Only convert the bytes actually read; converting the whole buffer would append stale data
            while ((len = in.read(buf)) != -1) {
                sb.append(new String(buf, 0, len, charset));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
    /**
     * @param args
     */
    public static void main(String[] args) {
        RemoteShellUtils tool = new RemoteShellUtils("XXXXX", "XXXXX",
                "XXXXX", "utf-8");
        tool.exec("sh /home/etl/lin/test/run.sh");
    }
}
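One limitation of exec() above is that it only drains stdout, so a failing script exits silently. If stderr and the exit code are also needed, they can be captured with the same ganymed-ssh2 API; a hedged sketch:
package cn.lin.utils;

import ch.ethz.ssh2.Connection;
import ch.ethz.ssh2.Session;
import ch.ethz.ssh2.StreamGobbler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

public class RemoteShellWithStderr {
    public static void run(String host, String user, String password, String cmd) throws IOException {
        Connection conn = new Connection(host);
        conn.connect();
        if (!conn.authenticateWithPassword(user, password)) {
            throw new IOException("authentication failed");
        }
        Session session = conn.openSession();
        try {
            session.execCommand(cmd);
            // StreamGobbler drains the remote streams in the background so the command cannot block
            BufferedReader out = new BufferedReader(new InputStreamReader(new StreamGobbler(session.getStdout())));
            BufferedReader err = new BufferedReader(new InputStreamReader(new StreamGobbler(session.getStderr())));
            String line;
            while ((line = out.readLine()) != null) {
                System.out.println("stdout: " + line);
            }
            while ((line = err.readLine()) != null) {
                System.out.println("stderr: " + line);
            }
            System.out.println("exit status: " + session.getExitStatus());
        } finally {
            session.close();
            conn.close();
        }
    }
}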
MySQL utility class:
package cn.lin.utils;
import cn.lin.domain.TopicInfo;
import cn.lin.domain.TopicTask;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DBUtils {
    public static TopicInfo getTopicInfo(String topic_id) {
        // Get a database connection
        Connection conn = DB.getConnection();
        TopicInfo info = new TopicInfo();
        // SQL statement
        String sql = "select topic_id,servers,topic,type from topic_info where topic_id=?";
        // Obtain a PreparedStatement from the connection so the SQL is precompiled
        PreparedStatement ptmt = null;
        try {
            ptmt = conn.prepareStatement(sql);
            ptmt.setString(1, topic_id);
            // The ResultSet receives the returned rows
            ResultSet rs = ptmt.executeQuery();
            while (rs.next()) {
                info.setTopic_id(rs.getString("topic_id"));
                info.setServers(rs.getString("servers"));
                info.setTopic(rs.getString("topic"));
                info.setType(rs.getString("type"));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return info;
    }
    public static TopicTask getTopicTask(String topic_id) {
        // Get a database connection
        Connection conn = DB.getConnection();
        TopicTask task = new TopicTask();
        // SQL statement
        String sql = "SELECT a.task_id,topic_id,domain,acc_mode,ip,file_path,file_length,tbl_name,src_sys_tbl FROM topic_task a,dmp_tbl_info b WHERE a.task_id = b.task_id AND topic_id=?";
        // Obtain a PreparedStatement from the connection so the SQL is precompiled
        PreparedStatement ptmt = null;
        try {
            ptmt = conn.prepareStatement(sql);
            ptmt.setString(1, topic_id);
            // The ResultSet receives the returned rows
            ResultSet rs = ptmt.executeQuery();
            Map map = new HashMap();
            Map tbl_map = new HashMap();
            Map src_tbl_map = new HashMap();
            List domainList = new ArrayList();
            while (rs.next()) {
                task.setTask_id(rs.getString("task_id"));
                task.setTopic_id(rs.getString("topic_id"));
                domainList.add(rs.getString("domain"));
                task.setAcc_mode(rs.getString("acc_mode"));
                task.setIp(rs.getString("ip"));
                task.setFile_path(rs.getString("file_path"));
                map.put(rs.getString("domain"), rs.getString("file_length"));
                tbl_map.put(rs.getString("domain"), rs.getString("tbl_name"));
                src_tbl_map.put(rs.getString("domain"), rs.getString("src_sys_tbl"));
            }
            task.setDomain(domainList);
            task.setFile_length(map);
            task.setTbl_name(tbl_map);
            task.setSrc_sys_tbl(src_tbl_map);
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return task;
    }
    public static List getFieldsInfo(String domain) {
        List list = new ArrayList();
        // Get a database connection
        Connection conn = DB.getConnection();
        // SQL statement
        String sql = "SELECT c.FIELD FROM topic_task a,dmp_tbl_info b,fields_info c WHERE a.task_id = b.task_id AND b.tbl_id = c.tbl_id AND a.domain = ? AND c.status = '1' ORDER BY CAST(c.field_order AS SIGNED )";
        // Obtain a PreparedStatement from the connection so the SQL is precompiled
        PreparedStatement ptmt = null;
        try {
            ptmt = conn.prepareStatement(sql);
            ptmt.setString(1, domain);
            // The ResultSet receives the returned rows
            ResultSet rs = ptmt.executeQuery();
            while (rs.next()) {
                list.add(rs.getString("field"));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return list;
    }
    public static void main(String[] args) {
        TopicInfo info = getTopicInfo("1");
        System.out.println(info.getTopic() + " " + info.getServers());
        TopicTask task = getTopicTask("1");
        System.out.println(task.getDomain() + " " + task.getFile_length());
        List list = getFieldsInfo("WATER");
        for (int i = 0; i < list.size(); i++) {
            System.out.println(list.get(i));
        }
    }
}
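DBUtils depends on a DB.getConnection() helper that is not shown in this post. A minimal, hypothetical sketch of what such a class might look like — the driver, URL and credentials are placeholders, and a real project would likely use a connection pool instead:
package cn.lin.utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class DB {
    // Placeholder connection settings -- replace with the real MySQL host, schema and account.
    private static final String URL = "jdbc:mysql://mysql-host:3306/etl_config?useUnicode=true&characterEncoding=UTF-8";
    private static final String USER = "etl";
    private static final String PASSWORD = "secret";

    public static Connection getConnection() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            return DriverManager.getConnection(URL, USER, PASSWORD);
        } catch (ClassNotFoundException | SQLException e) {
            throw new RuntimeException("failed to open MySQL connection", e);
        }
    }
}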