Hive实践（hive0.12）

版本号：cdh5.0.0+hadoop2.3.0+hive0.12

一、原始数据：

1. 本地数据

[root@node33 data]# ll

total 12936

-rw-r--r--. 1 root root 13245467 May  1 17:08 hbase-data.csv

[root@node33 data]# head -n 3 hbase-data.csv

1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,1

2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,1

3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0,0,1

2. hdfs数据：

[root@node33 data]# hadoop fs -ls /input

Found 1 items

-rwxrwxrwx   1 hdfs supergroup   13245467 2014-05-01 17:09 /input/hbase-data.csv

[root@node33 data]# hadoop fs -cat /input/* | head -n 3

1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,1

2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,1

3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0,0,1

二、创建hive表：

1.hive外部表：

[root@node33 hive]# cat employees_ext.sql

create external table if not exists employees_ext(

	id	int,

	x1	float,

	x2	float,

	x3	float,

	x4	float,

	x5	float,

	x6	float,

	x7	float,

	x8	float,

	x9	float,

	y	int)

row format delimited fields terminated by ','

location '/input/'

创建表，client执行：hive -f employees_ext.sql

2. hive表

[root@node33 hive]# cat employees.sql

create table employees(

	id	int,

	x1	float,

	x2	float,

	x3	float,

	x4	float,

	x5	float,

	x6	float,

	x7	float,

	x8	float,

	x9	float

)

partitioned by (y int);

创建表，client执行：hive -f employees.sql

3. hive表（orc方式存储）

[root@node33 hive]# cat employees_orc.sql

create table employees_orc(

	id	int,

	x1	float,

	x2	float,

	x3	float,

	x4	float,

	x5	float,

	x6	float,

	x7	float,

	x8	float,

	x9	float

)

partitioned by (y int)

row format serde "org.apache.hadoop.hive.ql.io.orc.OrcSerde"

stored as orc;

执行：hive -f employees_orc.sql

三、导入数据：

1. employees_ext 表导入employees表：

[root@node33 hive]# cat employees_ext-to-employees.sql 

set hive.exec.dynamic.partition=true;

set hive.exec.dynamic.partition.mode=nonstrict;

set hive.exec.max.dynamic.partitions.pernode=1000;

insert overwrite table employees

	partition(y)

select

	emp_ext.id,

	emp_ext.x1,

	emp_ext.x2,

	emp_ext.x3,

	emp_ext.x4,

	emp_ext.x5,

	emp_ext.x6,

	emp_ext.x7,

	emp_ext.x8,

	emp_ext.x9,

	emp_ext.y

from employees_ext emp_ext;

执行：hive -f employees_ext-to-employees.sql，其部分log如下：

Partition default.employees{y=1} stats: [num_files: 1, num_rows: 0, total_size: 3622, raw_data_size: 0]

Partition default.employees{y=2} stats: [num_files: 1, num_rows: 0, total_size: 4060, raw_data_size: 0]

Partition default.employees{y=3} stats: [num_files: 1, num_rows: 0, total_size: 910, raw_data_size: 0]

Partition default.employees{y=5} stats: [num_files: 1, num_rows: 0, total_size: 699, raw_data_size: 0]

Partition default.employees{y=6} stats: [num_files: 1, num_rows: 0, total_size: 473, raw_data_size: 0]

Partition default.employees{y=7} stats: [num_files: 1, num_rows: 0, total_size: 13561851, raw_data_size: 0]

Table default.employees stats: [num_partitions: 6, num_files: 6, num_rows: 0, total_size: 13571615, raw_data_size: 0]

MapReduce Jobs Launched:

Job 0: Map: 1   Cumulative CPU: 6.78 sec   HDFS Read: 13245660 HDFS Write: 13571615 SUCCESS

Total MapReduce CPU Time Spent: 6 seconds 780 msec

OK

Time taken: 186.743 seconds

查看hdfs文件大小：

[root@node33 hive]# hadoop fs -count /user/hive/warehouse/employees

           7            6           13571615 /user/hive/warehouse/employees

查看hdfs文件内容：

bash-4.1$ hadoop fs -cat /user/hive/warehouse/employees/y=1/* | head -n 1

11.5210113.644.491.171.780.068.750.00.0

（原文此处为输出截图。employees表建表时没有指定分隔符，Hive默认使用不可见字符\001（^A）作为字段分隔符，所以上面拷贝到代码块里的输出中各字段连在了一起。）

2. employees_ext 表导入employees_orc表：

[root@node33 hive]# cat employees_ext-to-employees_orc.sql 

set hive.exec.dynamic.partition=true;

set hive.exec.dynamic.partition.mode=nonstrict;

set hive.exec.max.dynamic.partitions.pernode=1000;

insert overwrite table employees_orc

	partition(y)

select

	emp_ext.id,

	emp_ext.x1,

	emp_ext.x2,

	emp_ext.x3,

	emp_ext.x4,

	emp_ext.x5,

	emp_ext.x6,

	emp_ext.x7,

	emp_ext.x8,

	emp_ext.x9,

	emp_ext.y

from employees_ext emp_ext;

执行：hive -f employees_ext-to-employees_orc.sql，其部分log如下：

Partition default.employees_orc{y=1} stats: [num_files: 1, num_rows: 0, total_size: 2355, raw_data_size: 0]

Partition default.employees_orc{y=2} stats: [num_files: 1, num_rows: 0, total_size: 2539, raw_data_size: 0]

Partition default.employees_orc{y=3} stats: [num_files: 1, num_rows: 0, total_size: 1290, raw_data_size: 0]

Partition default.employees_orc{y=5} stats: [num_files: 1, num_rows: 0, total_size: 1165, raw_data_size: 0]

Partition default.employees_orc{y=6} stats: [num_files: 1, num_rows: 0, total_size: 955, raw_data_size: 0]

Partition default.employees_orc{y=7} stats: [num_files: 1, num_rows: 0, total_size: 1424599, raw_data_size: 0]

Table default.employees_orc stats: [num_partitions: 6, num_files: 6, num_rows: 0, total_size: 1432903, raw_data_size: 0]

MapReduce Jobs Launched:

Job 0: Map: 1   Cumulative CPU: 7.84 sec   HDFS Read: 13245660 HDFS Write: 1432903 SUCCESS

Total MapReduce CPU Time Spent: 7 seconds 840 msec

OK

Time taken: 53.014 seconds

查看hdfs文件大小：

[root@node33 hive]# hadoop fs -count /user/hive/warehouse/employees_orc

           7            6            1432903 /user/hive/warehouse/employees_orc

查看hdfs文件内容：（原文此处的截图似已缺失；ORC是二进制存储格式，无法像文本文件那样直接查看。）

3. 比较两者性能

	时间	压缩率
employees表：	186.7秒	13571615/13245660=1.0246
employees_orc表：	53.0秒	1432903/13245660=0.108

从时间上来说，orc的表现会好很多，同时压缩率也好很多。

不过，这个测试是在本人的虚拟机上进行的，而且是单机测试，所以时间方面的参考价值不是很大，但是压缩率还是有一定参考价值的。

四、导出数据

1. employees表：

[root@node33 hive]# cat export_employees.sql 

insert overwrite local directory '/opt/hivedata/employees.dat'

row format delimited

fields terminated by ','

select

	emp.id,

	emp.x1,

	emp.x2,

	emp.x3,

	emp.x4,

	emp.x5,

	emp.x6,

	emp.x7,

	emp.x8,

	emp.x9,

	emp.y

from employees emp

执行：hive -f export_employees.sql
部分log：

MapReduce Total cumulative CPU time: 9 seconds 630 msec

Ended Job = job_1398958404577_0007

Copying data to local directory /opt/hivedata/employees.dat

Copying data to local directory /opt/hivedata/employees.dat

MapReduce Jobs Launched:

Job 0: Map: 1   Cumulative CPU: 9.63 sec   HDFS Read: 13572220 HDFS Write: 13978615 SUCCESS

Total MapReduce CPU Time Spent: 9 seconds 630 msec

OK

Time taken: 183.841 seconds

数据查看：

[root@node33 hive]# ll /opt/hivedata/employees.dat/

total 13652

-rw-r--r--. 1 root root 13978615 May  2 05:15 000000_0

[root@node33 hive]# head -n 1 /opt/hivedata/employees.dat/000000_0

1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1

2. employees_orc表：

[root@node33 hive]# cat export_employees_orc.sql 

insert overwrite local directory '/opt/hivedata/employees_orc.dat'

row format delimited

fields terminated by ','

select

	emp.id,

	emp.x1,

	emp.x2,

	emp.x3,

	emp.x4,

	emp.x5,

	emp.x6,

	emp.x7,

	emp.x8,

	emp.x9,

	emp.y

from employees_orc emp

执行 hive -f export_employees_orc.sql

部分log：

MapReduce Total cumulative CPU time: 4 seconds 920 msec

Ended Job = job_1398958404577_0008

Copying data to local directory /opt/hivedata/employees_orc.dat

Copying data to local directory /opt/hivedata/employees_orc.dat

MapReduce Jobs Launched:

Job 0: Map: 1   Cumulative CPU: 4.92 sec   HDFS Read: 1451352 HDFS Write: 13978615 SUCCESS

Total MapReduce CPU Time Spent: 4 seconds 920 msec

OK

Time taken: 41.686 seconds

查看数据：

[root@node33 hive]# head -n 1 /opt/hivedata/employees_orc.dat/000000_0

1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1

[root@node33 hive]# ll /opt/hivedata/employees_orc.dat/

total 13652

-rw-r--r--. 1 root root 13978615 May  2 05:18 000000_0

这里导出的数据和原始数据的大小不一样：原始数据是13245467字节，而导出到本地的是13978615字节。这是由于字段被定义为float类型，例如原始数据中的0导出后都变成了0.0。

分享，成长，快乐

转载请注明blog地址：http://blog.csdn.net/fansy1990

Hive实践（hive0.12）的更多相关文章

hive-0.12升级成hive 0.13.1
安装了0.12之后,听说0.13.1有许多新的特性,包括永久函数,所以想更新成0.13版的(元数据放在mysql中) 2014年8月5日实验成功 hive0.13.1的新特性新特性详见 http:/ ...
hbase0.96与hive0.12整合高可靠文档及问题总结
本文链接:http://www.aboutyun.com/thread-7881-1-1.html 问题导读:1.hive安装是否需要安装mysql?2.hive是否分为客户端和服务器端?3.hive ...
Hadoop2.2.0 hive0.12 hbase0.94 配置问题记录
环境:centos6.2 Hadoop2.2.0 hive0.12 hbase0.94 1>hadoop配好之后,跑任务老失败,yarn失败,报out of memory错误,然后怎么调整内存大 ...
在Hadoop1.2.1分布式集群环境下安装hive0.12
在Hadoop1.2.1分布式集群环境下安装hive0.12 ● 前言: 1. 大家最好通读一遍过后,在理解的基础上再按照步骤搭建. 2. 之前写过两篇<<在VMware下安装Ubuntu ...
Hadoop2.3+Hive0.12集群部署
0 机器说明 IP Role 192.168.1.106 NameNode.DataNode.NodeManager.ResourceManager 192.168.1.107 Secondary ...
Caused by: org.xml.sax.SAXParseException; systemId: file:/home/hadoop/hive-0.12.0/conf/hive-site.xml; lineNumber: 5; columnNumber: 2; The markup in the document following the root element must be well
1:Hive安装的过程(Hive启动的时候报的错误),贴一下错误,和为什么错,以及解决方法: [root@master bin]# ./hive // :: INFO Configuration.de ...
hive-0.12.0-cdh5.1.0安装
先前条件: 要先安装好MYSQL 下载:hive-0.12.0-cdh5.1.0.tar.gz,并解压到安装目录 1. 添加环境变量修改/etc/profile文件. #vi /etc/profil ...
黑盒测试实践--Day7 12.1
黑盒测试实践--Day7 12.1 今天完成任务情况: 录制小组作业中的自动化测试工具实践视频汇总大家提交的各种作业模块,打包完成小组共同作业小组成员完成个人情况说明后在截止时间前分别提交作业小 ...
敏捷软件开发：原则、模式与实践——第12章　ISP：接口隔离原则
第12章 ISP:接口隔离原则不应该强迫客户程序依赖并未使用的方法. 这个原则用来处理“胖”接口所存在的缺点.如果类的接口不是内敛的,就表示该类具有“胖”接口.换句话说,类的“胖”接口可以分解成多组 ...

随机推荐

camunda流程部署的一些简单操作
act_re_deployment:(流程部署对象表)存放流程部署的显示名和部署时间 act_re_procdef:(流程定义表)存放流程定义的属性信息 act_ge_bytearray:(资源文件表 ...
安装Treserocr遇到的问题
相关链接: tesseract下载地址:http://digi.bib.uni-mannheim.de/tesseract 一.出现的问题 1.点击进去进行下载注意:其中文件名中带有dev的为开发 ...
Spring Boot集成Mybatis双数据源
这里用到了Spring Boot + Mybatis + DynamicDataSource配置动态双数据源,可以动态切换数据源实现数据库的读写分离. 添加依赖加入Mybatis启动器,这里添加了D ...
Python删除列表中的空格
list1 = ['122','2333','3444',' ','422',' ',' ','54',' '] list1=[x.strip() for x in list1 if x.strip( ...
牛客小白月赛18 G Forsaken的三维数点
思路: 这是一道树状数组和二分的题,用线段树空间直接爆,时间也会超然后这道题我犯了一个很低级的错误,导致我wa了十发左右,一个int型变量用lld输入,然后他给的提示是运行错误,我哭了,我一直以为是 ...
[轉]Exploit The Linux Kernel NULL Pointer Dereference
Exploit The Linux Kernel NULL Pointer Dereference Author: wztHome: http://hi.baidu.com/wzt85date: 20 ...
正在从 Windows 应用商店下载... 无法从 Windows 应用商店下载。请检查网络连接。
手贱关掉了一下服务,再打开就是嘛
spring中bean的高级属性之list, set, map以及props元素(含举例)
转自:http://qingfeng825.iteye.com/blog/144704 list, set, map和props元素分别用来设置类型为List,Set,Map和Propertis的属性 ...
python 装饰器的坑
今天研究了下装饰器,添加重试功能遇到了个坑,跟大家分享一下: 代码如下: def re_try(maxtry): print locals() def wrapper(fn): print local ...
Delphi如何获取一个字符串再另一个字符串中最后一次出现的位置
uses StrUtils; function ReversePos(SubStr, S: String): Integer; var i : Integer; begin i := Po ...

Hive实践（hive0.12）

Hive实践（hive0.12）的更多相关文章

随机推荐

热门专题