• 测试数据
-- Test table for the sorting examples below: one integer id and one string name per row.
-- Stored as plain text with tab-separated fields and newline-separated rows,
-- matching the layout of the sample data file shown after the statement.
create table sort_test(
    id int,
    name string
)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile;

[root@wadeyu ~]# cat sort_test.log
4679 aaa
4728 aaa
3040 aaa
4207 aaa
2231 aaa
1279 aaa
7954 aaa
582 aaa
7096 aaa
4878 aaa
9684 aaa
1540 aaa
4826 aaa
2543 aaa
2323 aaa
1420 aaa
5083 aaa
8965 aaa
1391 aaa
9719 aaa
9901 aaa
2393 aaa
6024 aaa
444 aaa
1574 aaa
8881 aaa
5739 aaa
8689 aaa
1614 aaa
9340 aaa
6726 aaa
109 aaa
6941 aaa
9562 aaa
9019 aaa
4945 aaa
2206 aaa
5910 aaa
8552 aaa
1795 aaa
2720 aaa
9007 aaa
8377 aaa
2179 aaa
3683 aaa
5869 aaa
5448 aaa
5223 aaa
5127 aaa
4616 aaa
2340 aaa
1268 aaa
4332 aaa
2989 aaa
19 aaa
7880 aaa
505 aaa
5975 aaa
5288 aaa
5682 aaa
376 aaa
7502 aaa
6448 aaa
3774 aaa
5541 aaa
9636 aaa
2037 aaa
246 aaa
6151 aaa
7837 aaa
1506 aaa
3749 aaa
9335 aaa
3973 aaa
5160 aaa
7929 aaa
834 aaa
3451 aaa
1766 aaa
6228 aaa
8961 aaa
8177 aaa
2340 aaa
4245 aaa
3226 aaa
2670 aaa
784 aaa
7699 aaa
2054 aaa
6006 aaa
4204 aaa
8905 aaa
6182 aaa
1271 aaa
5415 aaa
5164 aaa
4320 aaa
3736 aaa
2287 aaa
6559 aaa
  • Order By

    • Job中只会启动一个reduce做全局排序,数据量大时,耗时会很久
    • 在strict模式(hive.mapred.mode=strict)下,必须添加limit语句限制返回条数
# 语法格式
colOrder: ( ASC | DESC )
colNullOrder: (NULLS FIRST | NULLS LAST) -- (Note: Available in Hive 2.1.0 and later)
orderBy: ORDER BY colName colOrder? colNullOrder? (',' colName colOrder? colNullOrder?)*
query: SELECT expression (',' expression)* FROM src orderBy

# 排序
select * from sort_test order by id desc;
+---------------+-----------------+--+
| sort_test.id | sort_test.name |
+---------------+-----------------+--+
| 9901 | aaa |
| 9719 | aaa |
| 9684 | aaa |
| 9636 | aaa |
| 9562 | aaa |
| 9340 | aaa |
| 9335 | aaa |
| 9019 | aaa |
| 9007 | aaa |
| 8965 | aaa |
| 8961 | aaa |
| 8905 | aaa |
| 8881 | aaa |
| 8689 | aaa |
| 8552 | aaa |
| 8377 | aaa |
| 8177 | aaa |
| 7954 | aaa |
| 7929 | aaa |
| 7880 | aaa |
| 7837 | aaa |
| 7699 | aaa |
| 7502 | aaa |
| 7096 | aaa |
| 6941 | aaa |
| 6726 | aaa |
| 6559 | aaa |
| 6448 | aaa |
| 6228 | aaa |
| 6182 | aaa |
| 6151 | aaa |
| 6024 | aaa |
| 6006 | aaa |
| 5975 | aaa |
| 5910 | aaa |
| 5869 | aaa |
| 5739 | aaa |
| 5682 | aaa |
| 5541 | aaa |
| 5448 | aaa |
| 5415 | aaa |
| 5288 | aaa |
| 5223 | aaa |
| 5164 | aaa |
| 5160 | aaa |
| 5127 | aaa |
| 5083 | aaa |
| 4945 | aaa |
| 4878 | aaa |
| 4826 | aaa |
| 4728 | aaa |
| 4679 | aaa |
| 4616 | aaa |
| 4332 | aaa |
| 4320 | aaa |
| 4245 | aaa |
| 4207 | aaa |
| 4204 | aaa |
| 3973 | aaa |
| 3774 | aaa |
| 3749 | aaa |
| 3736 | aaa |
| 3683 | aaa |
| 3451 | aaa |
| 3226 | aaa |
| 3040 | aaa |
| 2989 | aaa |
| 2720 | aaa |
| 2670 | aaa |
| 2543 | aaa |
| 2393 | aaa |
| 2340 | aaa |
| 2340 | aaa |
| 2323 | aaa |
| 2287 | aaa |
| 2231 | aaa |
| 2206 | aaa |
| 2179 | aaa |
| 2054 | aaa |
| 2037 | aaa |
| 1795 | aaa |
| 1766 | aaa |
| 1614 | aaa |
| 1574 | aaa |
| 1540 | aaa |
| 1506 | aaa |
| 1420 | aaa |
| 1391 | aaa |
| 1279 | aaa |
| 1271 | aaa |
| 1268 | aaa |
| 834 | aaa |
| 784 | aaa |
| 582 | aaa |
| 505 | aaa |
| 444 | aaa |
| 376 | aaa |
| 246 | aaa |
| 109 | aaa |
| 19 | aaa |
+---------------+-----------------+--+
  • Sort By

    • 一个job启动多个reduce,每个reduce内部按排序字段进行局部排序;Sort By本身不会按排序字段分区,未指定Distribute By时数据如何分发到各个reduce没有保证(如下方输出所示,两个reduce的结果各自有序,但取值范围相互交叠)
    • 如果有limit语句,会再次启动一个job,取出每个局部排好序的前n条,再进行全局排序
    • 只保证局部有序,不保证全局有序
# Sort By语法
colOrder: ( ASC | DESC )
sortBy: SORT BY colName colOrder? (',' colName colOrder?)*
query: SELECT expression (',' expression)* FROM src sortBy

# 设置开启的reduce个数
set mapreduce.job.reduces=2;
0: jdbc:hive2://> set mapreduce.job.reduces;
+--------------------------+--+
| set |
+--------------------------+--+
| mapreduce.job.reduces=2 |
+--------------------------+--+

# 执行局部排序(未带limit)
0: jdbc:hive2://> select * from sort_test sort by id desc;
+---------------+-----------------+--+
| sort_test.id | sort_test.name |
+---------------+-----------------+--+
| 9901 | aaa |
| 9684 | aaa |
| 9340 | aaa |
| 9019 | aaa |
| 9007 | aaa |
| 8965 | aaa |
| 8961 | aaa |
| 8689 | aaa |
| 8552 | aaa |
| 8177 | aaa |
| 7837 | aaa |
| 7699 | aaa |
| 7502 | aaa |
| 6559 | aaa |
| 6448 | aaa |
| 6228 | aaa |
| 6024 | aaa |
| 6006 | aaa |
| 5975 | aaa |
| 5910 | aaa |
| 5869 | aaa |
| 5739 | aaa |
| 5682 | aaa |
| 5541 | aaa |
| 5448 | aaa |
| 5415 | aaa |
| 5288 | aaa |
| 5164 | aaa |
| 5160 | aaa |
| 5083 | aaa |
| 4878 | aaa |
| 4826 | aaa |
| 4679 | aaa |
| 4616 | aaa |
| 4245 | aaa |
| 4207 | aaa |
| 3736 | aaa |
| 3451 | aaa |
| 3226 | aaa |
| 3040 | aaa |
| 2989 | aaa |
| 2720 | aaa |
| 2670 | aaa |
| 2340 | aaa |
| 2231 | aaa |
| 2206 | aaa |
| 2054 | aaa |
| 2037 | aaa |
| 1766 | aaa |
| 1614 | aaa |
| 1540 | aaa |
| 1506 | aaa |
| 1420 | aaa |
| 1268 | aaa |
| 834 | aaa |
| 784 | aaa |
| 582 | aaa |
| 444 | aaa |
| 376 | aaa |
| 246 | aaa |
| 19 | aaa |
| 9719 | aaa |
| 9636 | aaa |
| 9562 | aaa |
| 9335 | aaa |
| 8905 | aaa |
| 8881 | aaa |
| 8377 | aaa |
| 7954 | aaa |
| 7929 | aaa |
| 7880 | aaa |
| 7096 | aaa |
| 6941 | aaa |
| 6726 | aaa |
| 6182 | aaa |
| 6151 | aaa |
| 5223 | aaa |
| 5127 | aaa |
| 4945 | aaa |
| 4728 | aaa |
| 4332 | aaa |
| 4320 | aaa |
| 4204 | aaa |
| 3973 | aaa |
| 3774 | aaa |
| 3749 | aaa |
| 3683 | aaa |
| 2543 | aaa |
| 2393 | aaa |
| 2340 | aaa |
| 2323 | aaa |
| 2287 | aaa |
| 2179 | aaa |
| 1795 | aaa |
| 1574 | aaa |
| 1391 | aaa |
| 1279 | aaa |
| 1271 | aaa |
| 505 | aaa |
| 109 | aaa |
+---------------+-----------------+--+

# 带limit排序(会额外再启动一个job进行全局排序)
0: jdbc:hive2://> select * from sort_test sort by id desc limit 300;
+---------------+-----------------+--+
| sort_test.id | sort_test.name |
+---------------+-----------------+--+
| 9901 | aaa |
| 9719 | aaa |
| 9684 | aaa |
| 9636 | aaa |
| 9562 | aaa |
| 9340 | aaa |
| 9335 | aaa |
| 9019 | aaa |
| 9007 | aaa |
| 8965 | aaa |
| 8961 | aaa |
| 8905 | aaa |
| 8881 | aaa |
| 8689 | aaa |
| 8552 | aaa |
| 8377 | aaa |
| 8177 | aaa |
| 7954 | aaa |
| 7929 | aaa |
| 7880 | aaa |
| 7837 | aaa |
| 7699 | aaa |
| 7502 | aaa |
| 7096 | aaa |
| 6941 | aaa |
| 6726 | aaa |
| 6559 | aaa |
| 6448 | aaa |
| 6228 | aaa |
| 6182 | aaa |
| 6151 | aaa |
| 6024 | aaa |
| 6006 | aaa |
| 5975 | aaa |
| 5910 | aaa |
| 5869 | aaa |
| 5739 | aaa |
| 5682 | aaa |
| 5541 | aaa |
| 5448 | aaa |
| 5415 | aaa |
| 5288 | aaa |
| 5223 | aaa |
| 5164 | aaa |
| 5160 | aaa |
| 5127 | aaa |
| 5083 | aaa |
| 4945 | aaa |
| 4878 | aaa |
| 4826 | aaa |
| 4728 | aaa |
| 4679 | aaa |
| 4616 | aaa |
| 4332 | aaa |
| 4320 | aaa |
| 4245 | aaa |
| 4207 | aaa |
| 4204 | aaa |
| 3973 | aaa |
| 3774 | aaa |
| 3749 | aaa |
| 3736 | aaa |
| 3683 | aaa |
| 3451 | aaa |
| 3226 | aaa |
| 3040 | aaa |
| 2989 | aaa |
| 2720 | aaa |
| 2670 | aaa |
| 2543 | aaa |
| 2393 | aaa |
| 2340 | aaa |
| 2340 | aaa |
| 2323 | aaa |
| 2287 | aaa |
| 2231 | aaa |
| 2206 | aaa |
| 2179 | aaa |
| 2054 | aaa |
| 2037 | aaa |
| 1795 | aaa |
| 1766 | aaa |
| 1614 | aaa |
| 1574 | aaa |
| 1540 | aaa |
| 1506 | aaa |
| 1420 | aaa |
| 1391 | aaa |
| 1279 | aaa |
| 1271 | aaa |
| 1268 | aaa |
| 834 | aaa |
| 784 | aaa |
| 582 | aaa |
| 505 | aaa |
| 444 | aaa |
| 376 | aaa |
| 246 | aaa |
| 109 | aaa |
| 19 | aaa |
+---------------+-----------------+--+
  • Order By 和 Sort By区别

    • Order By全局排序,Sort By局部排序
    • 取TopN时,Sort By 比 Order By效率更高
  • Distribute By

    • 按指定字段把数据分发到不同的reduce,指定字段取值相同的行会进入同一个reduce(只负责分发,不保证reduce内部有序)
    • 通常结合Sort By语句使用,比如同一个地区,不同商家排序,就需要用到这个
  • Cluster By
    • 分发且排序:CLUSTER BY col 等价于 DISTRIBUTE BY col SORT BY col(分发字段和排序字段相同,且只能升序)
-- Usage examples

-- CLUSTER BY: distribute rows by col1 and sort by col1 within each reducer.
SELECT col1, col2
FROM t1
CLUSTER BY col1;

-- DISTRIBUTE BY alone: rows with the same col1 go to the same reducer, with no ordering.
SELECT col1, col2
FROM t1
DISTRIBUTE BY col1;

-- DISTRIBUTE BY + SORT BY: same distribution, with an explicit per-reducer sort order.
SELECT col1, col2
FROM t1
DISTRIBUTE BY col1
SORT BY col1 ASC, col2 DESC;

参考资料

【0】Hive wiki - LanguageManual SortBy

Hive之Order,Sort,Cluster and Distribute By的更多相关文章

  1. hive 中的Sort By、 Order By、Cluster By、Distribute By 区别

    Order by: order by 会对输入做全局排序,因此只有一个reducer(多个reducer无法保证全局有序)只有一个reducer,会导致当输入规模较大时,需要较长的计算时间.在hive ...

  2. [转]hive中order by,distribute by,sort by,cluster by

    转至http://my.oschina.net/repine/blog/296562 order by,distribute by,sort by,cluster by  查询使用说明 1 2 3 4 ...

  3. hive中order by、distribute by、sort by和cluster by的区别和联系

    hive中order by.distribute by.sort by和cluster by的区别和联系 order by order by 会对数据进行全局排序,和oracle和mysql等数据库中 ...

  4. hive中order by ,sort by ,distribute by, cluster by 的区别(**很详细**)

    hive 查询语法 select [all | distinct] select_ condition, select_ condition from table_name a [join table ...

  5. HiveQL之Sort by、Distribute by、Cluster by、Order By详解

    在这里解释一下select语法中的order by.sort by.distribute by.cluster by.order by语法. 一.order by语法 在hiveQL中Order by ...

  6. hive中order by,sort by, distribute by, cluster by作用以及用法

    1. order by     Hive中的order by跟传统的sql语言中的order by作用是一样的,会对查询的结果做一次全局排序,所以说,只有hive的sql中指定了order by所有的 ...

  7. Hive中的order by、sort by、distribute by、cluster by解释及测试

    结论: order by:全局排序,这也是4种排序手段中唯一一个能在终端输出中看出全局排序的方法,只有一个reduce,可能造成reduce任务时间过长,在严格模式下,要求必须具备limit子句. ...

  8. hive 排序 order by sort by distribute by cluster by

    order by:     order by是全局排序,受hive.mapred.mode的影响.       使用orderby有一些限制:     1.在严格模式下(hive.mapred.mod ...

  9. Hive中order by,sort by,distribute by,cluster by的区别

    一:order by order by会对输入做全局排序,因此只有一个Reducer(多个Reducer无法保证全局有序),然而只有一个Reducer,会导致当输入规模较大时,消耗较长的计算时间.关于 ...

随机推荐

  1. esp8266 串口通讯

    1.发送 调用uart_init(115200,115200);初始化串口,波特率设置为115200.前面一个是设置uart0的波特率.后面一个是设置.uart的波特率 然后就可以使用uart0_tx ...

  2. iOS Programming Views :Redrawing and UIScrollView

    iOS Programming Views :Redrawing and UIScrollView  1.1 event  You are going to see how views are red ...

  3. wps 图片代码 复制 粘贴

    <table><tr><td><img src="C:\Users\Administrator\Desktop\QQ截图20160921180946 ...

  4. Google浏览器开发者工具:CSSViewer(一个Css查看器)

    CSSViewer的简介 CSSViewer是一款可以帮助用户快速查看当前的网页元素的CSS属性的谷歌浏览器插件,在Chrome中安装了CSSViewer插件以后,用户就可以在设计网页的时候,快速地模 ...

  5. Mysql基本操作、C++Mysql简单应用、PythonMysql简单应用

    MySql基本操作 -- 当指定名称的数据库不存在时创建它并且指定使用的字符集和排序方式 CREATE DATABASE IF NOT EXISTS db_name CHARACTER SET UTF ...

  6. Linux之基础命令——文件查看

    cat(连接文件并打印) -n :由 1 开始对所有输出的行数编号. -b :和 -n 相似,只不过对于空白行不编号. -s :当遇到有连续两行以上的空白行,就代换为一行的空白行. [cat a b  ...

  7. day24-1 元类

    目录 元类 类的组成 内置函数 exec() class关键字创建类原理 自定义元类控制类的创建 自定义元类控制类实例化 自定义元类后对象属性查找顺序 元类 在python中一切皆对象,name我们用 ...

  8. ALTER SCHEMA - 修改一个模式的定义

    SYNOPSIS ALTER SCHEMA name RENAME TO newname DESCRIPTION 描述 ALTER SCHEMA 修改一个模式的定义. 现在它唯一的功能就是重命名模式. ...

  9. svn 版本库信息修改

    root@hpcstack hpcweb]# svn info 路径: . URL: http://svn.pyindex.com/hpcweb 版本库根: http://svn.pyindex.co ...

  10. 【C语言】控制台窗口图形界面编程(四):文本输出

    目录 00. 目录 01. FillConsoleOutputAttribute函数 02. FillConsoleOutputCharacter函数 03. WriteConsoleOutputCh ...