MurmurHash3 created by Austin Appleby, Java port authored by Yonik Seeley

package util.hash;

/**
* The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain.
* This java port was authored by Yonik Seeley and also placed into the public domain.
* The author hereby disclaims copyright to this source code.
* <p>
* This produces exactly the same hash values as the final C++
* version of MurmurHash3 and is thus suitable for producing the same hash values across
* platforms.
* <p>
* The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids.
* murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash.
* <p>
* Note - The x86 and x64 versions do _not_ produce the same results, as the
* algorithms are optimized for their respective platforms.
* <p>
* See http://github.com/yonik/java_util for future updates to this file.
*/
public final class MurmurHash3 { /** 128 bits of state */
public static final class LongPair {
public long val1;
public long val2;
} public static final int fmix32(int h) {
h ^= h >>> 16;
h *= 0x85ebca6b;
h ^= h >>> 13;
h *= 0xc2b2ae35;
h ^= h >>> 16;
return h;
} public static final long fmix64(long k) {
k ^= k >>> 33;
k *= 0xff51afd7ed558ccdL;
k ^= k >>> 33;
k *= 0xc4ceb9fe1a85ec53L;
k ^= k >>> 33;
return k;
} /** Gets a long from a byte buffer in little endian byte order. */
public static final long getLongLittleEndian(byte[] buf, int offset) {
return ((long)buf[offset+7] << 56) // no mask needed
| ((buf[offset+6] & 0xffL) << 48)
| ((buf[offset+5] & 0xffL) << 40)
| ((buf[offset+4] & 0xffL) << 32)
| ((buf[offset+3] & 0xffL) << 24)
| ((buf[offset+2] & 0xffL) << 16)
| ((buf[offset+1] & 0xffL) << 8)
| ((buf[offset ] & 0xffL)); // no shift needed
} /** Returns the MurmurHash3_x86_32 hash. */
public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) { final int c1 = 0xcc9e2d51;
final int c2 = 0x1b873593; int h1 = seed;
int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block for (int i=offset; i<roundedEnd; i+=4) {
// little endian load order
int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2; h1 ^= k1;
h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
} // tail
int k1 = 0; switch(len & 0x03) {
case 3:
k1 = (data[roundedEnd + 2] & 0xff) << 16;
// fallthrough
case 2:
k1 |= (data[roundedEnd + 1] & 0xff) << 8;
// fallthrough
case 1:
k1 |= (data[roundedEnd] & 0xff);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
} // finalization
h1 ^= len; // fmix(h1);
h1 ^= h1 >>> 16;
h1 *= 0x85ebca6b;
h1 ^= h1 >>> 13;
h1 *= 0xc2b2ae35;
h1 ^= h1 >>> 16; return h1;
} /** Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding
* the string to a temporary buffer. This is more than 2x faster than hashing the result
* of String.getBytes().
*/
public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) { final int c1 = 0xcc9e2d51;
final int c2 = 0x1b873593; int h1 = seed; int pos = offset;
int end = offset + len;
int k1 = 0;
int k2 = 0;
int shift = 0;
int bits = 0;
int nBytes = 0; // length in UTF8 bytes while (pos < end) {
int code = data.charAt(pos++);
if (code < 0x80) {
k2 = code;
bits = 8; /***
// optimized ascii implementation (currently slower!!! code size?)
if (shift == 24) {
k1 = k1 | (code << 24);
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
shift = 0;
nBytes += 4;
k1 = 0;
} else {
k1 |= code << shift;
shift += 8;
}
continue;
***/ }
else if (code < 0x800) {
k2 = (0xC0 | (code >> 6))
| ((0x80 | (code & 0x3F)) << 8);
bits = 16;
}
else if (code < 0xD800 || code > 0xDFFF || pos>=end) {
// we check for pos>=end to encode an unpaired surrogate as 3 bytes.
k2 = (0xE0 | (code >> 12))
| ((0x80 | ((code >> 6) & 0x3F)) << 8)
| ((0x80 | (code & 0x3F)) << 16);
bits = 24;
} else {
// surrogate pair
// int utf32 = pos < end ? (int) data.charAt(pos++) : 0;
int utf32 = (int) data.charAt(pos++);
utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
k2 = (0xff & (0xF0 | (utf32 >> 18)))
| ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
| ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
| (0x80 | (utf32 & 0x3F)) << 24;
bits = 32;
} k1 |= k2 << shift; // int used_bits = 32 - shift; // how many bits of k2 were used in k1.
// int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift shift += bits;
if (shift >= 32) {
// mix after we have a complete word k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2; h1 ^= k1;
h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
h1 = h1*5+0xe6546b64; shift -= 32;
// unfortunately, java won't let you shift 32 bits off, so we need to check for 0
if (shift != 0) {
k1 = k2 >>> (bits-shift); // bits used == bits - newshift
} else {
k1 = 0;
}
nBytes += 4;
} } // inner // handle tail
if (shift > 0) {
nBytes += shift >> 3;
k1 *= c1;
k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
} // finalization
h1 ^= nBytes; // fmix(h1);
h1 ^= h1 >>> 16;
h1 *= 0x85ebca6b;
h1 ^= h1 >>> 13;
h1 *= 0xc2b2ae35;
h1 ^= h1 >>> 16; return h1;
} /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". */
public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
// The original algorithm does have a 32 bit unsigned seed.
// We have to mask to match the behavior of the unsigned types and prevent sign extension.
long h1 = seed & 0x00000000FFFFFFFFL;
long h2 = seed & 0x00000000FFFFFFFFL; final long c1 = 0x87c37b91114253d5L;
final long c2 = 0x4cf5ad432745937fL; int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block
for (int i=offset; i<roundedEnd; i+=16) {
long k1 = getLongLittleEndian(key, i);
long k2 = getLongLittleEndian(key, i+8);
k1 *= c1; k1 = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
h1 = Long.rotateLeft(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
k2 *= c2; k2 = Long.rotateLeft(k2,33); k2 *= c1; h2 ^= k2;
h2 = Long.rotateLeft(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
} long k1 = 0;
long k2 = 0; switch (len & 15) {
case 15: k2 = (key[roundedEnd+14] & 0xffL) << 48;
case 14: k2 |= (key[roundedEnd+13] & 0xffL) << 40;
case 13: k2 |= (key[roundedEnd+12] & 0xffL) << 32;
case 12: k2 |= (key[roundedEnd+11] & 0xffL) << 24;
case 11: k2 |= (key[roundedEnd+10] & 0xffL) << 16;
case 10: k2 |= (key[roundedEnd+ 9] & 0xffL) << 8;
case 9: k2 |= (key[roundedEnd+ 8] & 0xffL);
k2 *= c2; k2 = Long.rotateLeft(k2, 33); k2 *= c1; h2 ^= k2;
case 8: k1 = ((long)key[roundedEnd+7]) << 56;
case 7: k1 |= (key[roundedEnd+6] & 0xffL) << 48;
case 6: k1 |= (key[roundedEnd+5] & 0xffL) << 40;
case 5: k1 |= (key[roundedEnd+4] & 0xffL) << 32;
case 4: k1 |= (key[roundedEnd+3] & 0xffL) << 24;
case 3: k1 |= (key[roundedEnd+2] & 0xffL) << 16;
case 2: k1 |= (key[roundedEnd+1] & 0xffL) << 8;
case 1: k1 |= (key[roundedEnd ] & 0xffL);
k1 *= c1; k1 = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
} //----------
// finalization h1 ^= len; h2 ^= len; h1 += h2;
h2 += h1; h1 = fmix64(h1);
h2 = fmix64(h2); h1 += h2;
h2 += h1; out.val1 = h1;
out.val2 = h2;
} }

MurMurHash3的更多相关文章

  1. Metadata Lock原理8

    http://www.kancloud.cn/taobaomysql/monthly/67141 MySQL· 5.7优化·Metadata Lock子系统的优化 背景 引入MDL锁的目的,最初是为了 ...

  2. 剖析Elasticsearch集群系列第一篇 Elasticsearch的存储模型和读写操作

    剖析Elasticsearch集群系列涵盖了当今最流行的分布式搜索引擎Elasticsearch的底层架构和原型实例. 本文是这个系列的第一篇,在本文中,我们将讨论的Elasticsearch的底层存 ...

  3. ElasticSearch入门(2) —— 基础概念

    在Elasticsearch中,文档归属于一种类型(type),而这些类型存在于索引(index)中,我们可以画一些简单的对比图来类比传统关系型数据库: Relational DB -> Dat ...

  4. MinHash 原理

    最小哈希原理介绍 MinHash是基于Jaccard Index相似度(海量数据不可行)的算法,一种降维的方法A,B 两个集合:A = {s1, s3, s6, s8, s9}  B = {s3, s ...

  5. Shodan的http.favicon.hash语法详解与使用技巧

    在Shodan搜索中有一个关于网站icon图标的搜索语法,http.favicon.hash,我们可以使用这个语法来搜索出使用了同一icon图标的网站,不知道怎么用的朋友请参考我上一篇文章. 通过上一 ...

  6. Java Bloom filter几种实现比较

    英文原始出处: Bloom filter for Scala, the fastest for JVM 本文介绍的是用Scala实现的Bloom filter. 源代码在github上.依照性能测试结 ...

  7. redis 系列6 数据结构之字典(下)

    一.概述 接着上篇继续,这篇把数据结构之字典学习完, 这篇知识点包括:哈希算法,解决键冲突, rehash , 渐进式rehash,字典API. 1.1 哈希算法 当一个新的键值对 需要添加到字典里面 ...

  8. 【转】解决Maxwell发送Kafka消息数据倾斜问题

    最近用Maxwell解析MySQL的Binlog,发送到Kafka进行处理,测试的时候发现一个问题,就是Kafka的Offset严重倾斜,三个partition,其中一个的offset已经快200万了 ...

  9. 大数据量下的集合过滤—Bloom Filter

    算法背景 如果想判断一个元素是不是在一个集合里,一般想到的是将集合中所有元素保存起来,然后通过比较确定.链表.树.散列表(又叫哈希表,Hash table)等等数据结构都是这种思路,存储位置要么是磁盘 ...

随机推荐

  1. js模块化开发——require.js的用法详细介绍(含jsonp)

    RequireJS的目标是鼓励代码的模块化,它使用了不同于传统<script>标签脚本加载步骤.可以用它来加速.优化代码,但其主要的目的还是为了代码的模块化.它鼓励在使用脚本以module ...

  2. 天兔(Lepus)监控系统慢查询分析平台安装配置

    被监控端要安装pt工具 [root@HE1~]## yum -y install perl-IO-Socket-SSL [root@HE1~]## yum -y install perl-DBI [r ...

  3. abstract、override、new、virtual、sealed使用和示例

    abstract修饰类名为抽象类,修饰方法为抽象方法.如果一个类为抽象类,则这个类只能是其他某个类的基类.抽象方法在抽象类中没有函数体.抽象类中的抽象方法是没有方法体的,继承其的子类必须实现抽象类的抽 ...

  4. 基于Xcode8插件开发~一键检测处理头文件引用

    Xcode8开放了新的一个Extension:Xcode Source Editor Extension,目的是让开发者可以正规的自主为IDE编写插件,虽然说系统现提供的功能还比较拮据,但是不妨碍我们 ...

  5. 微信小程序之快速接入七牛云

    小程序为什么要接入云? 目前,开发者在开发小程序过程中,主要遇到以下几个问题: 小程序发布大小超限 微信官方限制小程序的发布代码不能超过 1MB,而在实际开发过程中,一般的小程序难免会有图片等富媒体文 ...

  6. 私有云存储搭建(owncloud)

    第一步.搭建LAMP(基于linux7.1.1503) 1 配置yum(网络加本地,下面为网络) [vault.centos.org_7.1.1503_os_x86_64_] name=added f ...

  7. Android客户端连接服务器端,向服务器端发送请求HttpURLConnection

    在Java中向后台服务器发送请求一般都直接使用了Java的网络编程,或者使用HttpClient向后台服务器端发送HTTP请求.虽然在安卓中,所有Java的API都可以使用,而且使用其并不会出现什么问 ...

  8. ODBC

    ODBC是80年代末90年代初出现的技术,它为编写关系数据库的客户软件提供了统一的接口.ODBC只提供单一的API,可用于处理不同数据库的客户应用程序.使用ODBC API的应用程序可以与任何具有OD ...

  9. AFNetworking2.0和AFNetworking3.0 的HTTPS的配置

    前言: 由于苹果声明在前说是2017.01.01之后提交审核的APP,必须使用HTTPS请求,要不就直接驳回审核,吓得我们年前赶紧提交了一个版本,想着年后在弄这个https,结果又有消息说是苹果推迟了 ...

  10. springMVC整合Junit4进行单元测试

    springMVC整合Junit4进行单元测试 标签: springMVC整合Junit4junit单元测试教程springMVC入门教程   spring(10)  版权声明:本文为博主原创文章,未 ...