/////////////////////////////////////////////////////////////////////////
//
// Performance benchmarking program for various normalize functions
//
// by Elvic Liang
//
///////////////////////////////////////////////// #include <math.h>
#include <xmmintrin.h>
#include <time.h> struct Vector
{
float x, y, z; inline Vector() {} inline Vector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} inline Vector operator * (float rhs) const
{
Vector temp;
temp.x = x * rhs;
temp.y = y * rhs;
temp.z = z * rhs;
return temp;
}
}; template <typename T>
inline T max(T a, T b)
{
return ((a > b) ? a : b);
} inline float rcpf(float x)
{
#ifdef _MSC_VER
return 1.0f / x;
#else
const __m128 a = _mm_set_ss(x);
const __m128 r = _mm_rcp_ss(a);
// one more iteration
return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), a)));
#endif
} inline float invsqrtf(float x)
{
const __m128 a = _mm_max_ss(_mm_set_ss(x), _mm_set_ss(1.0e-30f));
const __m128 r = _mm_rsqrt_ss(a);
// one more iteration
return _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
} inline float fastinvsqrt(float x)
{
float xhalf = 0.5f * x;
int i = *(int *)&x;
i = 0x5f3759df - (i >> 1);
x = *(float *)&i;
x = x * (1.5f - xhalf * x * x);
return x;
} inline float fastsqrt(float x)
{
union {
int intPart;
float floatPart;
} convertor;
union {
int intPart;
float floatPart;
} convertor2;
convertor.floatPart = x;
convertor2.floatPart = x;
convertor.intPart = 0x1fbcf800 + (convertor.intPart >> 1);
convertor2.intPart = 0x5f3759df - (convertor2.intPart >> 1);
return 0.5f * (convertor.floatPart + (x * convertor2.floatPart));
} inline float dot(const Vector & a, const Vector & b)
{
return (a.x * b.x + a.y * b.y + a.z * b.z);
} inline float len(const Vector & a)
{
const float l = dot(a, a);
return sqrtf(max(0.0f, l));
} inline Vector normalize_ref(const Vector & a)
{
float length = sqrtf(max(0.0f, a.x * a.x + a.y * a.y + a.z * a.z));
// Using division gives higher precision than multiplying (1/length)
return Vector(a.x / length, a.y / length, a.z / length);
} inline Vector normalize(const Vector & a)
{
return a * invsqrtf(dot(a, a));
} inline Vector normalize_v1(const Vector & a)
{
const __m128 pa = _mm_max_ss(_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z), _mm_set_ss(1.0e-30f));
const __m128 r = _mm_rsqrt_ss(pa);
// one more iteration
const float d = _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
_mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
return a * d;
} inline Vector normalize_v2(const Vector & a)
{
return a * fastinvsqrt(dot(a, a));
} inline Vector normalize_v3(const Vector & a)
{
// TODO: Use SSE 4.2 dot product intrinsic when available
const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
const __m128 s = _mm_mul_ps(x, x);
const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
const __m128 pa = _mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f));
const __m128 r = _mm_rsqrt_ss(pa);
// one more iteration
return a * _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f),
_mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
} inline float normalize_len(Vector & r, const Vector & a)
{
const float l = len(a);
const float d = max(l, 1.0e-30f);
r = a * rcpf(d);
return d;
} inline float normalize_len_v1(Vector & r, const Vector & a)
{
const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
r = a * rcpf(d);
return d;
} inline float normalize_len_v2(Vector & r, const Vector & a)
{
const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
const __m128 pa = _mm_set_ss(d);
const __m128 pr = _mm_rcp_ss(pa);
// one more iteration
const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
r = a * rd;
return d;
} inline float normalize_len_v3(Vector & r, const Vector & a)
{
const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_set_ss(1.0e-30f),
_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z)));
const __m128 pr = _mm_rcp_ss(pa);
// one more iteration
const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
r = a * rd;
return _mm_cvtss_f32(pa);
} inline float normalize_len_v4(Vector & r, const Vector & a)
{
const float d = fastsqrt(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
r = a * rcpf(d);
return d;
} inline float normalize_len_v5(Vector & r, const Vector & a)
{
// TODO: Use SSE 4.2 dot product intrinsic when available
const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
const __m128 s = _mm_mul_ps(x, x);
const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
const __m128 pa = _mm_sqrt_ss(
_mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f)));
const __m128 pr = _mm_rcp_ss(pa);
// one more iteration
r = a * _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
return _mm_cvtss_f32(pa);
} struct Random
{
unsigned int state; inline Random(unsigned int seed = 0x9e3779b1)
{
state = hash(seed);
} inline unsigned int hash(unsigned int a)
{
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
} inline float next_float()
{
state = hash(state);
return (state & 0xFFFFFF) * (1.0f / float(1 << 24));
} inline float next()
{
return (next_float() * 1000.0f - 500.0f);
}
}; int get_time()
{
return (int)clock();
} int main(int argc, char* argv[])
{
const int NTEST = 100000000; int rand_time = 0;
{
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v;
v.x = random.next();
v.y = random.next();
v.z = random.next();
sum += (v.x + v.y + v.z);
}
rand_time = get_time() - start_time;
printf("random: sum = %f time = %d\n", sum, rand_time);
} printf("testing performance...\n"); {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r = normalize_ref(v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_ref (reference): sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r = normalize(v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize: sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r = normalize_v1(v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_v1: sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r = normalize_v2(v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_v2 (fast): sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r = normalize_v3(v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_v3: sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
normalize_len(r, v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_len: sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
normalize_len_v1(r, v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_len_v1: sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
normalize_len_v2(r, v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_len_v2: sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
normalize_len_v3(r, v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_len_v3: sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
normalize_len_v4(r, v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_len_v4 (fast): sum = %f time = %d\n", sum, done_time);
} {
int start_time = get_time();
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r;
v.x = random.next();
v.y = random.next();
v.z = random.next();
normalize_len_v5(r, v);
sum += (r.x + r.y + r.z);
}
int done_time = get_time() - start_time - rand_time;
printf("normalize_len_v5: sum = %f time = %d\n", sum, done_time);
} printf("testing precision...\n"); {
float max_error = 0.0f;
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r1, r2;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r1 = normalize_ref(v);
r2 = normalize_v1(v);
sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
}
printf("normalize_v1: sum = %f max. error = %.17f\n", sum, max_error);
} {
float max_error = 0.0f;
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r1, r2;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r1 = normalize_ref(v);
r2 = normalize_v2(v);
sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
}
printf("normalize_v2 (fast): sum = %f max. error = %.17f\n", sum, max_error);
} {
float max_error = 0.0f;
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r1, r2;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r1 = normalize_ref(v);
r2 = normalize_v3(v);
sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
}
printf("normalize_v3: sum = %f max. error = %.17f\n", sum, max_error);
} {
float max_error = 0.0f;
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r1, r2;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r1 = normalize_ref(v);
normalize_len_v3(r2, v);
sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
}
printf("normalize_len_v3: sum = %f max. error = %.17f\n", sum, max_error);
} {
float max_error = 0.0f;
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r1, r2;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r1 = normalize_ref(v);
normalize_len_v4(r2, v);
sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
}
printf("normalize_len_v4 (fast): sum = %f max. error = %.17f\n", sum, max_error);
} {
float max_error = 0.0f;
Random random;
double sum = 0.0;
for (int i = 0; i < NTEST; ++i)
{
Vector v, r1, r2;
v.x = random.next();
v.y = random.next();
v.z = random.next();
r1 = normalize_ref(v);
normalize_len_v5(r2, v);
sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
}
printf("normalize_len_v5: sum = %f max. error = %.17f\n", sum, max_error);
} return 0;
}

输出数据:

random: sum = 296444.360077 time = 2667
testing performance...
normalize_ref (reference): sum = 3368.279413 time = 2528
normalize: sum = 3368.280312 time = 1451
normalize_v1: sum = 3368.280312 time = 1530
normalize_v2 (fast): sum = 3360.893364 time = 671
normalize_v3: sum = 3368.279959 time = 1405
normalize_len: sum = 3368.279241 time = 2465
normalize_len_v1: sum = 3368.279241 time = 2200
normalize_len_v2: sum = 3368.280580 time = 2481
normalize_len_v3: sum = 3368.280580 time = 1717
normalize_len_v4 (fast): sum = 3399.301634 time = 2044
normalize_len_v5: sum = 3368.280429 time = 1514
testing precision...
normalize_v1: sum = 6736.559725 max. error = 0.00000005960464478
normalize_v2 (fast): sum = 6729.172777 max. error = 0.00128239393234253
normalize_v3: sum = 6736.559372 max. error = 0.00000011920928955
normalize_len_v3: sum = 6736.559993 max. error = 0.00000005960464478
normalize_len_v4 (fast): sum = 6767.581047 max. error = 0.01472616195678711

normalize_len_v5: sum = 6736.559842 max. error = 0.00000005960464478

最后结论:

normalize_v3 性价比最高。如果你在3D游戏、或者其它交互性要求很高的场合使用normalize,可以考虑使用这个快速实现,只需要平台支持SSE 2.0即可。如果你对精度要求不高,可以考虑使用normalize_v2 (fast)版本的实现,它使用了Quake 3游戏引擎中的快速开平方根算法。

原创:各种normalize函数实现的性能和精度大比拼的更多相关文章

  1. Spring AOP在函数接口调用性能分析及其日志处理方面的应用

    面向切面编程可以实现在不修改原来代码的情况下,增加我们所需的业务处理逻辑,比如:添加日志.本文AOP实例是基于Aspect Around注解实现的,我们需要在调用API函数的时候,统计函数调用的具体信 ...

  2. ElasticSearch 2 (24) - 语言处理系列之停用词:性能与精度

    ElasticSearch 2 (24) - 语言处理系列之停用词:性能与精度 摘要 在信息检索早期,磁盘和内存相较我们今天的使用只是很小的一部分.将索引空间保持在一个较小的水平是至关重要的,节省每个 ...

  3. 【原创·总结】影响sql查询性能的因素

     1.表定义 (1)如果字符串字段是经常需要用到的,可以冗余,否则不要冗余 (2)经常需要作为where的查询条件的字段,可以建索引:但是过多的索引会影响写入时的性能 (3)合理定义字段的数据类型 ( ...

  4. [daily][optimize] 一个小python程序的性能优化 (python类型转换函数引申的性能优化)

    前天,20161012,到望京面试.第四个职位,终于进了二面.好么,结果人力安排完了面试时间竟然没有通知我,也没有收到短信邀请.如果没有短信邀请门口的保安大哥是不让我进去大厦的.然后,我在11号接到了 ...

  5. 【学而思】利用shouldComponentUpdate钩子函数优化react性能以及引入immutable库的必要性

    凡是参阅过react官方英文文档的童鞋大体上都能知道对于一个组件来说,其state的改变(调用this.setState()方法)以及从父组件接受的props发生变化时,会导致组件重渲染,正所谓&qu ...

  6. JavaScript 小函数积累及性能优化

    获取值的类型: var toString = Object.prototype.toString; function getType(o) { return toString.call(o).slic ...

  7. 【react】利用shouldComponentUpdate钩子函数优化react性能以及引入immutable库的必要性

    凡是参阅过react官方英文文档的童鞋大体上都能知道对于一个组件来说,其state的改变(调用this.setState()方法)以及从父组件接受的props发生变化时,会导致组件重渲染,正所谓&qu ...

  8. Mysql聚合函数count(*) 的性能分析

    你首先要明确的是,在不同的 MySQL 引擎中,count(*) 有不同的实现方式. MyISAM 引擎把一个表的总行数存在了磁盘上,因此执行 count(*) 的时候会直接返回这个数,效率很高: 而 ...

  9. [原创]SQL表值函数:把用逗号分隔的字符串转换成表格数据

      我们日常开发过程中,非常常见的一种需求,把某一个用逗号或者/或者其他符号作为间隔的字符串分隔成一张表数据. 在前面我们介绍了 [原创]SQL 把表中字段存储的逗号隔开内容转换成列表形式,当然按照这 ...

随机推荐

  1. onlevelwasloaded的调用时机

    并不是任何场景加载完成后都会触发该事件,必须使用场景加载API才能触发onlevelwasloaded函数,场景加载API如 scenemanager.loadscene等,additive模式不触发 ...

  2. select sum也会返回null值

    SELECT  SUM(detail.VAL)  FROM   AI_SDP_ORDER_MONTH_DETAIL_201706    detail 如果所有的VAL都是null的话,或者根本就不存在 ...

  3. <转> win7下破解已安装的VS2008

    http://renfan.blog.51cto.com/3466724/754301 在win7系统下,已安装的VS2008维护界面是不显示序列号输入的,需要按以下步骤进行才能破解 1.点击“卸载/ ...

  4. Spring3.0+Hibernate+Atomikos集成构建JTA的分布式事务--解决多数据源跨库事务

    一.概念 分布式事务分布式事务是指事务的参与者.支持事务的服务器.资源服务器以及事务管理器分别位于不同的分布式系统的不同节点之上.简言之,同时操作多个数据库保持事务的统一,达到跨库事务的效果. JTA ...

  5. Java Socket编程之TCP

    基于TCP的Socket通信: 服务器端: 创建一个服务器端Socket,即ServerSocket,指定绑定的端口,并监听此端口 调用accept()方法开始监听,等待客户端的连接 连接建立后,通过 ...

  6. 从零开始搭建k8s-20180301

    yum install -y yum-utils git etcd yum-config-manager --add-repo https://download.docker.com/linux/ce ...

  7. Scala基础:类和构造器

    类 package com.zy.scala.cls /** * 在 Scala 中,类并不用声明为 public 类型的. * Scala 源文件中可以包含多个类,所有这些类都具有共有可见性. */ ...

  8. solr 基本命令二(权重查找)

    package zr.com.solr.utils; import java.io.IOException; import java.util.HashMap; import java.util.Li ...

  9. 5-SOM神经网络

    https://blog.csdn.net/gavinlyy/article/details/8302814 https://blog.csdn.net/u011314012/article/deta ...

  10. shiro 权限集成 sessionManager 配置 学习记录(三)

    1.shiro配置文件增加sessionManager管理 <!-- 6.shiro结合Session会话管理器 start --> <bean id="sessionMa ...