测试结果:

sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389

#include <stdio.h>
#include <xmmintrin.h>
#define NOMINMAX
#include <windows.h>
#include <math.h>
#include <time.h> /*
* (c) Ian Stephenson
*
* ian@dctsystems.co.uk
*
* Fast pow() reference implementation
*/ /*
* http://www.dctsystems.co.uk/Software/power.html
* http://www.dctsystems.co.uk/Software/power.c
*/
// Portability shim: __forceinline is an MSVC keyword. Map it to the closest
// equivalent elsewhere unless the toolchain headers already define it.
#if !defined(_MSC_VER) && !defined(__forceinline)
#  if defined(__GNUC__)
#    define __forceinline inline __attribute__((always_inline))
#  else
#    define __forceinline inline
#  endif
#endif

// 2^23 and its reciprocal. A single-precision mantissa is 23 bits wide, so
// scaling by these converts between "raw float bit pattern read as an integer"
// and "biased exponent with the mantissa as its fractional part".
const float shift23=(1<<23);
const float OOshift23=1.0f/(1<<23);

/// Floor of a, returned as a float.
/// Goes through an int cast, so a must fit in an int.
__forceinline float myFloorf(float a)
{
    // The cast truncates toward zero; step down by one only when truncation
    // really rounded a negative value up. Comparing against 0 instead would
    // turn exact negative integers such as -2.0f into -3.0f.
    const int truncated = (int)a;
    return (float)(truncated - (a < (float)truncated));
}

/// Approximate log2(i) from the float's bit pattern.
/// The sign bit is not handled, so this is only meaningful for positive i.
__forceinline float myLog2(float i)
{
    const float LogBodge=0.346607f;
    // Reinterpret the float's bits as an integer (union punning, the same
    // idiom the other approximations in this file use).
    union { float f; int bits; } pun;
    pun.f = i;
    float x = (float)pun.bits;
    x *= OOshift23; // 1/pow(2,23)
    x = x - 127;    // remove the IEEE-754 exponent bias
    // x is now exponent + mantissa fraction; add a quadratic correction on
    // the fractional part.
    float y = x - myFloorf(x);
    y = (y - y*y) * LogBodge;
    return x + y;
}

/// Approximate 2^i by building the float's bit pattern directly.
__forceinline float myPow2(float i)
{
    const float PowBodge=0.33971f;
    // Quadratic correction on the fractional part of the exponent.
    float y = i - myFloorf(i);
    y = (y - y*y) * PowBodge;
    float x = i + 127 - y; // re-apply the exponent bias
    x *= shift23;          // pow(2,23)
    // Store the integer value as the result's raw bits.
    union { float f; int bits; } pun;
    pun.bits = (int)x;
    return pun.f;
}

/// Approximate a^b as 2^(b * log2(a)).
__forceinline float myPow(float a, float b)
{
    return myPow2(b*myLog2(a));
}

///////////////////////////////////////
/* Code below is from http://code.google.com/p/fastapprox/ */
// Portability shim: __forceinline is an MSVC keyword. Map it to the closest
// equivalent elsewhere unless the toolchain headers already define it.
#if !defined(_MSC_VER) && !defined(__forceinline)
#  if defined(__GNUC__)
#    define __forceinline inline __attribute__((always_inline))
#  else
#    define __forceinline inline
#  endif
#endif

/// Approximate 2^p. Inputs below -126 are clamped to -126.
__forceinline float fastpow2(float p)
{
    // (int) truncates toward zero, so for negative p add 1 to get the
    // fractional part measured from the floor.
    const float offset = (p < 0) ? 1.0f : 0.0f;
    const float clipp = (p < -126) ? -126.0f : p;
    const int w = (int)clipp;
    const float z = clipp - w + offset;
    // Build the result's bit pattern: the value scaled by 2^23 lands the
    // integer part in the exponent field; the rational term in z corrects
    // the mantissa.
    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
    return v.f;
}

/// Approximate log2(x) from the float's bit pattern.
/// The sign bit is not handled, so this is only meaningful for positive x.
__forceinline float fastlog2(float x)
{
    union { float f; unsigned int i; } vx = { x };
    // Same mantissa as x with the exponent forced so that mx.f is in [0.5, 1).
    union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
    float y = (float)vx.i;
    y *= 1.1920928955078125e-7f; // 2^-23
    return y - 124.22551499f
             - 1.498030302f * mx.f
             - 1.72587999f / (0.3520887068f + mx.f);
}

/// Approximate x^p as 2^(p * log2(x)).
__forceinline float fastpow(float x, float p)
{
    return fastpow2(p * fastlog2(x));
}

/////////////////////////////////////////////////
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
// Portability shim: __forceinline is an MSVC keyword. Map it to the closest
// equivalent elsewhere unless the toolchain headers already define it.
#if !defined(_MSC_VER) && !defined(__forceinline)
#  if defined(__GNUC__)
#    define __forceinline inline __attribute__((always_inline))
#  else
#    define __forceinline inline
#  endif
#endif

// Single-precision limits; guarded in case <float.h> has already been pulled in.
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+38F
#endif

/// Smaller of a and b (b when neither compares less than the other).
template <typename T>
__forceinline T min(T a, T b)
{
    return ((a < b) ? a : b);
}

/// Absolute value of x, computed by clearing the sign bit.
__forceinline float fast_fabs(float x)
{
    union { float f; unsigned int i; } v = {x};
    v.i &= 0x7FFFFFFF;
    return v.f;
}

/// Multiply and add: (a * b) + c
template <typename T>
__forceinline T madd (const T& a, const T& b, const T& c) {
    // NOTE: in the future we may want to explicitly ask for a fused
    // multiply-add in a specialized version for float.
    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang seems to leave the code alone.
    return a * b + c;
}

/// Reinterpret the bits of `in` as an OUT_TYPE (union-based type punning).
template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    union { IN_TYPE in_val; OUT_TYPE out_val; } cvt;
    cvt.in_val = in;
    return cvt.out_val;
}

/// Approximate log2(x); x is first clamped to [FLT_MIN, FLT_MAX].
__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    if (x < FLT_MIN) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    // Unbiased exponent: drop the 23 mantissa bits, subtract the bias.
    int exponent = int(bits >> 23) - 127;
    // Mantissa re-assembled as a float in [1,2), shifted to [0,1).
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    // 0 = 97.46%
    // 1 = 2.29%
    // 2 = 0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
    float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi, 0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo, 1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
}

/// Approximate 2^x; x is first clamped to [-126, 126].
__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    if (x < -126.0f) x = -126.0f;
    if (x > 126.0f) x = 126.0f;
    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    // 0 = 87.81%
    // 1 = 4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

/// Approximate x^y with explicit handling of the special cases:
///   y == 0 -> 1,  x == 0 -> 0,  y == 1 -> x,  y == 2 -> min(x*x, FLT_MAX);
///   negative x with an integer y -> signed result;
///   negative x with a non-integer y -> 0 (where powf would give NaN).
__forceinline float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f)
        return min (x*x, FLT_MAX);
    float sign = 1.0f;
    if (x < 0) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff;
        if (ybits >= 0x4b800000) {
            // always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent
            int j = ybits >> (23 - k);    // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                // +1 for even, -1 for odd: move j's low bit into the sign
                // position. Shift in unsigned so nothing overflows a signed int.
                sign = bit_cast<unsigned, float>(0x3f800000u | (unsigned(j) << 31));
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
}
/////////
/// Largest absolute element-wise difference between two float arrays of length n.
static float maxAbsError(const float *reference, const float *approx, int n)
{
    float worst = 0.0f;
    for (int i = 0; i < n; ++i)
    {
        const float err = fabsf(reference[i] - approx[i]);
        if (err > worst)
            worst = err;
    }
    return worst;
}

/// Benchmark driver: times myPow, fastpow, fast_safe_pow and powf raising N
/// random inputs to the power 0.8, then prints each approximation's maximum
/// absolute error against powf.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // NOTE(review): the original element count was lost when this listing was
    // extracted; 10,000,000 is a stand-in -- confirm against the author's source.
    const int N = 10000000;
    float *buf = new float[N];
    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];
    float *d = new float[N];

    // Inputs in [0, 1000].
    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    // clock() returns clock_t; keep that type and print it via long so the
    // format specifier matches the argument.
    clock_t start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }
    printf("sum (fast) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }
    printf("sum (fast2) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }
    printf("sum (fast3) in clock %ld\n", (long)(clock() - start_time));

    // Reference result from the C library.
    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }
    printf("sum in clock %ld\n", (long)(clock() - start_time));

    printf("Error is %f\n", maxAbsError(b, a, N));
    printf("Error2 is %f\n", maxAbsError(b, c, N));
    printf("Error3 is %f\n", maxAbsError(b, d, N));

    delete[] buf;
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    return 0;
}

fast powf的更多相关文章

  1. opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较

    opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较 参考: http://wenku.baidu.com/link?url=1aDYAJBCrrK-uk2w3sSNai7h52x_ ...

  2. 基于Fast Bilateral Filtering 算法的 High-Dynamic Range(HDR) 图像显示技术。

    一.引言 本人初次接触HDR方面的知识,有描述不正确的地方烦请见谅. 为方便文章描述,引用部分百度中的文章对HDR图像进行简单的描述. 高动态范围图像(High-Dynamic Range,简称HDR ...

  3. Fast RCNN 训练自己的数据集(3训练和检测)

    转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ https://github.com/YihangLou/fas ...

  4. Fast RCNN 训练自己数据集 (2修改数据读取接口)

    Fast RCNN训练自己的数据集 (2修改读写接口) 转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ http ...

  5. 网格弹簧质点系统模拟(Spring-Mass System by Fast Method)附源码

    弹簧质点模型的求解方法包括显式欧拉积分和隐式欧拉积分等方法,其中显式欧拉积分求解快速,但积分步长小,两个可视帧之间需要多次积分,而隐式欧拉积分则需要求解线性方程组,但其稳定性好,能够取较大的积分步长. ...

  6. XiangBai——【AAAI2017】TextBoxes_A Fast Text Detector with a Single Deep Neural Network

    XiangBai--[AAAI2017]TextBoxes:A Fast Text Detector with a Single Deep Neural Network 目录 作者和相关链接 方法概括 ...

  7. 论文笔记--Fast RCNN

    很久之前试着写一篇深度学习的基础知识,无奈下笔之后发现这个话题确实太大,今天发一篇最近看的论文Fast RCNN.这篇文章是微软研究院的Ross Girshick大神的一篇作品,主要是对RCNN的一些 ...

  8. [转]Amazon DynamoDB – a Fast and Scalable NoSQL Database Service Designed for Internet Scale Applications

    This article is from blog of Amazon CTO Werner Vogels. -------------------- Today is a very exciting ...

  9. FAST特征点检测features2D

    #include <opencv2/core/core.hpp> #include <opencv2/features2d/features2d.hpp> #include & ...

随机推荐

  1. JavaScript对象继续总结

    1.字符串对象 18_1.查看字符串的长度 var a = "hello world" alert(a.length) 18_2.遍历整个字符串的,这里的是索引 for (var ...

  2. MQ基础概念和介绍

    一.中间件 MQ是一种中间件产品,至于什么是中间件,中间件能干什么,参见以下链接: http://baike.baidu.com/view/23710.htm 二.WebSphere MQ的原理 We ...

  3. dede 复制文章,远程图片无法本地化

    解决方法: 1.找到dede的后台目录,在后台目录下的inc下找到inc_archives_functions.php 2.搜索GetCurContent函数,找到如下这段代码: preg_match ...

  4. Java的8种包装类:Wrapper Class

    Java有8种基本数据类型,为什么又要出现对应的8种包装类: 1.Java的8种基本数据类型不支持面向对象编程机制 2.8种基本数据类型不具备“对象”的特性:没有成员变量.方法可供调用 3.例如:某个 ...

  5. svn.SvnX

    1. 使用SvnX的入门 http://www.divvun.no/doc/tools/docu-svn-user-svnx.html 2. SvnX的代码 https://code.google.c ...

  6. 什么是@guid

    我将给大家讲解史上最通俗一同guid与ID的区别...

  7. <td> 行高多层设置的问题

    在一个table中,设置了class,并且对应的样式设置了td的高度时,在其嵌套的table中的td高度不能设置大于父table的td的高度. 只有一种方法可以设置,如下: <table wid ...

  8. mysql实现分页的几种方式

    mysql实现分页的几种方式: 第一种:使用框架自带的pageable来进行分页 package com.cellstrain.icell.repository.repositoryImpl; imp ...

  9. 使用process_monitor.sh监控hadoop进程的crontab配置

    可以从下列链接找到process_monitor.sh:https://github.com/eyjian/libmooon/blob/master/shell/process_monitor.sh ...

  10. Node开发文件上传系统及向七牛云存储和亚马逊AWS S3的文件上传

    背景起,有奏乐: 有伟人曰:学习技能的最好途径莫过于理论与实践相结合. 初学Node这货时,每每读教程必会Fall asleep. 当真要开发系统时,顿觉精神百倍,即便踩坑无数也不失斗志. 因为同团队 ...