测试结果:

sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389

#include <stdio.h>
#include <xmmintrin.h>
#define NOMINMAX
#include <windows.h>
#include <math.h>
#include <time.h> /*
* (c) Ian Stephenson
*
* ian@dctsystems.co.uk
*
* Fast pow() reference implementation
*/ /*
* http://www.dctsystems.co.uk/Software/power.html
* http://www.dctsystems.co.uk/Software/power.c
*/
/* 2^23 and its reciprocal: the scale between a float's raw bit pattern
 * (read as an integer) and "biased exponent + mantissa fraction". */
const float shift23 = (1 << 23);
const float OOshift23 = 1.0 / (1 << 23);

/*
 * Cheap floor() substitute: truncate toward zero, then subtract one for
 * negative inputs.
 *
 * NOTE: for negative whole numbers the result is one too low
 * (myFloorf(-1.0f) == -2.0f). The callers below only use a - myFloorf(a)
 * inside (y - y*y), which is zero for both y == 0 and y == 1, so their
 * results are unaffected.
 */
__forceinline float myFloorf(float a)
{
    return (float)((int)a - (a < 0.0f));
}

/*
 * Approximate log2(i) for positive, normal i.
 *
 * The raw bits of i, scaled by 2^-23 and with the exponent bias (127)
 * removed, give a piecewise-linear log2; a quadratic "bodge" term on the
 * fractional part reduces the error of that approximation.
 */
__forceinline float myLog2(float i)
{
    const float LogBodge = 0.346607f;
    // Union punning (as used by fastpow2 in this file) instead of *(int *)&i.
    union { float f; int n; } bits = { i };
    float x = (float)bits.n;
    x *= OOshift23; // 1/pow(2,23)
    x = x - 127;    // remove the IEEE-754 exponent bias
    float y = x - myFloorf(x);
    y = (y - y * y) * LogBodge;
    return x + y;
}

/*
 * Approximate 2^i; the inverse of myLog2.
 *
 * Builds the float bit pattern directly: (i + 127) * 2^23 places the integer
 * part of i in the exponent field and the fraction in the mantissa, after a
 * quadratic correction on the fractional part.
 * The conversion to int is only meaningful while (i + 127) * 2^23 fits in an
 * int, i.e. for i roughly within the normal float exponent range.
 */
__forceinline float myPow2(float i)
{
    const float PowBodge = 0.33971f;
    float y = i - myFloorf(i);
    y = (y - y * y) * PowBodge;
    float x = i + 127 - y;
    x *= shift23; // pow(2,23)
    union { int n; float f; } bits = { (int)x };
    return bits.f;
}

/* Approximate a^b for a > 0 as 2^(b * log2(a)). */
__forceinline float myPow(float a, float b)
{
    return myPow2(b * myLog2(a));
}

///////////////////////////////////////
/* The code below is from http://code.google.com/p/fastapprox/ */
/*
 * Approximate 2^p (fastapprox).
 *
 * p is clamped below at -126. The float bit pattern is assembled directly
 * from p plus a rational correction in z, where z is the fractional part of
 * the clamped input (shifted by +1 for negative p so it stays non-negative).
 */
__forceinline float fastpow2(float p)
{
    float offset = (p < 0) ? 1.0f : 0.0f;
    float clipp = (p < -126) ? -126.0f : p;
    int w = (int)clipp;
    float z = clipp - w + offset;
    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
    return v.f;
}

/*
 * Approximate log2(x) for positive x (fastapprox).
 *
 * y is the raw bit pattern scaled by 2^-23 (biased exponent + mantissa
 * fraction); mx is the mantissa of x re-exponented into [0.5, 1) and feeds
 * the rational correction term.
 */
__forceinline float fastlog2(float x)
{
    union { float f; unsigned int i; } vx = { x };
    union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
    float y = (float)vx.i;
    y *= 1.1920928955078125e-7f; // 2^-23
    return y - 124.22551499f
             - 1.498030302f * mx.f
             - 1.72587999f / (0.3520887068f + mx.f);
}

/* Approximate x^p for x > 0 as 2^(p * log2(x)). */
__forceinline float fastpow(float x, float p)
{
    return fastpow2(p * fastlog2(x));
}

/////////////////////////////////////////////////
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
// Float range limits used by the clamps below. Guarded so they compose with
// any definition that is already in scope (e.g. from <float.h>).
#ifndef FLT_MIN
#define FLT_MIN 1.175494351e-38F
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+38F
#endif

/// Returns the smaller of a and b.
template <typename T>
__forceinline T min(T a, T b)
{
    return ((a < b) ? a : b);
}

/// Absolute value of x, computed by clearing the sign bit.
__forceinline float fast_fabs(float x)
{
    union { float f; unsigned int i; } v = {x};
    v.i &= 0x7FFFFFFF;
    return v.f;
}

/// Multiply and add: (a * b) + c
template <typename T>
__forceinline T madd (const T& a, const T& b, const T& c) {
    // NOTE: in the future we may want to explicitly ask for a fused
    // multiply-add in a specialized version for float.
    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang seems to leave the code alone.
    return a * b + c;
}

/// Reinterprets the bits of `in` as OUT_TYPE (union-based type punning).
template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    union { IN_TYPE in_val; OUT_TYPE out_val; } cvt;
    cvt.in_val = in;
    return cvt.out_val;
}

/// Approximate log2(x). Inputs outside [FLT_MIN, FLT_MAX], including NaN,
/// are clamped into that range first.
__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans.
    // The negated comparison is deliberate: `x < FLT_MIN` is false for NaN
    // and would let it through, whereas `!(x >= FLT_MIN)` maps NaN to FLT_MIN.
    if (!(x >= FLT_MIN)) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    int exponent = int(bits >> 23) - 127; // unbiased exponent
    // mantissa re-exponented into [1, 2), then shifted to [0, 1)
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    // 0 = 97.46%
    // 1 = 2.29%
    // 2 = 0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
    float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi, 0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo, 1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
}

/// Approximate 2^x. x is clamped to [-126, 126]; NaN is treated as -126.
__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    // (negated comparison so NaN is clamped too; int(NaN) below would be undefined)
    if (!(x >= -126.0f)) x = -126.0f;
    if (x > 126.0f) x = 126.0f;
    // range reduction
    int m = int(x);
    x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    // 0 = 87.81%
    // 1 = 4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

/// Approximate x^y without producing NaN.
///
/// Returns 1 for y == 0, 0 for x == 0, x for y == 1, and min(x*x, FLT_MAX)
/// for y == 2. For negative x only integer exponents are evaluated; a
/// non-integer exponent yields 0 (powf would return NaN).
__forceinline float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f)
        return min (x*x, FLT_MAX);
    float sign = 1.0f;
    if (x < 0) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff;
        if (ybits >= 0x4b800000) {
            // always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent
            int j = ybits >> (23 - k);    // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                // +1 for even, -1 for odd: the low bit of j becomes the sign bit.
                // Shifted as unsigned because moving a 1 into the sign bit of a
                // signed int is not well defined.
                sign = bit_cast<unsigned, float>(0x3f800000u | (unsigned(j) << 31));
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
}

/////////
/* Largest absolute element-wise difference between two arrays of `count` floats. */
static float max_abs_error(const float *lhs, const float *rhs, int count)
{
    float max_err = 0.0f;
    for (int i = 0; i < count; ++i)
    {
        float err = fabsf(lhs[i] - rhs[i]);
        if (err > max_err)
            max_err = err;
    }
    return max_err;
}

/*
 * Benchmark driver: raises N random values in [0, 1000] to the power 0.8
 * with each approximation and with powf, prints the elapsed clock() ticks
 * for each, then prints each approximation's maximum absolute error
 * relative to powf.
 */
int main(int argc, char *argv[])
{
    // NOTE(review): the element count was lost when this listing was
    // extracted; 10000000 is a placeholder -- confirm against the original.
    const int N = 10000000;
    float *buf = new float[N];
    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];
    float *d = new float[N];
    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    // clock() returns clock_t, not int: keep the type and print it as long.
    clock_t start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }
    printf("sum (fast) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }
    printf("sum (fast2) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }
    printf("sum (fast3) in clock %ld\n", (long)(clock() - start_time));

    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }
    printf("sum in clock %ld\n", (long)(clock() - start_time));

    // b holds the powf reference results.
    printf("Error is %f\n", max_abs_error(a, b, N));
    printf("Error2 is %f\n", max_abs_error(b, c, N));
    printf("Error3 is %f\n", max_abs_error(b, d, N));

    delete[] buf;
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    return 0;
}

fast powf的更多相关文章

  1. opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较

    opencv中的SIFT,SURF,ORB,FAST 特征描叙算子比较 参考: http://wenku.baidu.com/link?url=1aDYAJBCrrK-uk2w3sSNai7h52x_ ...

  2. 基于Fast Bilateral Filtering 算法的 High-Dynamic Range(HDR) 图像显示技术。

    一.引言 本人初次接触HDR方面的知识,有描述不正确的地方烦请见谅. 为方便文章描述,引用部分百度中的文章对HDR图像进行简单的描述. 高动态范围图像(High-Dynamic Range,简称HDR ...

  3. Fast RCNN 训练自己的数据集(3训练和检测)

    转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ https://github.com/YihangLou/fas ...

  4. Fast RCNN 训练自己数据集 (2修改数据读取接口)

    Fast RCNN训练自己的数据集 (2修改读写接口) 转载请注明出处,楼燚(yì)航的blog,http://www.cnblogs.com/louyihang-loves-baiyan/ http ...

  5. 网格弹簧质点系统模拟(Spring-Mass System by Fast Method)附源码

    弹簧质点模型的求解方法包括显式欧拉积分和隐式欧拉积分等方法,其中显式欧拉积分求解快速,但积分步长小,两个可视帧之间需要多次积分,而隐式欧拉积分则需要求解线性方程组,但其稳定性好,能够取较大的积分步长. ...

  6. XiangBai——【AAAI2017】TextBoxes_A Fast Text Detector with a Single Deep Neural Network

    XiangBai--[AAAI2017]TextBoxes:A Fast Text Detector with a Single Deep Neural Network 目录 作者和相关链接 方法概括 ...

  7. 论文笔记--Fast RCNN

    很久之前试着写一篇深度学习的基础知识,无奈下笔之后发现这个话题确实太大,今天发一篇最近看的论文Fast RCNN.这篇文章是微软研究院的Ross Girshick大神的一篇作品,主要是对RCNN的一些 ...

  8. [转]Amazon DynamoDB – a Fast and Scalable NoSQL Database Service Designed for Internet Scale Applications

    This article is from blog of Amazon CTO Werner Vogels. -------------------- Today is a very exciting ...

  9. FAST特征点检测features2D

    #include <opencv2/core/core.hpp> #include <opencv2/features2d/features2d.hpp> #include ...

随机推荐

  1. 接触mybatis使用

    1.mybatis mybatis是一个自定义sql.存储过程和高级映射的持久层框架,是Apache下的顶级项目. mybatis可以让程序员将主要精力放在sql上,通过mybatis提供的映射方式. ...

  2. maven的下载

    1.maven的下载地址:http://maven.apache.org/download.cgi.下载3.3.9版本(在选择下载的版本是要与JDK契合). 2.下载之后,解压的文件放的位置尽量不要有 ...

  3. 一个新的threejs理论基础学习网站

    网站:  https://webglfundamentals.org/ 

  4. vue1.0学习

    vue 一片html代码配合上json,在new出来vue实例 Demo:1 数据双向绑定(v-model="message",{{message}}) <div id=" ...

  5. Windows下误删资料的恢复

    只要三步,就能找回你删掉并清空回收站的东西 : 1.打开“运行”消息框,然后输入regedit (打开注册表) 2.依次展开:HEKEY——LOCAL——MACHIME/SOFTWARE/micros ...

  6. 关于UI设计行业的认识再到认识

    相信很多同学和我一样提及到UI行业时,尤其是连门槛都没有踏入半步时,总会一脸茫然. 我也是一样的,我刚接触UI的前半个月,文章读过好多,作品也看过好多,什么"小白入门UI的十大建议啊" ...

  7. jsp传中文乱码问题 encodeURIComponent()编码方法

    方法一: jQuery.ajax({            type:"POST",            url:"${ctx}/offer.do",     ...

  8. C++11新特性——The C++ standard library, 2nd Edition 笔记(一)

    前言 这是我阅读<The C++ standard library, 2nd Edition>所做读书笔记的第一篇.这个系列基本上会以一章一篇的节奏来写,少数以C++03为主的章节会和其它 ...

  9. 构建ASP.NET网站十大必备工具

    最近使用ASP.NET为公司构建了一个简单的公共网站(该网站的地址:http://superexpert.com/).在这个过程中,我们使用了数量很多的免费工具,如果把构建ASP.NET网站的必备工具 ...

  10. 2018.09.11 loj#10216.五指山(exgcd)

    传送门 就是一个exgcd的板子. 但注意算距离差的时候是在一个环上面算. 还有,答案要开long long233... 注意这两点之后就是exgcd板子了. 代码: #include<bits ...