opencl gauss filter优化(二)

1.buffer使用image的方式：Horizontal 与 Vertical 算法一样, 共需30ms,wait time 19ms.

// Sampler shared by the kernels below: unnormalized (pixel) coordinates,
// out-of-range coordinates clamped to the nearest edge texel, and
// nearest-texel lookup (no interpolation).
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Horizontal pass of the Gaussian filter, scalar sliding-window version.
// Each work-item handles one image row (index y) and walks it left to right.
//
//   source    - input image; only the .x channel of each texel is read
//   dest      - output image; one filtered value is written per pixel
//   imgWidth  - number of columns processed in the row
//   imgHeight - number of rows; work-items with y >= imgHeight return early
//
// NOTE(review): the integer literals of this listing (the get_global_id
// argument, array sizes, loop bounds, filter numerators, macro arguments)
// appear to have been lost when the post was extracted, so the code does not
// compile as shown. Presumably s = 11 and nStart = 5, matching the 11
// coefficients below and the "lines[s-1]" / "(j+5, y)" snippets quoted later
// in the notes -- TODO confirm against the original post.
__kernel void ImageGaussianFilterHorizontal(__read_only image2d_t  source, // Source image

                            __write_only image2d_t   dest,  // Intermediate dest image

                                             const int imgWidth ,                // Image width

                                             const int imgHeight)                // Image height

{

    const int y = get_global_id();   // row handled by this work-item

    if(y>=(imgHeight))

        return;

    // Filter taps, each expressed as <numerator>/256.0 (numerators missing here).
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

    const int s = ;        // window length (value missing)

    const int nStart = ;   // index of the centre tap (value missing)

    // Sliding window of source samples from row y.
    float lines[];

    // Prime the window. The x coordinate is i minus a stripped offset;
    // presumably it starts left of column 0 and relies on the sampler's
    // clamp-to-edge addressing -- TODO confirm.
    for(int i=;i<;i++)

        lines[i] = read_imagef( source, sampler,  (int2) (i-, y) ).x;

    // j is advanced inside the body, between the write and the next read.
    for(int j=;j<imgWidth;){

    // Start with the centre tap, then add the symmetric pairs around it.
    float sum = lines[nStart] * m_nFilter[nStart];

// Adds the pair of samples that share coefficient m_nFilter[m].
// NOTE(review): the blank line after the continuation backslash and the
// "s--m" index look like extraction damage (presumably "s-1-m").
#define    GaussianTwoLines(m) \

    sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );

        // Five pair additions; the macro arguments are missing in this listing.
        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()    

        // NOTE(review): sum is a scalar float while write_imagef takes a
        // float4 colour; this relies on implicit scalar-to-vector widening.
        write_imagef( dest, (int2) (j, y), sum );

        // Shift the window one sample to the left.
        for(int i = ; i<s-; i++) lines[i] = lines[i+];

        j++;

        // Load the new rightmost sample for the next output pixel.
        lines[s-] = read_imagef( source, sampler, (int2) (j+, y) ).x;

    }

}

// Vertical pass of the Gaussian filter, scalar sliding-window version.
// Each work-item handles one image column (index x) and walks it top to
// bottom, using the same window scheme as the horizontal kernel.
//
//   source    - input image; only the .x channel of each texel is read
//   dest      - output image; one filtered value is written per pixel
//   imgWidth  - number of columns; work-items with x >= imgWidth return early
//   imgHeight - number of rows processed in the column
//
// NOTE(review): the integer literals of this listing appear to have been
// lost during extraction (see the empty get_global_id(), array sizes, loop
// bounds and filter numerators), so the code does not compile as shown.
__kernel void ImageGaussianFilterVertical(__read_only image2d_t  source, // Source image

                        __write_only image2d_t   dest,    // Destination image

                         const int imgWidth ,             // Image width

                        const int imgHeight)              // Image height

{

    const int x = get_global_id();   // column handled by this work-item

    if(x>=(imgWidth))

        return;

    // Filter taps, each expressed as <numerator>/256.0 (numerators missing here).
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

    const int s = ;        // window length (value missing)

    const int nStart = ;   // index of the centre tap (value missing)

    // Sliding window of source samples from column x.
    float lines[];

    // Prime the window. The y coordinate is i minus a stripped offset;
    // presumably it starts above row 0 and relies on the sampler's
    // clamp-to-edge addressing -- TODO confirm.
    for(int i=;i<;i++)

        lines[i] = read_imagef( source, sampler,  (int2) (x ,i-) ).x;

    // j is advanced inside the body, between the write and the next read.
    for(int j=;j<imgHeight;){

    // Start with the centre tap, then add the symmetric pairs around it.
    float sum = lines[nStart] * m_nFilter[nStart];

// Adds the pair of samples that share coefficient m_nFilter[m].
// NOTE(review): the blank line after the continuation backslash and the
// "s--m" index look like extraction damage (presumably "s-1-m").
#define    GaussianTwoLines(m) \

    sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );

        // Five pair additions; the macro arguments are missing in this listing.
        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        // NOTE(review): sum is a scalar float while write_imagef takes a
        // float4 colour; this relies on implicit scalar-to-vector widening.
        write_imagef( dest, (int2) (x, j), sum );

        // Shift the window one sample towards index 0.
        for(int i = ; i<s-; i++) lines[i] = lines[i+];

        j++;

        // Load the new bottom-most sample for the next output pixel.
        lines[s-] = read_imagef( source, sampler, (int2) (x,j+) ).x;

    }

}

2.只运行 Horizontal 19ms,wait time 19ms. 注释掉 write_imagef 2.4ms(wait time,run time都是0.0xms)(更新：sum计算被优化,0.x ms就是读image的时间).

a.顺序调整为：

lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

write_imagef( dest, (int2) (j-1, y), sum );

16.9ms,很奇怪sum用固定的0,0.2替代时间只有3.9ms?????把计算部分注释掉，只读写image,也是3.9ms, 计算sum的部分被编译器优化掉了？

b. if(sum>0)

lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

write_imagef( dest, (int2) (j-1, y), 0.2 );

如此测试,17ms,看来是sum的计算被优化掉了.

c.if(sum>=0)

j++;

//lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

//write_imagef( dest, (int2) (j-1, y), sum );

只计算,5.7ms,但还是wait time 5.7ms？？？

3.使用float16 vector 计算,总共耗时15.6 ms,wait time 9.3ms,run time 6.3ms.使用 __attribute__ 能减少1ms以内.其中Horizontal:wait time 9.4ms,run time 0.008ms ,Vertical:wait time 0.07ms,run time 6.4ms.

不知道为什么使用fma指令替代sum+= ,需要近2s,而且localWorksize最大只能32.

使用half16 精度，反而还要17ms,而且结果有1-2的误差。

// Sampler shared by the kernels below: unnormalized (pixel) coordinates,
// out-of-range coordinates clamped to the nearest edge texel, and
// nearest-texel lookup (no interpolation).
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// Horizontal pass of the Gaussian filter, float16 version.
// Each work-item handles one image row (index y) and produces 16 output
// pixels per loop iteration from two float16 registers of source samples.
//
//   source    - input image; only the .x channel of each texel is read
//   dest      - output image; one filtered value is written per pixel
//   imgWidth  - number of columns processed in the row
//   imgHeight - number of rows; work-items with y >= imgHeight return early
//
// NOTE(review): the integer literals of this listing (the attribute and
// get_global_id arguments, filter numerators and indices, read offsets, loop
// start and step) appear to have been lost during extraction, so the code
// does not compile as shown. The loop step is presumably 16, matching the 16
// pixels written per iteration -- TODO confirm against the original post.
__kernel __attribute__((work_group_size_hint(,,)))

void ImageGaussianFilterHorizontal(__read_only image2d_t  source, // Source image

                                    __write_only image2d_t   dest,  // Intermediate dest image

                                     const int imgWidth ,                // Image width

                                     const int imgHeight)                // Image height

{

    const int y = get_global_id();   // row handled by this work-item

    if(y>=(imgHeight))

        return;

    // Filter taps, each expressed as <numerator>/256.0 (numerators missing here).
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

// r(xc,y):      .x channel of the source texel at (xc, y).
// r16(x,y):     float16 built from 16 r() reads along x starting at (x, y);
//               the offsets on the continuation line are missing here.
// w16(x,y,sum): writes lanes s0..sf of sum to 16 pixels along x starting at
//               (x, y); expands to 16 separate write_imagef statements.
// The macros are not #undef'd, so they stay defined for later code in the file.
#define r(xc,y) read_imagef( source, sampler,  (int2) (xc, y) ).x

#define r16(x,y) (float16)( r(x,y),r(x+1,y),r(x+2,y),r(x+3,y),r(x+4,y),r(x+5,y),r(x+6,y),r(x+7,y),\

                r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y),r(x+,y))

#define w16(x,y,sum) write_imagef( dest, (int2) (x, y), sum.s0 );write_imagef( dest, (int2) (x+1, y), sum.s1 );\

        write_imagef( dest, (int2) (x+, y), sum.s2 );write_imagef( dest, (int2) (x+, y), sum.s3 );\

        write_imagef( dest, (int2) (x+, y), sum.s4 );write_imagef( dest, (int2) (x+, y), sum.s5 );\

        write_imagef( dest, (int2) (x+, y), sum.s6 );write_imagef( dest, (int2) (x+, y), sum.s7 );\

        write_imagef( dest, (int2) (x+, y), sum.s8 );write_imagef( dest, (int2) (x+, y), sum.s9 );\

        write_imagef( dest, (int2) (x+, y), sum.sa );write_imagef( dest, (int2) (x+, y), sum.sb );\

        write_imagef( dest, (int2) (x+, y), sum.sc );write_imagef( dest, (int2) (x+, y), sum.sd );\

        write_imagef( dest, (int2) (x+, y), sum.se );write_imagef( dest, (int2) (x+, y), sum.sf );

    // First 16 samples of the row. The start column is a stripped negative
    // offset, so this presumably relies on clamp-to-edge addressing.
    float16 line0 =  r16(-,y);

    // j is advanced at the bottom of the body.
    for(int j=;j<imgWidth;){

        // Next 16 samples, directly following line0 (offsets missing here).
        float16 line1 =  r16(j-+,y);

        float16 temp0;

        float16 temp1;

        // Treating line0|line1 as one 32-sample run: temp0 lane i holds
        // sample i, temp1 lane i holds sample i+10 (the outermost tap pair).
        temp0 = line0;

        temp1.s0123 = line0.sabcd;

        temp1.s45 = line0.sef;

        temp1.s67 = line1.s01;

        temp1.s89abcdef = line1.s23456789;

        float16 sum =  ( temp0 + temp1 ) * m_nFilter[];

        // Each step below moves temp0 one sample right (lane f refilled from
        // line1) and temp1 one sample left (lane 0 refilled from line0), so
        // the pair closes in on the centre: distance 8, 6, 4, 2.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s0;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s9;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s1;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s8;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s2;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s7;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s3;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s6;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // Final shift: temp0 lane i now holds sample i+5, the centre tap,
        // which is added on its own.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s4;

        sum += ( temp0 ) * m_nFilter[];

        // Reuse the samples just read as the left half of the next iteration.
        line0 = line1;

        w16(j,y,sum );

        j+=;

    }

}

// Vertical pass of the Gaussian filter, float16 version.
// Each work-item handles one image column (index x) and produces 16 output
// pixels per loop iteration from two float16 registers of source samples.
//
//   source    - input image; only the .x channel of each texel is read
//   dest      - output image; one filtered value is written per pixel
//   imgWidth  - number of columns; work-items with x >= imgWidth return early
//   imgHeight - number of rows processed in the column
//
// NOTE(review): the integer literals of this listing (the attribute and
// get_global_id arguments, filter numerators and indices, read offsets, loop
// start and step) appear to have been lost during extraction, so the code
// does not compile as shown. The loop step is presumably 16, matching the 16
// pixels written per iteration -- TODO confirm against the original post.
__kernel  __attribute__((work_group_size_hint(,,)))

void ImageGaussianFilterVertical(__read_only image2d_t  source, // Source image

                                __write_only image2d_t   dest,   // Destination image

                                 const int imgWidth ,            // Image width

                                 const int imgHeight)            // Image height

{

    const int x = get_global_id();   // column handled by this work-item

    if(x>=(imgWidth))

        return;

    // Filter taps, each expressed as <numerator>/256.0 (numerators missing here).
    const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0};

// rv16(x,y):     float16 built from 16 r() reads along y starting at (x, y);
//                the offsets on the continuation line are missing here.
//                It uses the r() macro, which is #defined inside the
//                preceding horizontal kernel in this file, not in this one.
// wv16(x,y,sum): writes lanes s0..sf of sum to 16 pixels along y starting at
//                (x, y); expands to 16 separate write_imagef statements.
#define rv16(x,y) (float16)( r(x,y),r(x,y+1),r(x,y+2),r(x,y+3),r(x,y+4),r(x,y+5),r(x,y+6),r(x,y+7),\

                r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+))

#define wv16(x,y,sum) write_imagef( dest, (int2) (x,y), sum.s0 );write_imagef( dest, (int2) (x,y+1), sum.s1 );\

        write_imagef( dest, (int2) (x,y+), sum.s2 );write_imagef( dest, (int2) (x,y+), sum.s3 );\

        write_imagef( dest, (int2) (x,y+), sum.s4 );write_imagef( dest, (int2) (x,y+), sum.s5 );\

        write_imagef( dest, (int2) (x,y+), sum.s6 );write_imagef( dest, (int2) (x,y+), sum.s7 );\

        write_imagef( dest, (int2) (x,y+), sum.s8 );write_imagef( dest, (int2) (x,y+), sum.s9 );\

        write_imagef( dest, (int2) (x,y+), sum.sa );write_imagef( dest, (int2) (x,y+), sum.sb );\

        write_imagef( dest, (int2) (x,y+), sum.sc );write_imagef( dest, (int2) (x,y+), sum.sd );\

        write_imagef( dest, (int2) (x,y+), sum.se );write_imagef( dest, (int2) (x,y+), sum.sf );

    // First 16 samples of the column. The start row is a stripped negative
    // offset, so this presumably relies on clamp-to-edge addressing.
    float16 line0 =  rv16(x,-);

    // j is advanced at the bottom of the body.
    for(int j=;j<imgHeight;){

        // Next 16 samples, directly following line0 (offsets missing here).
        float16 line1 =  rv16(x,j-+);

        float16 temp0;

        float16 temp1;

        // Treating line0|line1 as one 32-sample run: temp0 lane i holds
        // sample i, temp1 lane i holds sample i+10 (the outermost tap pair).
        temp0 = line0;

        temp1.s0123 = line0.sabcd;

        temp1.s45 = line0.sef;

        temp1.s67 = line1.s01;

        temp1.s89abcdef = line1.s23456789;

        float16 sum =  ( temp0 + temp1 ) * m_nFilter[];

        // Each step below moves temp0 one sample down (lane f refilled from
        // line1) and temp1 one sample up (lane 0 refilled from line0), so
        // the pair closes in on the centre: distance 8, 6, 4, 2.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s0;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s9;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s1;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s8;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s2;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s7;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s3;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s6;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // Final shift: temp0 lane i now holds sample i+5, the centre tap,
        // which is added on its own.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s4;

        sum += ( temp0 ) * m_nFilter[];

        // Reuse the samples just read as the upper half of the next iteration.
        line0 = line1;

        wv16(x,j,sum );

        j+=;

    }

}

opencl gauss filter优化(二)的更多相关文章

opencl gauss filter优化(三)
1.根据前两次的最终结果: 使用普通buffer,Horizontal 5ms, Vertical 17 ms 使用image buffer:Horizontal 9.4ms, Vertical 6. ...
opencl gauss filter优化(一)
Platform: LG G3, Adreno 330 ,img size 3264x2448 C code neon GPU 300 60 29 单位:ms 1. 目前按如下行列分解的方式最快29m ...
Anisotropic gauss filter
最近一直在做版面分析,其中文本行检测方面,许多文章涉及到了Anigauss也就是各向异性高斯滤波. 顾名思义,简单的理解就是参数不同的二维高斯滤波. 在文章Fast Anisotropic Gauss ...
EMW 性能优化二之---并发配置
EMW 性能优化二之---并发配置在前一个日志中写到交货的异步更新,对于RFUI RF的前台操作会提升效率,异步更新不用等待更新状态的返回,启用更新队列的方式执行(SM13). 下面再补全性能相关的 ...
MySQL优化二（连接优化和缓存优化）
body { font-family: Helvetica, arial, sans-serif; font-size: 14px; line-height: 1.6; padding-top: 10 ...
mysql优化二之锁机制
mysql优化二之锁机制 mysql提供了锁机制和MVCC机制来保证并发操作的安全性,这里主要讨论锁机制, MVCC见下篇文章 mysql的锁按照锁粒度可分为行锁与表锁,按照操作类型划分可读锁和写锁 ...
Emacs 启动优化二三事
Emacs 启动优化二三事 */--> div.org-src-container { font-size: 85%; font-family: monospace; } p {font-siz ...
MySQL性能优化(二)：优化数据库的设计
原文:MySQL性能优化(二):优化数据库的设计版权声明:本文为博主原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明. 本文链接:https://blog.csdn.n ...
二维高斯滤波器（gauss filter）的实现
我们以一个二维矩阵表示二元高斯滤波器,显然此二维矩阵的具体形式仅于其形状(shape)有关: def gauss_filter(kernel_shape): 为实现二维高斯滤波器,需要首先定义二元高斯 ...

随机推荐

/etc/ld.so.conf 介绍
/etc/ld.so.conf 这个文件记录了编译时使用的动态链接库的路径,告诉链接器去哪个路径下寻找链接时需要用到的库,如果找不到,就会提示链接错误. 如果我们安装了第三方的库,而没有将它放在链接器 ...
java5、java6、java7、java8的新特性
Java5: 1.泛型 Generics: 引用泛型之后,允许指定集合里元素的类型,免去了强制类型转换,并且能在编译时刻进行类型检查的好处. Parameterized Type作为参数 ...
多态，虚拟方法，重写，接口，类库，委托，is，as运算符，泛型集合，万能变量
多态:简而言之就是龙生九子,各有不同有了继承,才有了多态 1.虚方法 virtual重写 override父类中的方法,在子类中并不适用,那么子类需要自主更改继承的方法或者是属性,那父类中加了vir ...
EXCEL中讲 10分10秒转换成610秒
前几天宝贝跟我打赌100W说我20天给她打电话不到10小时,我说绝对超过10小时了,但是由于宝贝的赖皮死活不承认,所以我被迫掉出通话记录,拿到通话记录我有点小郁闷,因为通话记录里的时间格式00分00秒 ...
JQuery多媒体插件jQuery Media Plugin使用详解
malsup jquery media plugin 该插件可以播放多种类型的多媒体文件包括:Flash, Quicktime, Windows Media Player, Real Player, ...
（1）定义一个接口CanFly，描述会飞的方法public void fly(); （2）分别定义类飞机和鸟，实现CanFly接口。（3）定义一个测试类，测试飞机和鸟，在main方法中创建飞机对象和鸟对象，再定义一个makeFly()方法，其中让会飞的事物飞。并在main方法中调用该方法，让飞机和鸟起飞。
package b; public interface CanFly { public void fly(); } package b; public class FeiJi implements C ...
Spring MVC 详解（一）
springmvc是spring的一个模块,提供web层解决方案(基于mvc设计架构),mvc是一个设计模式,在b/s系统的应用: spring 的架构 mvc设计模式 spring MVC的框架第 ...
ios获取摄像头与相册图片
iOS的一些设备上都安装了摄像头.现在绝大多数都有了. 在编程中,我们是用相应的东西来进行照相,录像等功能. 一.UIImagePickerController类 UIImagePickerCon ...
CentOS最小化安装后，增加GNOME桌面
背景:下载CentOS 7的安装包后,在虚拟机上安装. 上来就遇到一个问题:提示需要开启intel vt-x. 这个进入BIOS,在CPU的设置中开启即可. 然后怀着兴奋的心情,开始各种下一步的安装, ...
V-rep学习笔记：机器人逆运动学数值解法（The Pseudo Inverse Method）
There are two ways of using the Jacobian matrix to solve kinematics. One is to use the transpose of ...

opencl gauss filter优化(二)

opencl gauss filter优化(二)的更多相关文章

随机推荐

热门专题