opencl gauss filter优化(一)

Platform: LG G3, Adreno 330 ,img size 3264x2448

C code	neon	GPU
300	60	29

单位:ms

1.
目前按如下行列分解的方式最快，耗时29ms。
Horizontal kernel: globalWorksize[1] = {height+256-height%256};
Vertical kernel: globalWorksize2[1] = {width+256-width%256};

localWorksize2[] = {64}; localWorksize2 手动设为64时最快。

Profile的结果为: Horizontal kernel 的 wait time 有11ms, 实际 run time 18ms.

这个wait time是什么呢？注释掉Horizontal kernel中的 vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j); 则wait time只有0.x ms。并且localWorksize越小wait time越长，为1时达到200ms，16时20ms。
难道是写内存等待时间，没有足够的ALU指令隐藏访存延时？写内存后进入下一个for循环，马上又读内存，所以没有ALU指令隐藏这个延时。然而Horizontal kernel的profile结果实际run time只有0.x ms，所有时间基本都是在wait。(更正：注释掉vstore16后，sum的计算被优化掉了，0.x ms是读内存的时间)

// Horizontal (row) pass of a separable Gaussian filter over an 8-bit,
// single-channel image stored row-major with a stride of imgWidth bytes.
// Each work-item processes one whole row: row y is read from `source`
// and the filtered row is written to `dest`.
//
// NOTE(review): most integer literals in this listing are missing (for
// example `get_global_id()`, `= ;`, `{,,,,,,,,,,}`), apparently lost when the
// article was extracted. The code does not compile as shown; the original
// values (dimension index, shift amount, filter taps, vector step) must be
// restored from the author's source before use.
__kernel void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image

                            __global uchar* restrict  dest,  // Intermediate dest image

                                             const int imgWidth ,                // Image width

                                             const int imgHeight)

{

    // Row index handled by this work-item (dimension literal missing).
    const int y = get_global_id();

    // Work-items with an index at or past the last row do nothing.
    if(y>=(imgHeight))

        return;

    // Fixed-point parameters: right-shift amount, rounding term and the
    // integer filter taps (11 initializer slots, values missing).
    const uchar m_nRightShiftNum = ;

    const uchar Rounding = ( << (m_nRightShiftNum - ));

    const uchar  m_nFilter[] = {,,,,,,,,,,};

    // s and nStart are used below as the tap count and the half-window
    // offset respectively (values missing).
    const int s = ;

    const int nStart = ;

    const int nWidth = imgWidth;

    // Start of this work-item's input and output rows.
    __global const uchar* pInLine = source + y*nWidth;

    __global uchar* pOutLine = dest + y*nWidth;

    int j;

    // Left border: scalar path for the first nStart pixels. The left tap
    // index k1 is replaced by -k1 when it fails the comparison on the ternary
    // line (comparison literal missing; presumably a reflection at index 0).
    for(j = ; j < nStart; j ++)

    {

        ushort sum = ;

        // Taps are applied in symmetric pairs: k1 on the left, k2 on the right,
        // both weighted by m_nFilter[m].
        for (int m = ; m<s / ; m++)

        {

            int k1 = (j + m - nStart);

            k1 = k1< ? -k1 : k1;

            int k2 = (j + nStart - m );

            sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];

        }

        // Centre tap, then round and shift back to the 8-bit range.
        sum += pInLine[j] * m_nFilter[s / ];

        sum = (sum + Rounding) >> ;

        pOutLine[j] = (uchar)clamp(sum,(ushort),(ushort));

    }

    // Interior: vector path using vload16 / ushort16 / vstore16, so each
    // iteration produces 16 output pixels (loop step literal missing).
    for ( ; (j+)<= (nWidth - nStart); j+=)

    {

// NOTE(review): in this listing a blank line follows the trailing backslash
// of the #define below, so the macro body is empty as written; the
// `sum += ...` line is evidently meant to be the macro body. It loads 16
// pixels starting at offset m inside the window and accumulates them
// weighted by m_nFilter[m].
#define GAUSSIAN_LINE_NEON(m) \

sum += ( convert_ushort16(vload16(,pInLine+j-nStart+m))* m_nFilter[m] );

        // First tap initialises the accumulator; the ten macro invocations
        // below add the remaining taps (their arguments are missing).
        ushort16 sum =  (convert_ushort16(vload16(,pInLine+j-nStart)) * m_nFilter[]);

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        GAUSSIAN_LINE_NEON();

        sum += (ushort)Rounding;

        // Shift back to 8 bits and store 16 results at column j.
        vstore16(convert_uchar16(sum>>(ushort)),,pOutLine+j) ;

    }

    // Right border: scalar path for the remaining pixels. The right tap
    // index k2 is remapped when it reaches or passes nWidth (multiplier and
    // offset literals missing; presumably a reflection at the last column).
    for( ; j < nWidth; j ++)

    {

        ushort sum = ;

        for (int m = ; m<s / ; m++)

        {

            int k1 = (j + m - nStart);

            int k2 = (j + nStart - m );

            k2 = k2 >= nWidth ?  * nWidth -  - k2 : k2;

            sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];

        }

        sum += pInLine[j] * m_nFilter[s / ];

        sum = (sum + Rounding) >> m_nRightShiftNum;

        pOutLine[j] =  (uchar)clamp(sum,(ushort),(ushort));

    }

}

// Vertical (column) pass of the separable Gaussian filter. Each work-item
// processes one whole column x: it walks down the column keeping the
// current filter window in the private array `lines`, and writes one
// output pixel per row to `dest`.
//
// NOTE(review): most integer literals in this listing are missing (for
// example `get_global_id()`, `= ;`, `lines[]`), apparently lost when the
// article was extracted. The code does not compile as shown; restore the
// original values before use.
__kernel void ImageGaussianFilterVertical( __global uchar* restrict source,   // Intermediate image processed by ImageGaussianFilterHorizontal()

                        __global uchar* restrict dest,  // Final destination image

                        const int imgWidth,

                                         const int imgHeight

                                    )

{

    // Column index handled by this work-item (dimension literal missing).
    const int x = get_global_id();

    // Work-items with an index at or past the last column do nothing.
    if(x>=(imgWidth))

        return;

    const int x_offset = x;

    // s is used below as the window length and nStart as the index of the
    // centre element of `lines` (values missing).
    const int s = ;

    const int nStart = s / ;

    const int m_nRightShiftNum = ;

    const int Rounding = ( << (m_nRightShiftNum - ));

    const uchar  m_nFilter[] = {,,,,,,,,,,};

    int y;

//    mem_fence(CLK_LOCAL_MEM_FENCE);

    // Sliding window of column samples, widened to ushort (size missing).
    ushort lines[];

    // Prime the window: the centre slot gets the first row's pixel, then each
    // loaded row y is stored at nStart+y and copied to nStart-y, so the rows
    // above the image are filled with the mirrored rows below the top.
    lines[nStart] = (ushort)( source[x_offset]  );

    for(y=;y<=nStart;y++)

    {

        lines[nStart+y] = (ushort)( source[y*imgWidth+x_offset]  );

        lines[nStart-y] = lines[nStart+y];

    }

    // Main body: rows for which the next window sample can still be read from
    // `source` (loop bounds partly missing). y is advanced inside the body.
    for(y=;y<(imgHeight-nStart-);)

    {

        // Centre tap first, then the symmetric pairs via the macro below.
        ushort sum = lines[nStart] * m_nFilter[nStart];

// NOTE(review): in this listing a blank line follows the trailing backslash
// of the #define below, so the macro body is empty as written; the
// `sum += ...` line is evidently meant to be the macro body. It adds one
// symmetric pair of window entries weighted by m_nFilter[m].
#define    GaussianTwoLines(m) \

    sum += ( (lines[m] + lines[s--m])*m_nFilter[m] );

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        sum += (ushort)Rounding;

        // Shift back to 8 bits and write the pixel for row y of this column.
        dest[y*imgWidth+x_offset]  = (uchar)(sum>>(ushort));

        y++;

        // Slide the window down by one row and load the new bottom sample
        // from row y+nStart.
        for(int i = ; i<s-; i++) lines[i] = lines[i+];

        lines[s-] =  (ushort)( source[(y+nStart)*imgWidth+x_offset]  );

    }

    // Bottom border: same filtering, but the new bottom sample is taken from
    // an entry already in the window instead of from `source` (index literals
    // missing; presumably a reflection about the last row).
    for(y=imgHeight-nStart-;y<(imgHeight-);)

    {

        ushort sum = lines[nStart] * m_nFilter[nStart];

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        GaussianTwoLines()

        sum += (ushort)Rounding;

        dest[y*imgWidth+x_offset]  = (uchar)(sum>>(ushort));

        y++;

        for(int i = ; i<s-; i++) {

            lines[i] = lines[i+];

        }

        lines[s-] = lines[(imgHeight-y)*-] ; //

    }

    //last y=imgHeight-1

    // The final row is filtered with the window left by the loop above; no
    // further window update is needed.
    ushort sum = lines[nStart] * m_nFilter[nStart];

    GaussianTwoLines()

    GaussianTwoLines()

    GaussianTwoLines()

    GaussianTwoLines()

    GaussianTwoLines()

    sum += (ushort)Rounding;

    dest[y*imgWidth+x_offset]  = (uchar)(sum>>(ushort));

}

kernel

2.Horizontal kernel改进，预先load 2x16个所需的pixel,计算时从中提取,这样每次循环只需读一次内存。需要26ms,wait time 8ms.

    // Fragment: replacement for the interior loop of the horizontal kernel.
    // Instead of one vload16 per tap, two 16-pixel vectors (line0, line1) are
    // kept in registers and every tap is formed by swizzling them, so each
    // iteration issues a single new global read.
    //
    // NOTE(review): integer literals (vload16 offsets, loop step, filter
    // indices, shift amount) are missing from this listing. The fragment
    // relies on pInLine, pOutLine, j, nStart, nWidth, m_nFilter and Rounding
    // from the enclosing kernel.
    //
    // Notation below: p[k] is element k of the concatenation line0 followed
    // by line1, i.e. the 32 pixels starting at pInLine+j-nStart.
    ushort16 line0 =  convert_ushort16(vload16(,pInLine+j-nStart));

    for ( ; (j+)<= (nWidth - nStart); j+=)

    {

        // The only global read of the iteration: the next 16 pixels.
        ushort16 line1 =  convert_ushort16(vload16(,pInLine+j-nStart+));

        ushort16 temp0;

        ushort16 temp1;

        // temp0[i] = p[i] and temp1[i] = p[i+10]: the outermost tap pair.
        temp0 = line0;

        temp1.s0123 = line0.sabcd;

        temp1.s45 = line0.sef;

        temp1.s67 = line1.s01;

        temp1.s89abcdef = line1.s23456789;

        ushort16 sum =  ( temp0 + temp1 ) * m_nFilter[];

        // Move one tap inwards: temp0 shifts left (temp0[i] = p[i+1]) and
        // temp1 shifts right (temp1[i] = p[i+9]).
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s0;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s9;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+2], temp1[i] = p[i+8].
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s1;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s8;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+3], temp1[i] = p[i+7].
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s2;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s7;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+4], temp1[i] = p[i+6].
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s3;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s6;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+5]: the centre tap, added on its own.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s4;

        sum += ( temp0 ) * m_nFilter[];

        sum += (ushort)Rounding;

        // The vector just loaded becomes the left half of the next window.
        line0 = line1;

        // Shift back to 8 bits and store 16 results at column j.
        vstore16(convert_uchar16(sum>>(ushort)),,pOutLine+j) ;

    }

3.不计算，只读写内存测试。那么wait time 3.2 ms,run time 18.2 ms.说明Horizontal kernel 耗时的极限也需3.2ms. 但是只是注释掉vstore16，还保留了读和计算，反而wait time还只有0.x ms，这又是为何？是读几乎没有wait,3.2ms都是写的wait time? (更正：注释掉vstore16后,sum的计算被优化掉了，0.x ms是读内存的时间)

a.再次测试，只有读wait time 0.xms ,只有写wait time 3.2ms.写比读的周期长.

// Fragment: memory-only variant of the horizontal interior loop, used to
// measure read/write cost without any filter arithmetic. Each iteration
// loads 16 pixels and stores 16 zero bytes at column j. It relies on pInLine,
// pOutLine, j, nStart and nWidth from the enclosing kernel.
for ( ; (j+16)<= (nWidth - nStart); j+=16)

{

// The loaded value is never used; the load is kept only for the measurement.
ushort16 line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16));

// Stores zeros rather than a computed result.
vstore16(0,0,pOutLine+j) ;

}

b.另外发现使用*((__global uint4*)(pOutLine+j)) = as_uint4(result);比vstore16快，wait time 2.5ms.高通 80-N8592-1_L_OpenCL_Programming_Guide 中提到：

Vectorized load/store of a larger data type is more optimal than a small data type; e.g., a load of uint2* is more optimal than uchar8* .

For optimal SP to L2 bandwidth performance, align read access to a 32-bit address and write access to a 128-bit address.

c.原来写的内存没有对齐，使用*((__global uint4*)(pOutLine+j-5)) = as_uint4(result);wait time 1.9ms.

d.最后加上sum计算，采用的Horizontal kernel如下。localWorksize[] = {64} 时耗时最少，需要23ms，wait time 4.7ms；localWorksize = 128 时，wait time 6ms。

并且使用__attribute__((work_group_size_hint(64,1,1))) ,耗时22ms.

// Final version of the horizontal (row) pass. Each work-item filters one
// row. Compared with the first version it (a) keeps two 16-pixel vectors in
// registers and builds every tap by swizzling, and (b) writes each result
// vector through a uint4 pointer at an address shifted back from column j,
// carrying the tail of the previous result vector forward in `pre_result`.
//
// NOTE(review): most integer literals in this listing are missing (for
// example the work_group_size_hint arguments, `get_global_id()`, `temp[]`,
// the `pOutLine+j-` offsets), apparently lost when the article was
// extracted. The code does not compile as shown; restore the original
// values before use.
// NOTE(review): the uint4/uint pointer stores assume the shifted address is
// suitably aligned, which depends on y*nWidth and the buffer base; confirm.
__kernel __attribute__((work_group_size_hint(,,)))

void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image

                        __global uchar* restrict  dest,  // Intermediate dest image

                                             const int imgWidth ,                // Image width

                                             const int imgHeight)

{

    // Row index handled by this work-item (dimension literal missing).
    const int y = get_global_id();

    // Work-items with an index at or past the last row do nothing.
    if(y>=(imgHeight))

        return;

    // Fixed-point parameters: right-shift amount, rounding term and the
    // integer filter taps (11 initializer slots, values missing).
    const uchar m_nRightShiftNum = ;

    const uchar Rounding = ( << (m_nRightShiftNum - ));

    const uchar  m_nFilter[] = {,,,,,,,,,,};

    const int s = ;

    const int nStart = ;

    const int nWidth = imgWidth;

    // Start of this work-item's input and output rows.
    __global const uchar* pInLine = source + y*nWidth;

    __global uchar* pOutLine = dest + y*nWidth;

    int j;

    // Holds the left-border results so they can be emitted together with
    // the first vector store instead of as separate byte writes.
    uchar temp[];

    // Left border: scalar path for the first nStart pixels. k1 is replaced
    // by -k1 when it fails the comparison on the ternary line (comparison
    // literal missing; presumably a reflection at index 0).
    for(j = ; j < nStart; j ++)

    {

        ushort sum = ;

        for (int m = ; m<s / ; m++)

        {

            int k1 = (j + m - nStart);

            k1 = k1< ? -k1 : k1;

            int k2 = (j + nStart - m );

            sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];

        }

        sum += pInLine[j] * m_nFilter[s / ];

        sum = (sum + Rounding) >> ;

        temp[j] = (uchar)clamp(sum,(ushort),(ushort));

    }

    // Seed the top five lanes (b..f) of pre_result with the border results;
    // the loop below moves those lanes into lanes 0..4 of each stored vector.
    uchar16 result,pre_result;

    pre_result.sbcde = (uchar4)(temp[],temp[],temp[],temp[]);

    pre_result.sf = temp[];

    // Notation below: p[k] is element k of the concatenation line0 followed
    // by line1, i.e. the 32 pixels starting at pInLine+j-nStart.
    ushort16 line0 =  convert_ushort16(vload16(,pInLine+j-nStart));

    for ( ; (j+)<= (nWidth - nStart); j+=)

    {

        //prefetch(pInLine+j-nStart,32); // made no difference

        // The only global read of the iteration: the next 16 pixels.
        ushort16 line1 =  convert_ushort16(vload16(,pInLine+j-nStart+));

        ushort16 temp0;

        ushort16 temp1;

        // temp0[i] = p[i] and temp1[i] = p[i+10]: the outermost tap pair.
        temp0 = line0;

        temp1.s0123 = line0.sabcd;

        temp1.s45 = line0.sef;

        temp1.s67 = line1.s01;

        temp1.s89abcdef = line1.s23456789;

        ushort16 sum =  ( temp0 + temp1 ) * m_nFilter[];

        // Move one tap inwards: temp0 shifts left (temp0[i] = p[i+1]) and
        // temp1 shifts right (temp1[i] = p[i+9]).
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s0;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s9;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+2], temp1[i] = p[i+8].
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s1;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s8;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+3], temp1[i] = p[i+7].
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s2;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s7;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+4], temp1[i] = p[i+6].
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s3;

        temp1.s0123456789abcdef = temp1.s00123456789abcde;

        temp1.s0 = line0.s6;

        sum += ( temp0 +  temp1 ) * m_nFilter[];

        // temp0[i] = p[i+5]: the centre tap, added on its own.
        temp0.s0123456789abcdef = temp0.s123456789abcdeff;

        temp0.sf = line1.s4;

        sum += ( temp0 ) * m_nFilter[];

        sum += (ushort)Rounding;

        // The vector just loaded becomes the left half of the next window.
        line0 = line1;

        // Assemble the vector to store: lanes 0..4 come from the last five
        // lanes of the previous result, lanes 5..f from the first eleven lanes
        // of the result just computed.
        result.s0123 = pre_result.sbcde;

        result.s4 = pre_result.sf;

        pre_result = convert_uchar16(sum>>(ushort)) ;

        result.s5 = pre_result.s0;

        result.s67 = pre_result.s12;

        result.s89abcdef = pre_result.s3456789a;

        // Single 128-bit store at an address shifted back from column j
        // (offset literal missing).
        *( (__global uint4*)(pOutLine+j-) ) =  (as_uint4)(result) ;

    }

    // Flush the five lanes still held in pre_result: four bytes through a
    // uint store and the last one as a single byte (offset literals missing).
    *( (__global uint*)(pOutLine+j-) ) = (as_uint)(pre_result.sbcde);//last 5 bytes

    pOutLine[j-] = pre_result.sf;

    // Right border: scalar path for the remaining pixels. k2 is remapped
    // when it reaches or passes nWidth (multiplier and offset literals
    // missing; presumably a reflection at the last column).
    for( ; j < nWidth; j ++)

    {

        ushort sum = ;

        for (int m = ; m<s / ; m++)

        {

            int k1 = (j + m - nStart);

            int k2 = (j + nStart - m );

            k2 = k2 >= nWidth ?  * nWidth -  - k2 : k2;

            sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];

        }

        sum += pInLine[j] * m_nFilter[s / ];

        sum = (sum + Rounding) >> m_nRightShiftNum;

        pOutLine[j] =  (uchar)clamp(sum,(ushort),(ushort));

    }

}

opencl gauss filter优化(一)的更多相关文章

opencl gauss filter优化(三)
1.根据前两次的最终结果: 使用普通buffer,Horizontal 5ms, Vertical 17 ms 使用image buffer:Horizontal 9.4ms, Vertical 6. ...
opencl gauss filter优化(二)
1.buffer使用image的方式:Horizontal 与 Vertical 算法一样, 共需30ms,wait time 19ms. const sampler_t sampler = CLK_ ...
Anisotropic gauss filter
最近一直在做版面分析,其中文本行检测方面,许多文章涉及到了Anigauss也就是各向异性高斯滤波. 顾名思义,简单的理解就是参数不同的二维高斯滤波. 在文章Fast Anisotropic Gauss ...
OpenCL Kernel设计优化
使用Intel® FPGA SDK for OpenCL™ 离线编译器,不需要调整kernel代码便可以将其最佳的适应于固定的硬件设备,而是离线编译器会根据kernel的要求自适应调整硬件的结构. 通 ...
FILTER优化
explain plan for select a.* from fxqd_list_20131115_new_100 a where (acct_no, oper_no, seqno, trans_ ...
二维高斯滤波器（gauss filter）的实现
我们以一个二维矩阵表示二元高斯滤波器,显然此二维矩阵的具体形式仅于其形状(shape)有关: def gauss_filter(kernel_shape): 为实现二维高斯滤波器,需要首先定义二元高斯 ...
一次性能优化将filter转换
有一条SQL性能有问题,在运行计划中发现filter.遇到它要小心了,类似于nestloop.我曾经的blog对它有研究探索运行计划中filter的原理.用exists极易引起filter. 优化前: ...
安卓平台ARM Mali OpenCL例子-灰度转换（转）
手头一块RK3288的板子,在板子上测试了一张1080p的彩色图灰度转换的OpenCL例子.OpenCL没有任何优化.例子请移步这里. 该例子是编译成安卓平台下的可执行程序. 进入jni文件夹,进行如 ...
OpenCV、OpenCL、OpenGL、OpenPCL
对于几个开源库的总结,作为标记,以前看过,现在开始重视起来!更详细资料请移步开源中国社区! 涉及:OpenCV,OpenCL,OpenGL,OpenPCL 截止到目前: OpenGL的最新版本为4. ...

随机推荐

递推，动态规划(DP)，字符串处理，最佳加法表达式
看了一些资料,竟然发现连百度文库也有错误的地方,在这里吐槽一下题目大意:http://wenku.baidu.com/link?url=DrUNNm19IqpPNZjKPX4Jg6shJiK_Nho6 ...
1. WP8.1学习笔记
数据绑定含义:将对象绑定到控件上 2.基本名词控件:绑定目标对象:绑定源(数据源) 控件与对象属性的联系:路径如何绑定创建对象,设置控件在控件需要数据绑定的地方使用拓展语法 <But ...
精通D3.js学习笔记（2）比例尺和坐标
1.线性比例尺 d3.scale.linear() 创建一个线性比例尺 .domain([0,500]) 定义域 .range([0,1000]) 值域 l ...
D3D 模板缓存的创建过程
下面是我对模板缓存创建的理解: 1. 模板缓存是和深度缓存一起被创建的,将深度缓存的一部分作为模板缓存使用. 深度缓存和模板缓存是在Direct3D初始化时创建的,D3DPRESENT_PARAMET ...
压缩和解压缩gz包
gz是Linux和OSX中常见的压缩文件格式,下面是用java压缩和解压缩gz包的例子 public class GZIPcompress { public static void FileCompr ...
用ubuntu下载电影：磁力链接，torrent，迅雷链接
用ubuntu下载电影:磁力链接,torrent,迅雷链接操作系统:Ubuntu 14.04 64位需要软件:Ktorent, Amule 安装软件: sudo apt-get install k ...
Lucky String
Lucky String -- 微软笔试标签(空格分隔): 算法 A string s is LUCKY if and only if the number of different charact ...
基础！winForm客户端最常用的几个基本属性
客户端应用程序 - 是需要安装在用户电脑上才可以使用的程序特点:不需要联网也可以打开使用部分功能但是现在的情况是许多功能依然需要互联网的支持代码部分在用户电脑上执行 WinForm常用窗体属性: 布 ...
ABAP WRITE、WRITE TO、FORMAT语句
声明:原创作品,转载时请注明文章来自SAP师太技术博客( 博/客/园www.cnblogs.com):www.cnblogs.com/jiangzhengjun,并以超链接形式标明文章原始出处,否则将 ...
Android 测试工具
有时候会发现给手机烧入的信息里少了某一些文件,比如一个图标,或者一个mp3文件之类的等等,为此做了一个小工具检查指定手机里面是否包含相应的文件. 通过程序执行手机的命令来操作手机,感觉还挺有意思的. ...

opencl gauss filter优化(一)

opencl gauss filter优化(一)的更多相关文章

随机推荐

热门专题