1.根据前两次的最终结果:

使用普通buffer,Horizontal 5ms, Vertical 17 ms

使用image buffer:Horizontal 9.4ms, Vertical 6.4 ms

那么使用 Horizontal普通buffer,Vertical image buffer 组合方式的话,是不是时间最少?只是Intermediate image仍使用image对象,Horizontal kernel中的写操作需要改变。

结果: Horizontal 的最大local_work_size只能是32, Horizontal 增至8ms, Vertical 6.4ms

// Image sampler: unnormalized (pixel) coordinates, clamp-to-edge addressing, nearest-neighbour filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// r(xc,y): read pixel (xc, y) through `sampler` from the image named `source` in the
// expanding scope and return its .x component.
#define r(xc,y) read_imagef( source, sampler,  (int2) (xc, y) ).x

// w16(x,y,sum): write the 16 components of `sum` (s0..sf) to the image named `dest` in the
// expanding scope, one write_imagef call per component, all on row y.
// NOTE(review): only the first two column offsets (x and x+1) survive in this listing; the
// remaining calls read `x+` with the literal missing - presumably x+2 .. x+15, confirm against
// the original source. The kernel qualifier at the end of the last line has been folded onto
// the macro's final line and presumably belongs to the function that follows.
#define w16(x,y,sum) write_imagef( dest, (int2) (x, y), sum.s0 );write_imagef( dest, (int2) (x+1, y), sum.s1 );\
write_imagef( dest, (int2) (x+, y), sum.s2 );write_imagef( dest, (int2) (x+, y), sum.s3 );\
write_imagef( dest, (int2) (x+, y), sum.s4 );write_imagef( dest, (int2) (x+, y), sum.s5 );\
write_imagef( dest, (int2) (x+, y), sum.s6 );write_imagef( dest, (int2) (x+, y), sum.s7 );\
write_imagef( dest, (int2) (x+, y), sum.s8 );write_imagef( dest, (int2) (x+, y), sum.s9 );\
write_imagef( dest, (int2) (x+, y), sum.sa );write_imagef( dest, (int2) (x+, y), sum.sb );\
write_imagef( dest, (int2) (x+, y), sum.sc );write_imagef( dest, (int2) (x+, y), sum.sd );\
write_imagef( dest, (int2) (x+, y), sum.se );write_imagef( dest, (int2) (x+, y), sum.sf ); __kernel __attribute__((work_group_size_hint(,,)))
// Horizontal filter pass, first variant: reads one row of the byte buffer `source` and
// writes the filtered row into the image `dest` as floats.
//   source    - input pixels, one uchar per pixel, rows of imgWidth bytes (row y starts at source + y*imgWidth)
//   dest      - write-only image receiving the intermediate result
//   imgWidth  - row length in pixels
//   imgHeight - number of rows; work-items whose row index is >= imgHeight return immediately
// NOTE(review): numeric literals are missing throughout this listing (get_global_id(), the
// filter table, nStart, loop bounds and steps, the divisor after `255.0*`), so it cannot
// compile as shown - restore them from the original before use.
void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
__write_only image2d_t dest, // Intermediate dest image
const int imgWidth , // Image width
const int imgHeight)
{
// Row index handled by this work-item.
const int y = get_global_id();
if(y>=(imgHeight))
return;
// m_nRightShiftNum and Rounding are referenced only by the commented-out integer rounding below.
const uchar m_nRightShiftNum = ;
const uchar Rounding = ( << (m_nRightShiftNum - ));
const uchar m_nFilter[] = {,,,,,,,,,,}; const int s = ;
const int nStart = ;
const int nWidth = imgWidth; __global const uchar* pInLine = source + y*nWidth; int j;
// Left border: scalar path for columns j < nStart. A tap index k1 below the (missing,
// presumably 0) threshold is replaced by -k1, i.e. reflected back into the row.
for(j = ; j < nStart; j ++)
{
ushort sum = ; for (int m = ; m<s / ; m++)
{
int k1 = (j + m - nStart);
k1 = k1< ? -k1 : k1; int k2 = (j + nStart - m );
sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
}
sum += pInLine[j] * m_nFilter[s / ];
//sum = (sum + Rounding) >> 8;
write_imagef( dest, (int2) (j, y), convert_float(sum)/(255.0*) );
} ushort16 line0 = convert_ushort16(vload16(,pInLine+j-nStart));
// Interior: 16 output columns per iteration, built from two consecutive 16-wide loads
// (line0 is carried over from the previous iteration, line1 is loaded fresh).
for ( ; (j+)<= (nWidth - nStart); j+=)
{
ushort16 line1 = convert_ushort16(vload16(,pInLine+j-nStart+)); ushort16 temp0;
ushort16 temp1;
// temp0 is the window starting at line0 lane 0; temp1 is the window starting at
// line0 lane a (10) and continuing into line1.
temp0 = line0;
temp1.s0123 = line0.sabcd;
temp1.s45 = line0.sef;
temp1.s67 = line1.s01;
temp1.s89abcdef = line1.s23456789;
ushort16 sum = ( temp0 + temp1 ) * m_nFilter[];
// Each following step slides temp0 one lane forward (lane f refilled from line1) and
// temp1 one lane backward (lane 0 refilled from line0), so the two windows move towards
// each other; their sum is scaled by one m_nFilter entry (indices missing in this listing).
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s0;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s9;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s1;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s8;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s2;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s7;
sum += ( temp0 + temp1 ) * m_nFilter[];
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s3;
temp1.s0123456789abcdef = temp1.s00123456789abcde;
temp1.s0 = line0.s6;
sum += ( temp0 + temp1 ) * m_nFilter[];
// Final step uses temp0 alone (the two windows have met), then the result is converted
// to float, scaled and written out with w16.
temp0.s0123456789abcdef = temp0.s123456789abcdeff;
temp0.sf = line1.s4;
sum += ( temp0 ) * m_nFilter[]; line0 = line1; float16 sum2 = (convert_float16(sum))/(255.0*);
w16(j,y,sum2 );
} for( ; j < nWidth; j ++)
{
// Right border: scalar path for the remaining columns. A tap index k2 at or beyond
// nWidth is folded back into the row (the constants of the fold expression are missing).
ushort sum = ; for (int m = ; m<s / ; m++)
{
int k1 = (j + m - nStart); int k2 = (j + nStart - m );
k2 = k2 >= nWidth ? * nWidth - - k2 : k2;
sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
}
sum += pInLine[j] * m_nFilter[s / ];
//sum = (sum + Rounding) >> m_nRightShiftNum;
write_imagef( dest, (int2) (j, y), convert_float(sum)/(255.0*) );
} }

2.使用各种办法,最终也只能降到13.7ms,Horizontal 7.5, Vertical 6ms,最终代码如下.

更新:H和V都去掉 __attribute__ 属性,local_work_size 都设置为 NULL,让 OpenCL 自己选择,H 的最大 local_work_size 又变回了64,总时间13ms。这样改的原因是:原来的设置在 LG G4(Adreno 418)上运行却需要40ms——Adreno 418 上 local_work_size 最大可以是1024,却被强制设成了32。

a.使用mad指令做sum乘加,结果有误差,时间也略增。fma 的中间乘积按无限精度计算、只在最后舍入一次;mad 是快速方法,结果是近似值。

b.使用 pInTemp 缓存每行的前 16 个字节(first 16 bytes),避免重复读取,有0.x ms的优势

c.边界使用了mirror repeat

// Image sampler: unnormalized (pixel) coordinates, clamp-to-edge addressing, nearest-neighbour filtering.
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// r(xc,y): read pixel (xc, y) through `sampler` from the image named `source` in the
// expanding scope and return its .x component.
#define r(xc,y) read_imagef( source, sampler,  (int2) (xc, y) ).x

// w16(x, y, sum): store the 16 components of the vector `sum` (s0 .. sf) into the image
// named `dest` in the expanding scope, at columns x .. x+15 of row y, one write_imagef
// call per pixel (component sN goes to column x+N).
// The column offsets for s2 .. sf had been lost from the listing (`x+` with no literal);
// they are restored here following the intact s0 -> x, s1 -> x+1 pattern.
// The body is wrapped in do { } while (0) so that `w16(...);` expands to exactly one
// statement, and every argument is parenthesised so expressions can be passed safely.
// Note that x, y and sum are each evaluated multiple times.
#define w16(x,y,sum) do { \
    write_imagef( dest, (int2) ((x),      (y)), (sum).s0 ); \
    write_imagef( dest, (int2) ((x) + 1,  (y)), (sum).s1 ); \
    write_imagef( dest, (int2) ((x) + 2,  (y)), (sum).s2 ); \
    write_imagef( dest, (int2) ((x) + 3,  (y)), (sum).s3 ); \
    write_imagef( dest, (int2) ((x) + 4,  (y)), (sum).s4 ); \
    write_imagef( dest, (int2) ((x) + 5,  (y)), (sum).s5 ); \
    write_imagef( dest, (int2) ((x) + 6,  (y)), (sum).s6 ); \
    write_imagef( dest, (int2) ((x) + 7,  (y)), (sum).s7 ); \
    write_imagef( dest, (int2) ((x) + 8,  (y)), (sum).s8 ); \
    write_imagef( dest, (int2) ((x) + 9,  (y)), (sum).s9 ); \
    write_imagef( dest, (int2) ((x) + 10, (y)), (sum).sa ); \
    write_imagef( dest, (int2) ((x) + 11, (y)), (sum).sb ); \
    write_imagef( dest, (int2) ((x) + 12, (y)), (sum).sc ); \
    write_imagef( dest, (int2) ((x) + 13, (y)), (sum).sd ); \
    write_imagef( dest, (int2) ((x) + 14, (y)), (sum).se ); \
    write_imagef( dest, (int2) ((x) + 15, (y)), (sum).sf ); \
} while (0)
// line0 starts from j-5, line1 from j-5+16
// GaussianShift16: filter body shared by the kernels below. It expects temp0, temp1, sum,
// line0, line1 and m_nFilter to exist in the expanding scope and leaves the result in `sum`.
// temp0 starts as line0; temp1 starts at line0 lane a (10) and continues into line1. Each
// step slides temp0 one lane forward (lane f refilled from line1) and temp1 one lane
// backward (lane 0 refilled from line0), adds the pair and scales it by one m_nFilter entry;
// the last step uses temp0 alone.
// NOTE(review): the m_nFilter indices are missing from this listing, and the kernel
// qualifier that ends the macro's last line presumably belongs to the function that follows.
#define GaussianShift16 {\
temp0 = line0;\
temp1.s0123 = line0.sabcd;\
temp1.s45 = line0.sef;\
temp1.s67 = line1.s01;\
temp1.s89abcdef = line1.s23456789;\
sum = ( temp0 + temp1 ) * m_nFilter[];\
temp0.s0123456789abcdef = temp0.s123456789abcdeff;\
temp0.sf = line1.s0;\
temp1.s0123456789abcdef = temp1.s00123456789abcde;\
temp1.s0 = line0.s9;\
sum += ( temp0 + temp1 ) * m_nFilter[];\
temp0.s0123456789abcdef = temp0.s123456789abcdeff;\
temp0.sf = line1.s1;\
temp1.s0123456789abcdef = temp1.s00123456789abcde;\
temp1.s0 = line0.s8;\
sum += ( temp0 + temp1 ) * m_nFilter[];\
temp0.s0123456789abcdef = temp0.s123456789abcdeff;\
temp0.sf = line1.s2;\
temp1.s0123456789abcdef = temp1.s00123456789abcde;\
temp1.s0 = line0.s7;\
sum += ( temp0 + temp1 ) * m_nFilter[];\
temp0.s0123456789abcdef = temp0.s123456789abcdeff;\
temp0.sf = line1.s3;\
temp1.s0123456789abcdef = temp1.s00123456789abcde;\
temp1.s0 = line0.s6;\
sum += ( temp0 + temp1 ) * m_nFilter[];\
temp0.s0123456789abcdef = temp0.s123456789abcdeff;\
temp0.sf = line1.s4;\
sum += ( temp0 ) * m_nFilter[];} __kernel __attribute__((work_group_size_hint(,,)))
// Horizontal filter pass, final variant: reads one row of the byte buffer `source` and
// writes the filtered row into the image `dest` as floats.
//   source    - input pixels, one uchar per pixel, rows of imgWidth bytes
//   dest      - write-only image receiving the intermediate result
//   imgWidth  - row length in pixels
//   imgHeight - number of rows; work-items whose row index is >= imgHeight return immediately
// NOTE(review): numeric literals are missing throughout this listing (get_global_id(), the
// filter table, nStart, array size, loop bounds and steps, the divisor after `255.0*`), so
// it cannot compile as shown - restore them from the original before use.
void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
__write_only image2d_t dest, // Intermediate dest image
const int imgWidth , // Image width
const int imgHeight)
{
// Row index handled by this work-item.
const int y = get_global_id();
if(y>=(imgHeight))
return;
const uchar m_nFilter[] = {,,,,,,,,,,}; const int s = ;
const int nStart = ; __global const uchar* pInLine = source + y*imgWidth; int j;
uchar pInTemp[];
*( (uint4*)(pInTemp) ) = *((__global uint4*)(pInLine)) ;// first 16 bytes of the row, copied once into the local array pInTemp
// Left border: scalar path for columns j < nStart, reading from the cached pInTemp copy.
// A tap index k1 below the (missing, presumably 0) threshold is replaced by -k1.
for(j = ; j < nStart; j ++)
{
ushort sum = ;
for (int m = ; m<s / ; m++)
{
int k1 = (j + m - nStart);
k1 = k1< ? -k1 : k1; int k2 = (j + nStart - m );
sum += (pInTemp[k1] + pInTemp[k2])*m_nFilter[m];
}
sum += pInTemp[j] * m_nFilter[s / ];
write_imagef( dest, (int2) (j, y), convert_float(sum)/(255.0*) );
} ushort16 temp0;
ushort16 temp1;
ushort16 sum;
ushort16 line0,line1;
// The first vector window reuses the cached bytes instead of reloading from global memory.
line0 = convert_ushort16(*((uchar16*)pInTemp));
// Interior: each iteration loads the next 16 bytes into line1, filters, and writes 16 pixels.
// NOTE(review): on the load line below, the GaussianShift16 invocation sits after `//` and is
// therefore commented out as written - it presumably belongs on its own line; confirm
// against the original.
for ( ; j< (imgWidth-); j+=)
{
line1 = convert_ushort16(vload16(,pInLine+j-nStart+));//convert_ushort16( as_uchar16(*((__global uint4*)(pInLine+j-nStart+16))) ) ; GaussianShift16
line0 = line1; float16 sum2 = (convert_float16(sum))/(255.0*);
w16(j,y,sum2 );
} {
// Last 16 pixels: j is reset relative to imgWidth, so some pixels may be calculated again.
j = imgWidth-;
line0 = convert_ushort16(vload16(,pInLine+j-nStart));
// Mirror-repeat read: line1 is assembled from the final bytes of the row followed by
// already-loaded lanes in reverse order, instead of reading past the end of the row.
line1.s0123 = convert_ushort4( vload4(,pInLine+imgWidth-nStart) );
line1.s4567 = (ushort4)( pInLine[imgWidth-],line1.s3,line1.s21 ) ;
line1.s89 = (ushort2)(line1.s0,line0.sf); GaussianShift16
float16 sum2 = (convert_float16(sum))/(255.0*);
w16(j,y,sum2 );
}
} __kernel __attribute__((work_group_size_hint(,,)))
// Vertical filter pass: reads one column of the image `source` through the r() macro and
// writes the filtered column into the image `dest`.
//   source    - read-only input image (the intermediate result of the horizontal pass, per the parameter comments there)
//   dest      - write-only output image
//   imgWidth  - image width; work-items whose column index is >= imgWidth return immediately
//   imgHeight - image height, used for the loop bound and the bottom-edge handling
// NOTE(review): numeric literals are missing throughout this listing (get_global_id(), the
// filter numerators, row offsets in rv16/wv16 beyond the first few, loop bounds and steps),
// and the rv16/wv16 #define directives have been folded onto the ends of other lines, so it
// cannot compile as shown - restore from the original before use.
void ImageGaussianFilterVertical(__read_only image2d_t source, // Source image
__write_only image2d_t dest,
const int imgWidth ,
const int imgHeight)
{
// Column index handled by this work-item.
const int x = get_global_id();
if(x>=(imgWidth))
return;
// The next lines define the float filter table, rv16 (gather 16 consecutive rows of column x
// into a float16 via r()) and wv16 (write 16 components of `sum` to consecutive rows of
// column x).
const float m_nFilter[] = {/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0,/256.0}; #define rv16(x,y) (float16)( r(x,y),r(x,y+1),r(x,y+2),r(x,y+3),r(x,y+4),r(x,y+5),r(x,y+6),r(x,y+7),\
r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+),r(x,y+)) #define wv16(x,y,sum) write_imagef( dest, (int2) (x,y), sum.s0 );write_imagef( dest, (int2) (x,y+1), sum.s1 );\
write_imagef( dest, (int2) (x,y+), sum.s2 );write_imagef( dest, (int2) (x,y+), sum.s3 );\
write_imagef( dest, (int2) (x,y+), sum.s4 );write_imagef( dest, (int2) (x,y+), sum.s5 );\
write_imagef( dest, (int2) (x,y+), sum.s6 );write_imagef( dest, (int2) (x,y+), sum.s7 );\
write_imagef( dest, (int2) (x,y+), sum.s8 );write_imagef( dest, (int2) (x,y+), sum.s9 );\
write_imagef( dest, (int2) (x,y+), sum.sa );write_imagef( dest, (int2) (x,y+), sum.sb );\
write_imagef( dest, (int2) (x,y+), sum.sc );write_imagef( dest, (int2) (x,y+), sum.sd );\
write_imagef( dest, (int2) (x,y+), sum.se );write_imagef( dest, (int2) (x,y+), sum.sf ); float16 temp0;
float16 temp1;
float16 sum;
float16 line0,line1; line0 = rv16(x,-);
// Top edge: the first window starts at a negative row; its leading lanes are overwritten
// with later lanes in reverse order (lanes 0..3 <- a,9,8,7 and lane 4 <- 6).
line0.s0123 = line0.sa987;// mirror repeat
line0.s4 = line0.s6;
int j;
// Interior: each iteration gathers the next 16 rows into line1, filters with
// GaussianShift16, and writes 16 rows of output.
for(j=;j<imgHeight-;j+=){
line1 = rv16(x,j-+); GaussianShift16 line0 = line1;
wv16(x,j,sum );
}
// Last 16 pixels: some pixels may be calculated again if imgHeight is not a multiple of 16.
j = imgHeight-;
line0 = rv16(x,j-);
// Mirror-repeat read: line1 is assembled from the final rows followed by already-read
// lanes in reverse order.
const int y = imgHeight-;
line1.s0123 = (float4)( r(x,y),r(x,y+),r(x,y+),r(x,y+) );
line1.s4567 = (float4)( r(x,y+),line1.s3,line1.s21 );
line1.s89 = (float2)(line1.s0,line0.sf); GaussianShift16
wv16(x,j,sum );
}

总结:1.local_work_size 对时间的影响比较大,有时使用NULL默认的就可以,有时需要一个个去试。

使用vector 类型,local memory,kernel代码结构 都会对 local_work_size 最大值有影响

2.profile中的wait time可能是读写memory还有其它的等待时间,run time是ALU计算执行的时间。

3.避免对global memory的重复读写,预先缓存下来再用

4.image buffer的读写比普通buffer快,也没有按行按列读写的效率差异.尽量使用image buffer

5.read/write_imageui 并不比 read/write_imagef 快,一般就使用float

6.write 比read 要慢很多,内存未对齐也会慢些

7.使用vector 读写,计算 都会更快.image buffer虽然是单点读,组合成vector计算也更快.

8.half类型存在精度问题,会引入误差,在这里也不比float快

9.如果不确定local_work_size,就设置成NULL,让opencl自己选择。

不同的GPU 上local_work_size最大值不一样,比如这个kernel 在Adreno 330上最大64,在adreno 418上最大1024.

opencl gauss filter优化(三)的更多相关文章

  1. opencl gauss filter优化(二)

    1.buffer使用image的方式:Horizontal 与 Vertical 算法一样, 共需30ms,wait time 19ms. const sampler_t sampler = CLK_ ...

  2. opencl gauss filter优化(一)

    Platform: LG G3, Adreno 330 ,img size 3264x2448 C code neon GPU 300 60 29 单位:ms 1. 目前按如下行列分解的方式最快29m ...

  3. Anisotropic gauss filter

    最近一直在做版面分析,其中文本行检测方面,许多文章涉及到了Anigauss也就是各向异性高斯滤波. 顾名思义,简单的理解就是参数不同的二维高斯滤波. 在文章Fast Anisotropic Gauss ...

  4. MySQL优化三(InnoDB优化)

    body { font-family: Helvetica, arial, sans-serif; font-size: 14px; line-height: 1.6; padding-top: 10 ...

  5. App架构师实践指南六之性能优化三

    App架构师实践指南六之性能优化三 2018年08月02日 13:57:57 nicolelili1 阅读数:190   内存性能优化1.内存机制和原理 1.1 内存管理内存时一个基础又高深的话题,从 ...

  6. 【SQL server初级】数据库性能优化三:程序操作优化

    数据库优化包含以下三部分,数据库自身的优化,数据库表优化,程序操作优化.此文为第三部分 数据库性能优化三:程序操作优化 概述:程序访问优化也可以认为是访问SQL语句的优化,一个好的SQL语句是可以减少 ...

  7. MySQL性能优化(三):索引

    原文:MySQL性能优化(三):索引 版权声明:本文为博主原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明. 本文链接:https://blog.csdn.net/vbi ...

  8. js加载优化三

    Javascript性能优化之异步加载和执行 Author:小欧2013-09-17 随着科技的发展,如今的网站和五六年前相比,现在的人们对web的要求越来越高了,用户体验,交互效果,视觉效果等等都有 ...

  9. Android 性能优化 三 布局优化ViewStub标签的使用

    小黑与小白的故事,通过虚拟这两个人物进行一问一答的形式来共同学习ViewStub的使用 小白:Hi,小黑,ViewStub是什么?听说能够用来进行布局优化. 小黑:ViewStub 是一个隐藏的,不占 ...

随机推荐

  1. Android属性动画完全解析(上)

    Android属性动画完全解析(上) 转载:http://blog.csdn.net/guolin_blog/article/details/43536355 在手机上去实现一些动画效果算是件比较炫酷 ...

  2. 2016年11月28日 星期一 --出埃及记 Exodus 20:19

    2016年11月28日 星期一 --出埃及记 Exodus 20:19 and said to Moses, "Speak to us yourself and we will listen ...

  3. python学习笔记二 数据类型(基础篇)

    Python基础 对于Python,一切事物都是对象,对象基于类创建         不同类型的类可以创造出字符串,数字,列表这样的对象,比如"koka".24.['北京', '上 ...

  4. FlashFXP命令行

    flashfxp.exe -upload ftp://user:pass@ip:port -localpath="本地路径"  -remotepath="远程FTP上的路 ...

  5. ubuntu16.04 搭建 Mysql服务器

    ubuntu16.04 安装 mysql5.7 ,并配置远程访问 安装mysql及其可选组件 apt-get install mysql-serverapt-get install mysql-cli ...

  6. C语言中的结构体,结构体数组

    C语言中的结构体是一个小难点,下面我们详细来讲一下:至于什么是结构体,结构体为什么会产生,我就不说了,原因很简单,但是要注意到是结构体也是连续存储的,但要注意的是结构体里面类型各异,所以必然会产生内存 ...

  7. MySQL中别名的使用

    MySQL中别名的使用 为数据列或者表达式起别名时,别名紧跟数据列,中间以空格隔开,或者用关键字as隔开. #为表达式起别名 使用as关键字隔开 as T_ID from teacher_table; ...

  8. [Objective-C]__bridge,__bridge_retained和__bridge_transfer的意思,区别与使用

    使用ARC能帮我们减轻不少内存管理方面的负担,尤其是对用惯了Java的程序员来说.但是像我这种Java基础本身就不牢固,做了两年的iOS已经习惯手动管理内存的半吊子,使用ARC还是经常碰壁. 对于CF ...

  9. crontab执行shell脚本

    */5 * * * * cd /data/**/ && ./*.sh * * * * * /bin/sh /home/*.sh

  10. mysql更改已有数据表的字符集,保留原有数据内容

    mysql更改已有数据表的字符集,保留原有数据内容     原文网址:http://blog.csdn.net/learn_2/article/details/6460370 环境:在应用开始阶段没有 ...