▶ 书上的代码改进而成,从文件读入一张 256 阶灰度图,按照给定的卷积窗口计算卷积,并输出到文件中。

● 代码,使用 9 格的均值窗口,居然硬读写 .bmp 文件,算是了解一下该文件的具体格式,留作纪念吧。

 // convolution.cl
/*
 * convolution01: 2D convolution of a single-channel float image stored in image objects.
 *
 *   inputImage   source image, read through `sampler`
 *   outputImage  destination image; only the x component of each written texel carries data
 *   imageRow     number of image rows (height)
 *   imageCol     number of image columns (width)
 *   filter       filterWidth * filterWidth coefficients, stored row by row
 *   filterWidth  side length of the square filter window
 *   sampler      sampler used for every read; its addressing mode decides what reads
 *                outside the image return
 *
 * The integer literals of this listing were lost during extraction and are restored here.
 * Work-item dimension 0 is taken as the row and dimension 1 as the column, which matches
 * the host code in this file (global size { imageRow, imageCol }).
 */
__kernel void convolution01(__read_only image2d_t inputImage, __write_only image2d_t outputImage,
    int imageRow, int imageCol, __constant float* filter, int filterWidth, sampler_t sampler)
{
    // The work-item ordering does not have to match the (column, row) order used for image reads
    const int row = get_global_id(0), col = get_global_id(1);
    const int halfWidth = filterWidth / 2;
    float4 sum = { 0.0f, 0.0f, 0.0f, 0.0f }, pixel; // read_imagef / write_imagef work on float4
    int i, j, filterIdx;                            // the filter is walked with its own running index

    for (filterIdx = 0, i = -halfWidth; i <= halfWidth; i++)
    {
        for (j = -halfWidth; j <= halfWidth; j++)
        {
            // Image coordinates are (x, y) = (column, row)
            pixel = read_imagef(inputImage, sampler, (int2)(col + j, row + i));
            // Single-channel image: only the first component is meaningful
            sum.x += pixel.x * filter[filterIdx++];
        }
    }

    // Only work-items that fall inside the image write a result
    if (row < imageRow && col < imageCol)
        write_imagef(outputImage, (int2)(col, row), sum);
    return;
}
 // main.cpp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cl.h>

#pragma warning(disable : 4996)

// NOTE: the integer literals of this listing were lost during extraction and several
// statements had been merged into trailing comments; both are restored below.

char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";
const char *inputFile = "R:/input.bmp";
const char *outputFile = "R:/output.bmp";

// Approximate float comparison: relative tolerance 0.001 against b,
// or absolute tolerance 0.001 when b is exactly zero. Returns true when "equal".
bool floatEq(const float a, const float b)
{
    return (b == 0) ? fabs(a) < 0.001 : fabs(a / b - 1) < 0.001;
}

// Reads the whole file at kernelPath into a freshly malloc'ed, NUL-terminated buffer
// stored in *pcode (the caller frees it). Returns the file size plus one, i.e. the
// length including the terminator. Any failure prints a message and exits.
int readText(const char* kernelPath, char **pcode)
{
    FILE *fp;
    int size;
    //printf("<readText> File: %s\n", kernelPath);
    fopen_s(&fp, kernelPath, "rb");
    if (!fp)
    {
        printf("<readText> Open file failed\n");
        getchar();
        exit(-1);
    }
    if (fseek(fp, 0, SEEK_END) != 0)
    {
        printf("<readText> Seek end of file failed\n");
        getchar();
        exit(-1);
    }
    if ((size = ftell(fp)) < 0)
    {
        printf("<readText> Get file position failed\n");
        getchar();
        exit(-1);
    }
    rewind(fp);
    if ((*pcode = (char *)malloc(size + 1)) == NULL)
    {
        printf("<readText> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    fread(*pcode, 1, size, fp);
    (*pcode)[size] = '\0';
    fclose(fp);
    return size + 1;
}

// Writes imageOut (row-major floats, one value per pixel) to `filename` as a .bmp file.
// Everything before the pixel data (headers and whatever else precedes it) is copied
// verbatim from refFilename, which also supplies the row and column counts.
// Assumes an 8-bit-per-pixel, bottom-up .bmp -- TODO confirm for other inputs.
void storeImage(float *imageOut, const char *filename, const char *refFilename)
{
    FILE *ifp, *ofp;
    unsigned char *metaData, temp;
    int offset, i, j, row, col, mod;

    if (fopen_s(&ifp, refFilename, "rb") != 0)
    {
        // The original passed `filename` itself as the printf format string
        printf("<storeImage> Open reference file %s failed\n", refFilename);
        getchar();
        exit(-1);
    }
    fseek(ifp, 10, SEEK_SET);           // byte 10: offset of the pixel data
    fread(&offset, 4, 1, ifp);
    fseek(ifp, 18, SEEK_SET);           // byte 18: width, followed by height
    fread(&col, 4, 1, ifp);
    fread(&row, 4, 1, ifp);
    fseek(ifp, 0, SEEK_SET);
    if ((metaData = (unsigned char *)malloc(offset)) == NULL)
    {
        printf("<storeImage> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    fread(metaData, 1, offset, ifp);    // everything in front of the pixel data
    //printf("Output image %s\n", filename);
    if (fopen_s(&ofp, filename, "wb") != 0)
    {
        printf("<storeImage> Open output file failed\n");
        getchar();
        exit(-1);
    }
    if (fwrite(metaData, 1, offset, ofp) != offset) // copy the meta data unchanged
    {
        printf("<storeImage> Write output metaData failed\n");
        getchar();
        exit(-1);
    }
    // .bmp rows are stored bottom-up, so the image is written starting from its last row
    for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)
    {
        for (j = 0; j < col; j++)
        {
            temp = (unsigned char)imageOut[i * col + j];
            fwrite(&temp, sizeof(unsigned char), 1, ofp);
        }
        // Rows are padded to a multiple of 4 bytes; the padding content is irrelevant
        for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++);
    }

    fclose(ifp);
    fclose(ofp);
    free(metaData);
    return;
}

// Reads a .bmp file into a freshly malloc'ed row-major float array (the caller frees it)
// and reports its size through outputRow / outputCol.
// Assumes an 8-bit-per-pixel, bottom-up .bmp -- TODO confirm for other inputs.
float *readImage(const char *filename, int *outputRow, int *outputCol)
{
    unsigned char temp;
    int i, j, row, col, offset, mod;
    float *outputImage;
    FILE *fp;

    if (fopen_s(&fp, filename, "rb") != 0)
    {
        printf("<readImage> Open file failed\n");
        getchar();
        exit(-1);
    }
    fseek(fp, 10, SEEK_SET);            // byte 10: size of the meta data (pixel data offset)
    fread(&offset, 4, 1, fp);
    fseek(fp, 18, SEEK_SET);            // byte 18: number of columns, then number of rows
    fread(&col, 4, 1, fp);
    fread(&row, 4, 1, fp);
    printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);

    if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)
    {
        printf("<readImage> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    fseek(fp, offset, SEEK_SET);        // pixel data starts where the meta data ends
    fflush(NULL);
    // The file is read front to back while the array is filled from its last row,
    // because .bmp rows are stored bottom-up; mod is the per-row padding in bytes
    for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)
    {
        for (j = 0; j < col; j++)
        {
            fread(&temp, sizeof(unsigned char), 1, fp);
            outputImage[i * col + j] = (float)temp;
        }
        // Skip the padding bytes; they are present in the file but are not image data
        for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++);
    }
    fclose(fp);

    *outputRow = row;
    *outputCol = col;
    return outputImage;
}

// Convolves the input image on an OpenCL device using image objects (kernel
// convolution01), writes the result to outputFile and checks the interior pixels
// against a host-side reference computation.
int main()
{
    int imageRow, imageCol, dataSize, row, col, i, j, correct;
    float *inputImage, *outputImage, sum;

    inputImage = readImage(inputFile, &imageRow, &imageCol); // image data plus row/column counts
    dataSize = imageRow * imageCol * sizeof(float);
    outputImage = (float*)malloc(dataSize);

    // 7 x 7 window whose central 3 x 3 cells average nine pixels
    const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2;
    float filter[] =
    {
        0, 0, 0,       0,       0,       0, 0,
        0, 0, 0,       0,       0,       0, 0,
        0, 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 0,
        0, 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 0,
        0, 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 0,
        0, 0, 0,       0,       0,       0, 0,
        0, 0, 0,       0,       0,       0, 0
    };

    cl_int status;
    cl_uint nPlatform;
    clGetPlatformIDs(0, NULL, &nPlatform);
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    clGetPlatformIDs(nPlatform, listPlatform, NULL);
    cl_uint nDevice = 0;
    clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);

    cl_image_format format;                         // image format descriptor
    format.image_channel_order = CL_R;              // single channel
    format.image_channel_data_type = CL_FLOAT;      // float texels (file data is uchar, the computation needs float)

    cl_mem d_inputImage, d_outputImage, d_filter;
    if (true)   // legacy OpenCL entry point clCreateImage2D
    {
        d_inputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status);
        d_outputImage = clCreateImage2D(context, 0, &format, imageCol, imageRow, 0, NULL, &status);
    }
    if (false)  // newer OpenCL: image descriptor cl_image_desc plus clCreateImage
    {
        cl_image_desc desc;
        desc.image_type = CL_MEM_OBJECT_IMAGE2D;
        desc.image_width = imageCol;
        desc.image_height = imageRow;
        desc.image_depth = 0;
        desc.image_array_size = 0;
        desc.image_row_pitch = 0;
        desc.image_slice_pitch = 0;
        desc.num_mip_levels = 0;
        desc.num_samples = 0;
        desc.buffer = NULL;

        d_inputImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, &status);
        d_outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &status);
    }
    d_filter = clCreateBuffer(context, 0, filterSize * sizeof(float), NULL, &status);

    // Origin and extent for image transfers; the extent lists columns first, then rows
    size_t origin[] = { 0, 0, 0 }, region[] = { (size_t)imageCol, (size_t)imageRow, 1 };
    clEnqueueWriteImage(queue, d_inputImage, CL_TRUE, origin, region, 0, 0, inputImage, 0, NULL, NULL);
    clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0, filterSize * sizeof(float), filter, 0, NULL, NULL);

    // Sampler: unnormalized coordinates, clamp to edge, nearest texel
    cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);

    char *code;
    size_t length = readText(sourceText, &code);
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, NULL);
    clBuildProgram(program, 1, listDevice, NULL, NULL, NULL);
    cl_kernel kernel = clCreateKernel(program, "convolution01", &status);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
    clSetKernelArg(kernel, 2, sizeof(int), &imageRow);
    clSetKernelArg(kernel, 3, sizeof(int), &imageCol);
    clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
    clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
    clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);

    // Dimension 0 = rows, dimension 1 = columns; the local size is left to the runtime (NULL)
    size_t globalSize[] = { (size_t)imageRow, (size_t)imageCol };
    status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL);
    clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, region, 0, 0, outputImage, 0, NULL, NULL);

    storeImage(outputImage, outputFile, inputFile); // write the result image to disk

    // Verify the device result; only pixels whose whole window lies inside the image are compared
    for (row = 0, correct = 1; row < imageRow && correct; row++)
    {
        for (col = 0; col < imageCol && correct; col++)
        {
            sum = 0;
            for (i = -halfFilterWidth; i <= halfFilterWidth; i++)
            {
                for (j = -halfFilterWidth; j <= halfFilterWidth; j++)
                {
                    if (row + i >= 0 && row + i < imageRow && col + j >= 0 && col + j < imageCol)
                        sum += inputImage[(row + i) * imageCol + col + j] * filter[(i + halfFilterWidth) * filterWidth + j + halfFilterWidth];
                }
            }
            if (row >= halfFilterWidth && row < imageRow - halfFilterWidth && col >= halfFilterWidth && col < imageCol - halfFilterWidth &&
                !floatEq(outputImage[row * imageCol + col], sum))
            {
                printf("Error at [%d,%d], output:%f, ref:%f\n", row, col, outputImage[row*imageCol + col], sum);
                correct = 0;
            }
        }
    }
    if (correct)
        printf("Result correct.\n");

    free(listPlatform);
    free(listDevice);
    free(inputImage);
    free(outputImage);
    free(code);
    clReleaseContext(context);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseMemObject(d_inputImage);
    clReleaseMemObject(d_outputImage);
    clReleaseMemObject(d_filter);
    clReleaseSampler(sampler);
    getchar();
    return 0;
}

● 输出结果,给了一张 400 × 400 的图片参与,另外,计算一张 5040 × 7000 的图片需要 23 ms。

<readImage> Input image R:/input.bmp, width = 400, height = 400
Output image R:/output.bmp
Result correct.

  

● 代码,使用局部内存优化

 // convolution.cl
/*
 * convolution02: 2D convolution on plain float buffers, staging each work-group's
 * input tile (including the halo) in local memory first.
 *
 *   inputImage / outputImage  row-major float images with imageCol values per row
 *   imageRow, imageCol        image size in rows / columns
 *   filter                    filterWidth * filterWidth coefficients, row by row
 *   localMem                  local scratch of localMemRow * localMemCol floats
 *   localMemRow, localMemCol  tile size: the work-group size plus filterWidth - 1 per axis
 *
 * Work-item dimension 0 runs along columns and dimension 1 along rows. Only output pixels
 * whose whole window lies inside the image are written; the result is stored at the
 * window centre. The integer literals of this listing were lost during extraction and
 * are restored here.
 */
__kernel void convolution02(__global float* inputImage, __global float* outputImage, int imageRow, int imageCol,
    __constant float* filter, int filterWidth, __local float* localMem, int localMemRow, int localMemCol)
{
    const int groupCol = get_group_id(0) * get_local_size(0), groupRow = get_group_id(1) * get_local_size(1);
    const int localCol = get_local_id(0), localRow = get_local_id(1);
    const int globalCol = groupCol + localCol, globalRow = groupRow + localRow;
    const int halfWidth = filterWidth / 2;
    int i, j, curRow, curCol, filterIndex;
    float sum;

    // Copy the source tile into local memory; every work-item strides over the tile
    for (i = localRow; i < localMemRow; i += get_local_size(1))
    {
        curRow = groupRow + i;
        for (j = localCol; j < localMemCol; j += get_local_size(0))
        {
            curCol = groupCol + j;
            if (curRow < imageRow && curCol < imageCol)
                localMem[i * localMemCol + j] = inputImage[curRow * imageCol + curCol];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Convolve; only work-items whose window lies completely inside the image take part
    if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1)
    {
        sum = 0.0f, filterIndex = 0;
        for (i = localRow; i < localRow + filterWidth; i++)
        {
            for (j = localCol; j < localCol + filterWidth; j++)
                sum += localMem[i * localMemCol + j] * filter[filterIndex++];
        }
        // Inner loop unrolled
        /*
        for (i = localRow; i < localRow + filterWidth; i++)
        {
            int offset = i * localMemCol + localCol;
            sum += localMem[offset++] * filter[filterIndex++];// number of lines equals filterWidth
            sum += localMem[offset++] * filter[filterIndex++];
            sum += localMem[offset++] * filter[filterIndex++];
            sum += localMem[offset++] * filter[filterIndex++];
            sum += localMem[offset++] * filter[filterIndex++];
            sum += localMem[offset++] * filter[filterIndex++];
            sum += localMem[offset++] * filter[filterIndex++];
        }
        */
        // Both loops fully unrolled
        /*
        int offset = localRow*localMemCol + localCol;
        sum += localMem[offset + 0] * filter[filterIndex++];
        sum += localMem[offset + 1] * filter[filterIndex++];
        sum += localMem[offset + 2] * filter[filterIndex++];
        sum += localMem[offset + 3] * filter[filterIndex++];
        sum += localMem[offset + 4] * filter[filterIndex++];
        sum += localMem[offset + 5] * filter[filterIndex++];
        sum += localMem[offset + 6] * filter[filterIndex++];
        offset += localMemCol;
        sum += localMem[offset + 0] * filter[filterIndex++];
        sum += localMem[offset + 1] * filter[filterIndex++];
        sum += localMem[offset + 2] * filter[filterIndex++];
        sum += localMem[offset + 3] * filter[filterIndex++];
        sum += localMem[offset + 4] * filter[filterIndex++];
        sum += localMem[offset + 5] * filter[filterIndex++];
        sum += localMem[offset + 6] * filter[filterIndex++];
        offset += localMemCol;
        sum += localMem[offset + 0] * filter[filterIndex++];
        sum += localMem[offset + 1] * filter[filterIndex++];
        sum += localMem[offset + 2] * filter[filterIndex++];
        sum += localMem[offset + 3] * filter[filterIndex++];
        sum += localMem[offset + 4] * filter[filterIndex++];
        sum += localMem[offset + 5] * filter[filterIndex++];
        sum += localMem[offset + 6] * filter[filterIndex++];
        offset += localMemCol;
        sum += localMem[offset + 0] * filter[filterIndex++];
        sum += localMem[offset + 1] * filter[filterIndex++];
        sum += localMem[offset + 2] * filter[filterIndex++];
        sum += localMem[offset + 3] * filter[filterIndex++];
        sum += localMem[offset + 4] * filter[filterIndex++];
        sum += localMem[offset + 5] * filter[filterIndex++];
        sum += localMem[offset + 6] * filter[filterIndex++];
        offset += localMemCol;
        sum += localMem[offset + 0] * filter[filterIndex++];
        sum += localMem[offset + 1] * filter[filterIndex++];
        sum += localMem[offset + 2] * filter[filterIndex++];
        sum += localMem[offset + 3] * filter[filterIndex++];
        sum += localMem[offset + 4] * filter[filterIndex++];
        sum += localMem[offset + 5] * filter[filterIndex++];
        sum += localMem[offset + 6] * filter[filterIndex++];
        offset += localMemCol;
        sum += localMem[offset + 0] * filter[filterIndex++];
        sum += localMem[offset + 1] * filter[filterIndex++];
        sum += localMem[offset + 2] * filter[filterIndex++];
        sum += localMem[offset + 3] * filter[filterIndex++];
        sum += localMem[offset + 4] * filter[filterIndex++];
        sum += localMem[offset + 5] * filter[filterIndex++];
        sum += localMem[offset + 6] * filter[filterIndex++];
        offset += localMemCol;
        sum += localMem[offset + 0] * filter[filterIndex++];
        sum += localMem[offset + 1] * filter[filterIndex++];
        sum += localMem[offset + 2] * filter[filterIndex++];
        sum += localMem[offset + 3] * filter[filterIndex++];
        sum += localMem[offset + 4] * filter[filterIndex++];
        sum += localMem[offset + 5] * filter[filterIndex++];
        sum += localMem[offset + 6] * filter[filterIndex++];
        */
        // Store the result at the centre of the window
        outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = sum;
    }
    return;
}

/*
 * convolution03: same convolution as convolution02, but the local tile is filled with
 * one float4 load per work-item instead of scalar loads.
 *
 * inputImage is the same row-major image viewed as float4, so the code indexes it with
 * imageCol / 4 and localMemCol / 4; both are presumably required to be multiples of 4
 * (the host code in this file rounds the tile width up to 4) -- TODO confirm.
 * The host code in this file marks this variant as still having problems.
 */
__kernel void convolution03(__global float4* inputImage, __global float* outputImage, int imageRow, int imageCol,
    __constant float* filter, int filterWidth, __local float* localMem, int localMemRow, int localMemCol)
{
    // Group origin measured in float4 columns / rows
    const int groupCol4 = get_group_id(0) * get_local_size(0) / 4, groupRow4 = get_group_id(1) * get_local_size(1);
    // Flatten the 2D local id, then spread the work-items over the float4 cells of the tile
    const int localId = get_local_id(1) * get_local_size(0) + get_local_id(0);
    int localCol = (localId % (localMemCol / 4)), localRow = (localId / (localMemCol / 4));
    int globalCol = groupCol4 + localCol, globalRow = groupRow4 + localRow;
    const int halfWidth = filterWidth / 2;

    // View of this work-item's float4 cell inside the local tile
    __local float4 *localImage4 = (__local float4*)&localMem[localRow*localMemCol + localCol * 4];

    if (globalRow < imageRow && globalCol < imageCol / 4 && localRow < localMemRow)
        localImage4[0] = inputImage[globalRow*imageCol / 4 + globalCol];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Switch back to per-pixel coordinates for the output stage
    localCol = get_local_id(0);
    localRow = get_local_id(1);
    globalCol = get_group_id(0)*get_local_size(0) + localCol;
    globalRow = get_group_id(1)*get_local_size(1) + localRow;

    // Convolve
    int i, j, filterIndex;
    float sum;
    if (globalRow < imageRow - filterWidth + 1 && globalCol < imageCol - filterWidth + 1)
    {
        sum = 0.0f, filterIndex = 0;
        for (i = localRow; i < localRow + filterWidth; i++)
        {
            for (j = localCol; j < localCol + filterWidth; j++)
                sum += localMem[i * localMemCol + j] * filter[filterIndex++];
        }
        // The loops can be unrolled exactly as in convolution02
        // The listing stored a debugging constant here with "sum" commented out; that
        // constant was lost in extraction, so the computed value is written instead.
        outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = sum;
    }
    return;
}
 // main.cpp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cl.h>

#pragma warning(disable : 4996)

// NOTE: the integer literals of this listing were lost during extraction and the
// preprocessor directives had been merged onto shared lines; both are restored below.

#define NON_OPTIMIZED       // no optimisation, uses kernel convolution02
//#define READ_ALIGNED      // aligned-memory optimisation, uses kernel convolution02
//#define READ4             // float4 loads into local memory, uses kernel convolution03; still has problems
#define WGX 16              // work-group size
#define WGY 16

char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";
const char *inputFile = "R:/input.bmp";
const char *outputFile = "R:/output.bmp";

// Rounds value up to the next multiple of base (value itself when already a multiple).
unsigned int roundUp(unsigned int value, unsigned int base)
{
    unsigned int remainder = value % base;
    return remainder == 0 ? value : (value + base - remainder);
}

// Reads the whole file at kernelPath into a freshly malloc'ed, NUL-terminated buffer
// stored in *pcode (the caller frees it). Returns the file size plus one, i.e. the
// length including the terminator. Any failure prints a message and exits.
int readText(const char* kernelPath, char **pcode)
{
    FILE *fp;
    int size;
    //printf("<readText> File: %s\n", kernelPath);
    fopen_s(&fp, kernelPath, "rb");
    if (!fp)
    {
        printf("<readText> Open file failed\n");
        getchar();
        exit(-1);
    }
    if (fseek(fp, 0, SEEK_END) != 0)
    {
        printf("<readText> Seek end of file failed\n");
        getchar();
        exit(-1);
    }
    if ((size = ftell(fp)) < 0)
    {
        printf("<readText> Get file position failed\n");
        getchar();
        exit(-1);
    }
    rewind(fp);
    if ((*pcode = (char *)malloc(size + 1)) == NULL)
    {
        printf("<readText> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    fread(*pcode, 1, size, fp);
    (*pcode)[size] = '\0';
    fclose(fp);
    return size + 1;
}

// Writes imageOut (row-major floats, one value per pixel) to `filename` as a .bmp file.
// Everything before the pixel data is copied verbatim from refFilename, which also
// supplies the row and column counts.
// Assumes an 8-bit-per-pixel, bottom-up .bmp -- TODO confirm for other inputs.
void storeImage(float *imageOut, const char *filename, const char *refFilename)
{
    FILE *ifp, *ofp;
    unsigned char *metaData, temp;
    int offset, i, j, row, col, mod;

    if (fopen_s(&ifp, refFilename, "rb") != 0)
    {
        // The original passed `filename` itself as the printf format string
        printf("<storeImage> Open reference file %s failed\n", refFilename);
        getchar();
        exit(-1);
    }
    fseek(ifp, 10, SEEK_SET);           // byte 10: offset of the pixel data
    fread(&offset, 4, 1, ifp);
    fseek(ifp, 18, SEEK_SET);           // byte 18: width, followed by height
    fread(&col, 4, 1, ifp);
    fread(&row, 4, 1, ifp);
    fseek(ifp, 0, SEEK_SET);
    if ((metaData = (unsigned char *)malloc(offset)) == NULL)
    {
        printf("<storeImage> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    fread(metaData, 1, offset, ifp);    // everything in front of the pixel data

    if (fopen_s(&ofp, filename, "wb") != 0)
    {
        printf("<storeImage> Open output file failed\n");
        getchar();
        exit(-1);
    }
    if (fwrite(metaData, 1, offset, ofp) != offset)
    {
        printf("<storeImage> Write output metaData failed\n");
        getchar();
        exit(-1);
    }
    // .bmp rows are stored bottom-up, so the image is written starting from its last row
    for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)
    {
        for (j = 0; j < col; j++)
        {
            temp = (unsigned char)imageOut[i * col + j];
            fwrite(&temp, sizeof(unsigned char), 1, ofp);
        }
        // Rows are padded to a multiple of 4 bytes; the padding content is irrelevant
        for (j = 0; j < mod; fwrite(&temp, sizeof(unsigned char), 1, ofp), j++);
    }

    fclose(ifp);
    fclose(ofp);
    free(metaData);
    return;
}

// Reads a .bmp file into a freshly malloc'ed row-major float array (the caller frees it)
// and reports its size through outputRow / outputCol.
// Assumes an 8-bit-per-pixel, bottom-up .bmp -- TODO confirm for other inputs.
float *readImage(const char *filename, int *outputRow, int *outputCol)
{
    unsigned char temp;
    int i, j, row, col, offset, mod;
    float *outputImage;
    FILE *fp;

    if (fopen_s(&fp, filename, "rb") != 0)
    {
        printf("<readImage> Open file failed\n");
        getchar();
        exit(-1);
    }
    fseek(fp, 10, SEEK_SET);            // byte 10: pixel data offset
    fread(&offset, 4, 1, fp);
    fseek(fp, 18, SEEK_SET);            // byte 18: number of columns, then number of rows
    fread(&col, 4, 1, fp);
    fread(&row, 4, 1, fp);
    printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);

    if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)
    {
        printf("<readImage> Allocate space failed\n");
        getchar();
        exit(-1);
    }
    fseek(fp, offset, SEEK_SET);
    fflush(NULL);
    // Rows are stored bottom-up in the file; mod is the per-row padding in bytes
    for (i = row - 1, mod = (col % 4 == 0 ? 0 : 4 - col % 4); i >= 0; i--)
    {
        for (j = 0; j < col; j++)
        {
            fread(&temp, sizeof(unsigned char), 1, fp);
            outputImage[i * col + j] = (float)temp;
        }
        for (j = 0; j < mod; fread(&temp, sizeof(unsigned char), 1, fp), j++);
    }
    fclose(fp);

    *outputRow = row;
    *outputCol = col;
    return outputImage;
}

// Convolves the input image on an OpenCL device using plain buffers and a local-memory
// tile (kernel convolution02, or convolution03 when READ4 is selected), then writes the
// result to outputFile. The variant is chosen with the macros at the top.
int main()
{
    int imageRow, imageCol, dataSize, deviceRow, deviceCol, deviceDataSize;
    float *inputImage, *outputImage;
    inputImage = readImage(inputFile, &imageRow, &imageCol);
    dataSize = imageRow * imageCol * sizeof(float);
    outputImage = (float*)malloc(dataSize);

    // Column count used on the device
#ifdef NON_OPTIMIZED                    // unchanged
    deviceCol = imageCol;
#else                                   // padded up to a multiple of the work-group width
    deviceCol = roundUp(imageCol, WGX);
#endif
    deviceRow = imageRow;               // the row count is never changed
    deviceDataSize = sizeof(float) * deviceRow * deviceCol;

    // 7 x 7 window whose central 3 x 3 cells average nine pixels
    const int filterWidth = 7, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2;
    float filter[] =
    {
        0, 0, 0,       0,       0,       0, 0,
        0, 0, 0,       0,       0,       0, 0,
        0, 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 0,
        0, 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 0,
        0, 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 0,
        0, 0, 0,       0,       0,       0, 0,
        0, 0, 0,       0,       0,       0, 0
    };

    cl_int status;
    cl_uint nPlatform;
    clGetPlatformIDs(0, NULL, &nPlatform);
    cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
    clGetPlatformIDs(nPlatform, listPlatform, NULL);
    cl_uint nDevice = 0;
    clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);
    cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);
    cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
    cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], 0, &status);

    // Plain buffers are used here instead of image objects
    cl_mem d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL);
    cl_mem d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL);
    cl_mem d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * filterSize, NULL, NULL);

    // Upload the input image
#if defined NON_OPTIMIZED               // straight copy
    clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize, inputImage, 0, NULL, NULL);
#else                                   // row-by-row copy into the padded device layout
    // The copied region is one host row wide: region[0] must not exceed the host row
    // pitch, so it is sizeof(float) * imageCol rather than the padded device width.
    size_t d_origin[] = { 0, 0, 0 }, h_origin[] = { 0, 0, 0 }, region[] = { sizeof(float) * imageCol, (size_t)imageRow, 1 };
    clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, inputImage, 0, NULL, NULL);
#endif
    clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, sizeof(float) * filterSize, filter, 0, NULL, NULL);

    char *code;
    size_t length = readText(sourceText, &code);
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&code, &length, &status);
    status = clBuildProgram(program, 1, listDevice, NULL, NULL, NULL);

    // Pick the kernel that matches the selected variant
#if defined NON_OPTIMIZED || defined READ_ALIGNED
    cl_kernel kernel = clCreateKernel(program, "convolution02", NULL);
#else
    cl_kernel kernel = clCreateKernel(program, "convolution03", NULL);
#endif

    // One work-item per output pixel that has a complete window, rounded up to whole work-groups
    size_t globalSize[] = { roundUp(imageCol - filterWidth + 1, WGX), roundUp(imageRow - filterWidth + 1, WGY) }, localSize[] = { WGX, WGY };

    // Local tile size: the work-group size extended by the halo
    int localRow, localCol;
    localRow = localSize[1] + filterWidth - 1;
#if defined NON_OPTIMIZED || defined READ_ALIGNED
    localCol = localSize[0] + filterWidth - 1;
#else
    localCol = roundUp(localSize[0] + filterWidth - 1, 4); // additionally aligned to a multiple of 4 for float4 loads
#endif
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
    clSetKernelArg(kernel, 2, sizeof(int), &deviceRow);
    clSetKernelArg(kernel, 3, sizeof(int), &deviceCol);
    clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
    clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
    clSetKernelArg(kernel, 6, sizeof(float) * localCol * localRow, NULL); // local memory: size only, no host pointer
    clSetKernelArg(kernel, 7, sizeof(int), &localRow);
    clSetKernelArg(kernel, 8, sizeof(int), &localCol);

    status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);

    // Read the result back
#if defined NON_OPTIMIZED
    clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, deviceDataSize, outputImage, 0, NULL, NULL);
#else                                   // the outer border of filterWidth / 2 pixels is not read back
    d_origin[0] = halfFilterWidth * sizeof(float), d_origin[1] = halfFilterWidth, d_origin[2] = 0;
    h_origin[0] = halfFilterWidth * sizeof(float), h_origin[1] = halfFilterWidth, h_origin[2] = 0;
    region[0] = (imageCol - filterWidth + 1) * sizeof(float), region[1] = (imageRow - filterWidth + 1), region[2] = 1;
    clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, 0, sizeof(float) * imageCol, 0, outputImage, 0, NULL, NULL);
#endif
    storeImage(outputImage, outputFile, inputFile);

    // The result check of the image-object version is omitted here
    free(listPlatform);
    free(listDevice);
    free(inputImage);
    free(outputImage);
    free(code);
    clReleaseContext(context);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseMemObject(d_inputImage);
    clReleaseMemObject(d_outputImage);
    clReleaseMemObject(d_filter);
    printf("Finshed.\n");
    getchar();
    return 0;
}

● 输出结果,与上面的简单方法相同

● 用到的函数和定义

 //cl.h
// Excerpt of the cl.h definitions and functions used by the samples.
// Sampler addressing modes (how out-of-range coordinates are handled)
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 // Sampler filter (interpolation) modes follow
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141 // Sampler handle and image descriptor types used by the samples follow
typedef struct _cl_sampler* cl_sampler;
typedef struct _cl_image_format
{
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format; typedef struct _cl_image_desc
{
cl_mem_object_type image_type;
size_t image_width;
size_t image_height;
size_t image_depth;
size_t image_array_size;
size_t image_row_pitch;
size_t image_slice_pitch;
cl_uint num_mip_levels;
cl_uint num_samples;
cl_mem buffer;
} cl_image_desc; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage2D(// image creation function deprecated in OpenCL 1.2
cl_context, // context
cl_mem_flags, // memory flags
const cl_image_format *,// image format descriptor
size_t, // width
size_t, // height
size_t, // row pitch
void *, // host data pointer
cl_int * // receives the status code
) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(// image creation function introduced in OpenCL 1.2
cl_context, // context
cl_mem_flags, // memory flags
const cl_image_format *,// image format descriptor
const cl_image_desc *, // image descriptor
void *, // host data pointer
cl_int * // receives the status code
) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSampler(// creates a sampler
cl_context, // context
cl_bool, // whether normalized coordinates are used
cl_addressing_mode, // addressing mode for out-of-range coordinates
cl_filter_mode, // filter (interpolation) mode
cl_int * // receives the status code
) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(// rectangular write into a buffer
cl_command_queue, // command queue
cl_mem, // destination buffer
cl_bool, // blocking flag
const size_t *, // origin inside the buffer
const size_t *, // origin inside the host data
const size_t *, // region to copy, three values: bytes per row, number of rows, number of slices
size_t, // buffer row pitch
size_t, // buffer slice pitch
size_t, // host data row pitch
size_t, // host data slice pitch
const void *, // host data pointer
cl_uint, // number of events in the wait list
const cl_event *, // event wait list
cl_event * // event for this command
) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(// rectangular read from a buffer; parameters as for clEnqueueWriteBufferRect
cl_command_queue,
cl_mem,
cl_bool,
const size_t *,
const size_t *,
const size_t *,
size_t,
size_t,
size_t,
size_t,
void *,
cl_uint,
const cl_event *,
cl_event *
) CL_API_SUFFIX__VERSION_1_1;

OpenCL 图像卷积 1的更多相关文章

  1. OpenCL 图像卷积 2

    ▶ 上一篇图像卷积 http://www.cnblogs.com/cuancuancuanhao/p/8535569.html.这篇使用了 OpenCV 从文件读取彩色的 jpeg 图像,进行边缘检测 ...

  2. OpenCL 图像卷积 3 使用 CPU

    ▶ CPU 图像卷积,共四种方法.分别为基本串行,使用模板,使用局部内存,使用AVX指令优化 ● 全部的代码,仅在主函数中选择调用的函数名即可. #include <stdio.h> #i ...

  3. 图像卷积、相关以及在MATLAB中的操作

    图像卷积.相关以及在MATLAB中的操作 2016年7月11日 20:34:35, By ChrisZZ 区分卷积和相关 图像处理中常常需要用一个滤波器做空间滤波操作.空间滤波操作有时候也被叫做卷积滤 ...

  4. SSE图像算法优化系列十一:使用FFT变换实现图像卷积。

    本文重点主要不在于FFT的SSE优化,而在于使用FFT实现快速卷积的相关技巧和过程. 关于FFT变换,有很多参考的代码,特别是对于长度为2的整数次幂的序列,实现起来也是非常简易的,而对于非2次幂的序列 ...

  5. zz图像卷积与滤波的一些知识点

    Xinwei: 写的通俗易懂,终于让我这个不搞CV.不搞图像的外行理解卷积和滤波了. 图像卷积与滤波的一些知识点 zouxy09@qq.com http://blog.csdn.net/zouxy09 ...

  6. 对抗生成网络-图像卷积-mnist数据生成(代码) 1.tf.layers.conv2d(卷积操作) 2.tf.layers.conv2d_transpose(反卷积操作) 3.tf.layers.batch_normalize(归一化操作) 4.tf.maximum(用于lrelu) 5.tf.train_variable(训练中所有参数) 6.np.random.uniform(生成正态数据

    1. tf.layers.conv2d(input, filter, kernel_size, stride, padding) # 进行卷积操作 参数说明:input输入数据, filter特征图的 ...

  7. UFLDL教程笔记及练习答案五(自编码线性解码器与处理大型图像**卷积与池化)

    自己主动编码线性解码器 自己主动编码线性解码器主要是考虑到稀疏自己主动编码器最后一层输出假设用sigmoid函数.因为稀疏自己主动编码器学习是的输出等于输入.simoid函数的值域在[0,1]之间,这 ...

  8. TensorFlow实现图像卷积并可视化示例

    图片尺寸要自己修改. 看起来好像没啥意思,不知道下一步能干什么,先卷了再说.由于weights是随机生成的(tf.random_normal作用:用于从服从指定正太分布的数值中取出随机数),所以每次卷 ...

  9. opencv:图像卷积

    卷积基本概念 C++代码实现卷积 #include <opencv2/opencv.hpp> #include <iostream> using namespace cv; u ...

随机推荐

  1. (GoRails)ActiveRecord --explain方法:(优化你的查询)

    https://gorails.com/episodes/activerecord-explain?autoplay=1 比如没有加index的查询和加了index的查询,调用数据库的速度就差5倍. ...

  2. vue全家桶实现笔记本功能

    一个通过vue实现的练手小项目,数据保存和导出通过node进行处理 成品截图: 安装vue-cli,webpack: cnpm install webpack -g cnpm install vue- ...

  3. ansible常用套路(一)

    一.SSH互信 1 配置/etc/ansible/hosts 文件 [zabbix_agent] 172.26.4.203 172.26.4.204 172.26.4.205 [zabbix_agen ...

  4. linux下redis的安装及配置启动

    linux下redis的安装及配置启动 标签: redisnosql 2014-10-24 14:04 19732人阅读 评论(0) 收藏 举报  分类: 数据与性能(41)  wget http:/ ...

  5. 99%的人都理解错了HTTP中GET与POST的区别(转自知乎)

    作者:Larry链接:https://zhuanlan.zhihu.com/p/22536382来源:知乎著作权归作者所有.商业转载请联系作者获得授权,非商业转载请注明出处. GET和POST是HTT ...

  6. java 生成xml文件

    这里也使用的是import org.w3c.dom.Document; 首先创建document对象,给该对象赋值,然后将document对象使用transformer的transformer转换方法 ...

  7. css中用#id.class的形式定义样式,为什么这样用,不直接写成.class.代码如下:#skin_0.selected{}这种的

    <ul class="skin"> <li id="skin_0" title="蓝色" class="sele ...

  8. timer Compliant Controller project (1)--Product introduction meeting

    Last week ,I lead the meeting for new project. i'm  very excited. The meeting is divided into the fo ...

  9. ReactNative 问题集合

    在线流畅播放视频 (建议调用原生的视频组件, 或者在成熟的安卓 ios组件封装一层) 如何指定页面横屏 (在这个组件react-native-orientation基础上改写) webSocket不和 ...

  10. nmcli 命令的基本使用

    nmcli命令 地址配置工具:nmcli nmcli  device  查看所有网卡的信息 nmcli  device  status 和numcli device 相同 nmcli  device ...