OpenCL 图像卷积 1

▶ 书上的代码改进而成，从文件读入一张 256 阶灰度图，按照给定的卷积窗口计算卷积，并输出到文件中。

● 代码，使用 9 格的均值窗口，居然硬读写 .bmp 文件，算是了解一下该文件的具体格式，留作纪念吧。

 // convolution.cl

 __kernel void convolution01(__read_only image2d_t inputImage, __write_only image2d_t outputImage,

     int imageRow, int imageCol, __constant float* filter, int filterWidth, sampler_t sampler)

 {

     const int row = get_global_id(), col = get_global_id();   // 注意工作项的顺序，可以和图像读取不一样

     const int halfWidth = filterWidth / ;

     float4 sum = { 0.0f, 0.0f, 0.0f, 0.0f }, pixel;             // 输出数据类型是四元浮点数

     int i, j, filterIdx;                                        // 卷积窗口单独用一个下标遍历

     for (filterIdx = , i = -halfWidth; i <= halfWidth; i++)

     {

         for (j = -halfWidth; j <= halfWidth; j++)

         {

             pixel = read_imagef(inputImage, sampler, (int2)(col + j, row + i)); // 读取目标坐标，注意列在前行在后

             sum.x += pixel.x * filter[filterIdx++];                             // 采用了单通道，只有第一分量有效

         }

     }

     if (row < imageRow && col < imageCol)                       // 将落在有效范围内的计算数据输出

         write_imagef(outputImage, (int2)(col, row), sum);

     return;

 }

 // main.cpp

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #include <cl.h>

 #pragma warning(disable : 4996)

 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";

 const char *inputFile = "R:/input.bmp";

 const char *outputFile = "R:/output.bmp";

 bool floatEq(const float a, const float b)// 相等返回 1

 {

     return (b == ) ? fabs(a) < 0.001 : fabs(a / b - ) < 0.001;

 }

 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode，返回字符串长度

 {

     FILE *fp;

     int size;

     //printf("<readText> File: %s\n", kernelPath);

     fopen_s(&fp, kernelPath, "rb");

     if (!fp)

     {

         printf("<readText> Open file failed\n");

         getchar();

         exit(-);

     }

     if (fseek(fp, , SEEK_END) != )

     {

         printf("<readText> Seek end of file failed\n");

         getchar();

         exit(-);

     }

     if ((size = ftell(fp)) < )

     {

         printf("<readText> Get file position failed\n");

         getchar();

         exit(-);

     }

     rewind(fp);

     if ((*pcode = (char *)malloc(size + )) == NULL)

     {

         printf("<readText> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(*pcode, , size, fp);

     (*pcode)[size] = '\0';

     fclose(fp);

     return size + ;

 }

 void storeImage(float *imageOut, const char *filename, const char *refFilename)// 输出图片

 {

     FILE *ifp, *ofp;

     unsigned char *metaData, temp;

     int offset, i, j, row, col, mod;

     if (fopen_s(&ifp, refFilename, "rb") != )// 从 参考图片（输入文件）中读取需要的行列数

     {

         printf(filename);

         exit(-);

     }

     fseek(ifp, , SEEK_SET);

     fread(&offset, , , ifp);

     fseek(ifp, , SEEK_SET);

     fread(&col, , , ifp);

     fread(&row, , , ifp);

     fseek(ifp, , SEEK_SET);

     if ((metaData = (unsigned char *)malloc(offset)) == NULL)

     {

         printf("<storeImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(metaData, , offset, ifp);                  // 从输入文件中读取元信息

     //printf("Output image %s\n", filename);

     if (fopen_s(&ofp, filename, "wb") != )

     {

         printf("<storeImage> Open output file failed\n");

         getchar();

         exit(-);

     }

     if (fwrite(metaData, , offset, ofp) != offset)   //  将元信息原封不动的放入输出文件中

     {

         printf("<storeImage> Write output metaData failed\n");

         getchar();

         exit(-);

     }

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)// .bmp 行是颠倒的，倒着填充

     {

         for (j = ; j < col; j++)

         {

             temp = (unsigned char)imageOut[i * col + j];

             fwrite(&temp, sizeof(unsigned char), , ofp);

         }

         for (j = ; j < mod; fwrite(&temp, sizeof(unsigned char), , ofp), j++);// 列数非 4 的倍数时补上 junk padding

     }

     fclose(ifp);

     fclose(ofp);

     free(metaData);

     return;

 }

 float *readImage(const char *filename, int *outputRow, int *outputCol)// 从文件读取图片

 {

     unsigned char temp;

     int i, j, row, col, offset, mod;

     float *outputImage;

     FILE *fp;

     if (fopen_s(&fp, filename, "rb") != )

     {

         printf("<readImage> Open file failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, , SEEK_SET);    // 第 10 字节的位置

     fread(&offset, , , fp);   // 元信息大小

     fseek(fp, , SEEK_SET);    // 第 18 字节位置

     fread(&col, , , fp);      // 读取列数和行数

     fread(&row, , , fp);

     printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);

     if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)

     {

         printf("<readImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, offset, SEEK_SET);// 元信息结束的地方，开始读图像数据

     fflush(NULL);

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)// .bmp 行是颠倒的，顺着读文件，倒着填充，mod 为列的 junk pading 厚度

     {

         for (j = ; j < col; j++)

         {

             fread(&temp, sizeof(unsigned char), , fp);

             outputImage[i * col + j] = (float)temp;

         }

         for (j = ; j < mod; fread(&temp, sizeof(unsigned char), , fp), j++);// 读取 junk padding，不传入数据中(.bmp 文件中有这几列，但是不显示)

     }

     fclose(fp);

     *outputRow = row;

     *outputCol = col;

     return outputImage;

 }

 int main()

 {

     int imageRow, imageCol, dataSize, row, col, i, j, correct;

     float *inputImage, *outputImage, sum;

     inputImage = readImage(inputFile, &imageRow, &imageCol);// 从文件读取图像数据和行列信息

     dataSize = imageRow * imageCol * sizeof(float);

     outputImage = (float*)malloc(dataSize);

     const int filterWidth = , filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / ;

     float filter[] =

     { ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,

     };

     cl_int status;

     cl_uint nPlatform;

     clGetPlatformIDs(, NULL, &nPlatform);

     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));

     clGetPlatformIDs(nPlatform, listPlatform, NULL);

     cl_uint nDevice = ;

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, , NULL, &nDevice);

     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);

     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);

     cl_command_queue queue = clCreateCommandQueue(context, listDevice[], , &status);

     cl_image_format format;                     // 图像格式描述符

     format.image_channel_order = CL_R;          // 单通道

     format.image_channel_data_type = CL_FLOAT;  // 浮点类型（读进来还是 uchar，但是计算需要浮点）

     cl_mem d_inputImage, d_outputImage, d_filter;

     if (true)// 旧 OpenCL 函数 clCreateImage2D

     {

         d_inputImage = clCreateImage2D(context, , &format, imageCol, imageRow, , NULL, &status);

         d_outputImage = clCreateImage2D(context, , &format, imageCol, imageRow, , NULL, &status);

     }

     if (false)// 新 OpenCL 使用描述符图像描述符 cl_image_desc 和函数 clCreateImage

     {

         cl_image_desc desc;

         desc.image_type = CL_MEM_OBJECT_IMAGE2D;

         desc.image_width = imageCol;

         desc.image_height = imageRow;

         desc.image_depth = ;

         desc.image_array_size = ;

         desc.image_row_pitch = ;

         desc.image_slice_pitch = ;

         desc.num_mip_levels = ;

         desc.num_samples = ;

         desc.buffer = NULL;

         d_inputImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, &status);

         d_outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &status);

     }

     d_filter = clCreateBuffer(context, , filterSize * sizeof(float), NULL, &status);

     size_t origin[] = { , ,  }, region[] = { imageCol, imageRow,  };  // 拷贝图像数据用的原点和尺寸，注意尺寸是先数列再数行

     clEnqueueWriteImage(queue, d_inputImage, CL_TRUE, origin, region, , , inputImage, , NULL, NULL);

     clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, , filterSize * sizeof(float), filter, , NULL, NULL);

     cl_sampler sampler = clCreateSampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);// 采样器

     char *code;

     size_t length = readText(sourceText, &code);

     cl_program program = clCreateProgramWithSource(context, , (const char **)&code, &length, NULL);

     clBuildProgram(program, , listDevice, NULL, NULL, NULL);

     cl_kernel kernel = clCreateKernel(program, "convolution01", &status);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_inputImage);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_outputImage);

     clSetKernelArg(kernel, , sizeof(int), &imageRow);

     clSetKernelArg(kernel, , sizeof(int), &imageCol);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_filter);

     clSetKernelArg(kernel, , sizeof(int), &filterWidth);

     clSetKernelArg(kernel, , sizeof(cl_sampler), &sampler);

     size_t globalSize[] = { imageRow, imageCol };// localSize = { 1, 1}; 可以用 NULL 代替

     status = clEnqueueNDRangeKernel(queue, kernel, , NULL, globalSize, NULL, , NULL, NULL);

     clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin, region, , , outputImage, , NULL, NULL);

     storeImage(outputImage, outputFile, inputFile);// 将输出图像写入文件中

     for (row = , correct = ; row < imageRow && correct; row++)// 检查计算结果

     {

         for (col = ; col < imageCol && correct; col++)

         {

             sum = ;

             for (i = -halfFilterWidth; i <= halfFilterWidth; i++)

             {

                 for (j = -halfFilterWidth; j <= halfFilterWidth; j++)

                 {

                     if (row + i >=  && row + i < imageRow && col + j >=  && col + j < imageCol)

                         sum += inputImage[(row + i) * imageCol + col + j] * filter[(i + halfFilterWidth) * filterWidth + j + halfFilterWidth];

                 }

             }

             if (row >= halfFilterWidth && row < imageRow - halfFilterWidth && col >= halfFilterWidth && col < imageCol - halfFilterWidth &&

                 !floatEq(outputImage[row * imageCol + col], sum))

             {

                 printf("Error at [%d,%d], output:%f, ref:%f\n", row, col, outputImage[row*imageCol + col], sum);

                 correct = ;

             }

         }

     }

     if (correct)

         printf("Result correct.\n");

     free(listPlatform);

     free(listDevice);

     free(inputImage);

     free(outputImage);

     free(code);

     clReleaseContext(context);

     clReleaseCommandQueue(queue);

     clReleaseProgram(program);

     clReleaseKernel(kernel);

     clReleaseMemObject(d_inputImage);

     clReleaseMemObject(d_outputImage);

     clReleaseMemObject(d_filter);

     clReleaseSampler(sampler);

     getchar();

     return ;

 }

● 输出结果，给了一张 400 × 400 的图片参与，另外，计算一张 5040 × 7000 的图片需要 23 ms。

<readImage> Input image R:/input.bmp, width = , height =

Output image R:/output.bmp

Result correct.

● 代码，使用局部内存优化

 // convolution.cl

 __kernel void convolution02(__global float* inputImage, __global float* outputImage, int  imageRow, int  imageCol,

     __constant float* filter, int filterWidth, __local float* localMem, int  localMemRow, int  localMemCol)

 {

     const int groupCol = get_group_id() * get_local_size(), groupRow = get_group_id() * get_local_size();

     const int localCol = get_local_id(), localRow = get_local_id();

     const int globalCol = groupCol + localCol, globalRow = groupRow + localRow;

     const int halfWidth = filterWidth / ;

     int i, j, curRow, curCol, filterIndex;

     float sum;

     // 将源图数据读入局部内存

     for (i = localRow; i < localMemRow; i += get_local_size())

     {

         curRow = groupRow + i;

         for (j = localCol; j < localMemCol; j += get_local_size())

         {

             curCol = groupCol + j;

             if (curRow < imageRow && curCol < imageCol)

                 localMem[i * localMemCol + j] = inputImage[curRow * imageCol + curCol];

         }

     }

     barrier(CLK_LOCAL_MEM_FENCE);     

     // 计算卷积

     if (globalRow < imageRow - filterWidth +  && globalCol < imageCol - filterWidth + )// 选取位于有效范围内的工作组

     {

         sum = 0.0f, filterIndex = ;

         for (i = localRow; i < localRow + filterWidth; i++)

         {

             for (j = localCol; j < localCol + filterWidth; j++)

                 sum += localMem[i * localMemCol + j] * filter[filterIndex++];

         }

         // 循环展开

         /*

         for (i = localRow; i < localRow + filterWidth; i++)

         {

             int offset = i * localMemCol + localCol;

             sum += localMem[offset++] * filter[filterIndex++];// 行数等于 filterWidth

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

             sum += localMem[offset++] * filter[filterIndex++];

         }

         */

         // 循环完全展开

         /*

         int offset = localRow*localMemCol + localCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         offset += localMemCol;

         sum += localMem[offset + 0] * filter[filterIndex++];

         sum += localMem[offset + 1] * filter[filterIndex++];

         sum += localMem[offset + 2] * filter[filterIndex++];

         sum += localMem[offset + 3] * filter[filterIndex++];

         sum += localMem[offset + 4] * filter[filterIndex++];

         sum += localMem[offset + 5] * filter[filterIndex++];

         sum += localMem[offset + 6] * filter[filterIndex++];

         */

         // 数据输出

         outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = sum;

     }

     return;

 }

 __kernel void convolution03(__global float4* inputImage, __global float* outputImage, int  imageRow, int imageCol,

     __constant float* filter, int filterWidth, __local float* localMem, int  localMemRow, int  localMemCol)

 {

     const int groupCol4 = get_group_id() * get_local_size() / , groupRow4 = get_group_id() * get_local_size();

     const int localId = get_local_id() * get_local_size() + get_local_id();

     int localCol = (localId % (localMemCol / )), localRow = (localId / (localMemCol / ));

     int globalCol = groupCol4 + localCol, globalRow = groupRow4 + localRow;

     const int halfWidth = filterWidth / ; 

     __local float4 *localImage4 = (__local float4*)&localMem[localRow*localMemCol + localCol * ];// 局部内存数据

     if (globalRow < imageRow && globalCol < imageCol /  && localRow < localMemRow)

         localImage4[] = inputImage[globalRow*imageCol /  + globalCol];

     barrier(CLK_LOCAL_MEM_FENCE);

     // 重设 坐标以输出

     localCol = get_local_id();

     localRow = get_local_id();

     globalCol = get_group_id()*get_local_size() + localCol;

     globalRow = get_group_id()*get_local_size() + localRow;

     // 计算卷积

     int i, j, filterIndex;

     float sum;

     if (globalRow < imageRow - filterWidth +  && globalCol < imageCol - filterWidth + )

     {

         sum = 0.0f, filterIndex = ;

         for (i = localRow; i < localRow + filterWidth; i++)

         {

             for (int j = localCol; j < localCol + filterWidth; j++)

                 sum += localMem[i * localMemCol + j] * filter[filterIndex++];

         }

         // 循环展开同 convolution02

         // 输出数据

         outputImage[(globalRow + halfWidth) * imageCol + (globalCol + halfWidth)] = ;//sum;

     }

     return;

 }

 // main.cpp

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #include <cl.h>

 #pragma warning(disable : 4996)

 #define NON_OPTIMIZED   // 不使用优化，使用函数 convolution02

 //#define READ_ALIGNED  // 使用内存对齐优化，使用函数 convolution02

 //#define READ4         // 局部内存使用 float4 读取优化，使用函数 convolution03，有点问题

 #define WGX 16          // 工作组尺寸

 #define WGY 16

 char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/convolution.cl";

 const char *inputFile = "R:/input.bmp";

 const char *outputFile = "R:/output.bmp";

 unsigned int roundUp(unsigned int value, unsigned int base)// 将 value 向上取整到 multiple 的整数倍

 {

     unsigned int remainder = value % base;

     return remainder ==  ? value : (value + base - remainder);

 }

 int readText(const char* kernelPath, char **pcode)

 {

     FILE *fp;

     int size;

     //printf("<readText> File: %s\n", kernelPath);

     fopen_s(&fp, kernelPath, "rb");

     if (!fp)

     {

         printf("<readText> Open file failed\n");

         getchar();

         exit(-);

     }

     if (fseek(fp, , SEEK_END) != )

     {

         printf("<readText> Seek end of file failed\n");

         getchar();

         exit(-);

     }

     if ((size = ftell(fp)) < )

     {

         printf("<readText> Get file position failed\n");

         getchar();

         exit(-);

     }

     rewind(fp);

     if ((*pcode = (char *)malloc(size + )) == NULL)

     {

         printf("<readText> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(*pcode, , size, fp);

     (*pcode)[size] = '\0';

     fclose(fp);

     return size + ;

 }

 void storeImage(float *imageOut, const char *filename, const char *refFilename)

 {

     FILE *ifp, *ofp;

     unsigned char *metaData, temp;

     int offset, i, j, row, col, mod;

     if (fopen_s(&ifp, refFilename, "rb") != )

     {

         printf(filename);

         exit(-);

     }

     fseek(ifp, , SEEK_SET);

     fread(&offset, , , ifp);

     fseek(ifp, , SEEK_SET);

     fread(&col, , , ifp);

     fread(&row, , , ifp);

     fseek(ifp, , SEEK_SET);

     if ((metaData = (unsigned char *)malloc(offset)) == NULL)

     {

         printf("<storeImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fread(metaData, , offset, ifp);

     if (fopen_s(&ofp, filename, "wb") != )

     {

         printf("<storeImage> Open output file failed\n");

         getchar();

         exit(-);

     }

     if (fwrite(metaData, , offset, ofp) != offset)

     {

         printf("<storeImage> Write output metaData failed\n");

         getchar();

         exit(-);

     }

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)

     {

         for (j = ; j < col; j++)

         {

             temp = (unsigned char)imageOut[i * col + j];

             fwrite(&temp, sizeof(unsigned char), , ofp);

         }

         for (j = ; j < mod; fwrite(&temp, sizeof(unsigned char), , ofp), j++);

     }

     fclose(ifp);

     fclose(ofp);

     free(metaData);

     return;

 }

 float *readImage(const char *filename, int *outputRow, int *outputCol)

 {

     unsigned char temp;

     int i, j, row, col, offset, mod;

     float *outputImage;

     FILE *fp;

     if (fopen_s(&fp, filename, "rb") != )

     {

         printf("<readImage> Open file failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, , SEEK_SET);

     fread(&offset, , , fp);

     fseek(fp, , SEEK_SET);

     fread(&col, , , fp);

     fread(&row, , , fp);

     printf("<readImage> Input image %s, col = %d, row = %d\n", filename, col, row);

     if ((outputImage = (float*)malloc(sizeof(float) * col * row)) == NULL)

     {

         printf("<readImage> Allocate space failed\n");

         getchar();

         exit(-);

     }

     fseek(fp, offset, SEEK_SET);

     fflush(NULL);

     for (i = row - , mod = (col %  ==  ?  :  - col % ); i >= ; i--)

     {

         for (j = ; j < col; j++)

         {

             fread(&temp, sizeof(unsigned char), , fp);

             outputImage[i * col + j] = (float)temp;

         }

         for (j = ; j < mod; fread(&temp, sizeof(unsigned char), , fp), j++);

     }

     fclose(fp);

     *outputRow = row;

     *outputCol = col;

     return outputImage;

 }

 int main()

 {

     int imageRow, imageCol, dataSize, deviceRow, deviceCol, deviceDataSize;

     float *inputImage, *outputImage;

     inputImage = readImage(inputFile, &imageRow, &imageCol);

     dataSize = imageRow * imageCol * sizeof(float);

     outputImage = (float*)malloc(dataSize);

     // 调整列数

 #ifdef NON_OPTIMIZED        // 不调整

     deviceCol = imageCol;

 #else                       // 增加道工作组尺寸的整数倍

     deviceCol = roundUp(imageCol, WGX);

 #endif

     deviceRow = imageRow;   // 行数不变

     deviceDataSize = sizeof(float) * deviceRow * deviceCol;

     const int filterWidth = , filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / ;

     float filter[] =

     {

         ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,.f / ,.f / ,.f / ,      ,      ,

         ,      ,      ,      ,      ,      ,      ,

         ,      ,      ,      ,      ,      ,

     };

     cl_int status;

     cl_uint nPlatform;

     clGetPlatformIDs(, NULL, &nPlatform);

     cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));

     clGetPlatformIDs(nPlatform, listPlatform, NULL);

     cl_uint nDevice = ;

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, , NULL, &nDevice);

     cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));

     clGetDeviceIDs(listPlatform[], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL);

     cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);

     cl_command_queue queue = clCreateCommandQueue(context, listDevice[], , &status);

     // 使用普通的缓冲区，而不用 image

     cl_mem d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL);

     cl_mem d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL);

     cl_mem d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * filterSize, NULL, NULL);

     // 缓冲区写入

 #if defined NON_OPTIMIZED   // 直接写入

     clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, , deviceDataSize, inputImage, , NULL, NULL);

 #else                       // 对齐写入

     size_t d_origin[] = { ,, }, h_origin[] = { ,, }, region[] = { sizeof(float) * deviceCol, deviceRow,  };

     clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, , sizeof(float) * imageCol, , inputImage, , NULL, NULL);

 #endif

     clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, , sizeof(float) * filterSize, filter, , NULL, NULL);

     char *code;

     size_t length = readText(sourceText, &code);

     cl_program program = clCreateProgramWithSource(context, , (const char **)&code, &length, &status);

     status = clBuildProgram(program, , listDevice, NULL, NULL, NULL);

     // 创建不同的内核

 #if defined NON_OPTIMIZED || defined READ_ALIGNED

     cl_kernel kernel = clCreateKernel(program, "convolution02", NULL);

 #else

     cl_kernel kernel = clCreateKernel(program, "convolution03", NULL);

 #endif

     size_t globalSize[] = { roundUp(imageCol - filterWidth + , WGX), roundUp(imageRow - filterWidth + , WGY) }, localSize[] = { WGX, WGY };

     // 局部内存大小

     int localRow, localCol;

     localRow = localSize[] + filterWidth - ;              // 把一个工作组的大小垫起光环元素的宽度

 #if defined NON_OPTIMIZED || defined READ_ALIGNED

     localCol = localSize[] + filterWidth - ;

 #else

     localCol = roundUp(localSize[] + filterWidth - , );  //  垫起之外还要对齐到 4 的倍数上

 #endif

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_inputImage);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_outputImage);

     clSetKernelArg(kernel, , sizeof(int), &deviceRow);

     clSetKernelArg(kernel, , sizeof(int), &deviceCol);

     clSetKernelArg(kernel, , sizeof(cl_mem), &d_filter);

     clSetKernelArg(kernel, , sizeof(int), &filterWidth);

     clSetKernelArg(kernel, , sizeof(float) * localCol * localRow, NULL);

     clSetKernelArg(kernel, , sizeof(int), &localRow);

     clSetKernelArg(kernel, , sizeof(int), &localCol);

     status = clEnqueueNDRangeKernel(queue, kernel, , NULL, globalSize, localSize, , NULL, NULL);

     // 结果写回

 #if defined NON_OPTIMIZED

     clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, , deviceDataSize, outputImage, , NULL, NULL);

 #else   // 最边上一圈 filterWidth / 2 的部分不写回

     d_origin[] =  * sizeof(float), d_origin[] = , d_origin[] = ;

     h_origin[] =  * sizeof(float), h_origin[] = , h_origin[] = ;

     region[] = (imageCol - filterWidth + ) * sizeof(float), region[] = (imageRow - filterWidth + ), region[] = ;

     clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, d_origin, h_origin, region, sizeof(float) * deviceCol, , sizeof(float) * imageCol, , outputImage, , NULL, NULL);

 #endif

     storeImage(outputImage, outputFile, inputFile);

     // 去掉了检查结果的部分

     free(listPlatform);

     free(listDevice);

     free(inputImage);

     free(outputImage);

     free(code);

     clReleaseContext(context);

     clReleaseCommandQueue(queue);

     clReleaseProgram(program);

     clReleaseKernel(kernel);

     clReleaseMemObject(d_inputImage);

     clReleaseMemObject(d_outputImage);

     clReleaseMemObject(d_filter);

     printf("Finshed.\n");

     getchar();

     return ;

 }

● 输出结果，与上面的简单方法相同

● 用到的函数和定义

 //cl.h

 // 采样器越界处理方案

 #define CL_ADDRESS_NONE                             0x1130

 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131

 #define CL_ADDRESS_CLAMP                            0x1132

 #define CL_ADDRESS_REPEAT                           0x1133

 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134

 // 插值方案

 #define CL_FILTER_NEAREST                           0x1140

 #define CL_FILTER_LINEAR                            0x1141

 // 用到的采样器和描述符的定义

 typedef struct _cl_sampler* cl_sampler;

 typedef struct _cl_image_format

 {

     cl_channel_order        image_channel_order;

     cl_channel_type         image_channel_data_type;

 } cl_image_format;

 typedef struct _cl_image_desc

 {

     cl_mem_object_type      image_type;

     size_t                  image_width;

     size_t                  image_height;

     size_t                  image_depth;

     size_t                  image_array_size;

     size_t                  image_row_pitch;

     size_t                  image_slice_pitch;

     cl_uint                 num_mip_levels;

     cl_uint                 num_samples;

     cl_mem                  buffer;

 } cl_image_desc;

 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage2D(// OpenCL1.2 中废弃的 image 创建函数

     cl_context,             // 上下文

     cl_mem_flags,           // 特殊标志

     const cl_image_format *,// image 描述符

     size_t,                 // 宽度

     size_t,                 // 高度

     size_t,                 // 行跨步

     void *,                 // 自动传入主机数据

     cl_int *                // 返回结果状态的指针

 ) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(// OpenCL1.2 中开始使用的 image 创建函数

     cl_context,             // 上下文

     cl_mem_flags,           // 特殊标志

     const cl_image_format *,// image 格式描述符

     const cl_image_desc *,  // image 描述符

     void *,                 // 主机数据

     cl_int *                // 返回结果状态的指针

 ) CL_API_SUFFIX__VERSION_1_2;

 extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSampler(// 初始化采样器

     cl_context,         // 上下文

     cl_bool,            // 是否使用归一化坐标

     cl_addressing_mode, // 越界处理方案

     cl_filter_mode,     // 差值方案

     cl_int *            // 返回结果状态的指针

 ) CL_API_SUFFIX__VERSION_1_0;

 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(// 矩形缓冲区写入

     cl_command_queue,   // 命令队列

     cl_mem,             // 目标缓冲区

     cl_bool,            // 阻塞标记

     const size_t *,     // 缓冲区写入起点

     const size_t *,     // 源数据写入起点

     const size_t *,     // 写入范围，是一个三维数组，分别为：一行数据量（Byte），行数，层数

     size_t,             // 缓冲区行间跨度

     size_t,             // 缓冲区层间跨度

     size_t,             // 源数据行间跨度

     size_t,             // 源数据层间跨度

     const void *,       // 源数据指针

     cl_uint,            // 时间列表长度

     const cl_event *,   // 时间列表

     cl_event *          // 本事件标记

 ) CL_API_SUFFIX__VERSION_1_1;

 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(// 矩形缓冲区读出，参数定义同上

     cl_command_queue,

     cl_mem,

     cl_bool,

     const size_t *,

     const size_t *,

     const size_t *,

     size_t,

     size_t,

     size_t,

     size_t,

     void *,

     cl_uint,

     const cl_event *,

     cl_event *

 ) CL_API_SUFFIX__VERSION_1_1;

OpenCL 图像卷积 1的更多相关文章

OpenCL 图像卷积 2
▶ 上一篇图像卷积 http://www.cnblogs.com/cuancuancuanhao/p/8535569.html.这篇使用了 OpenCV 从文件读取彩色的 jpeg 图像,进行边缘检测 ...
OpenCL 图像卷积 3 使用 CPU
▶ CPU 图像卷积,共四种方法.分别为基本串行,使用模板,使用局部内存,使用AVX指令优化 ● 全部的代码,仅在主函数中选择调用的函数名即可. #include <stdio.h> #i ...
图像卷积、相关以及在MATLAB中的操作
图像卷积.相关以及在MATLAB中的操作 2016年7月11日 20:34:35, By ChrisZZ 区分卷积和相关图像处理中常常需要用一个滤波器做空间滤波操作.空间滤波操作有时候也被叫做卷积滤 ...
SSE图像算法优化系列十一：使用FFT变换实现图像卷积。
本文重点主要不在于FFT的SSE优化,而在于使用FFT实现快速卷积的相关技巧和过程. 关于FFT变换,有很多参考的代码,特别是对于长度为2的整数次幂的序列,实现起来也是非常简易的,而对于非2次幂的序列 ...
zz图像卷积与滤波的一些知识点
Xinwei: 写的通俗易懂,终于让我这个不搞CV.不搞图像的外行理解卷积和滤波了. 图像卷积与滤波的一些知识点 zouxy09@qq.com http://blog.csdn.net/zouxy09 ...
对抗生成网络-图像卷积-mnist数据生成(代码) 1.tf.layers.conv2d(卷积操作) 2.tf.layers.conv2d_transpose(反卷积操作) 3.tf.layers.batch_normalize(归一化操作) 4.tf.maximum(用于lrelu) 5.tf.train_variable(训练中所有参数) 6.np.random.uniform(生成正态数据
1. tf.layers.conv2d(input, filter, kernel_size, stride, padding) # 进行卷积操作参数说明:input输入数据, filter特征图的 ...
UFLDL教程笔记及练习答案五（自编码线性解码器与处理大型图像**卷积与池化）
自己主动编码线性解码器自己主动编码线性解码器主要是考虑到稀疏自己主动编码器最后一层输出假设用sigmoid函数.因为稀疏自己主动编码器学习是的输出等于输入.simoid函数的值域在[0,1]之间,这 ...
TensorFlow实现图像卷积并可视化示例
图片尺寸要自己修改. 看起来好像没啥意思,不知道下一步能干什么,先卷了再说.由于weights是随机生成的(tf.random_normal作用:用于从服从指定正太分布的数值中取出随机数),所以每次卷 ...
opencv：图像卷积
卷积基本概念 C++代码实现卷积 #include <opencv2/opencv.hpp> #include <iostream> using namespace cv; u ...

随机推荐

ObservableCollection<T> 的同类 ListCollectionView
1:ListCollectionView : CollectionView : INotifyCollectionChanged, INotifyPropertyChanged 2:Observab ...
linux三尖刀
序我们都知道,一个可执行程序的基本的生命过程是如此的: (编辑)源文件--->(编译)目标文件--->(链接)可执行文件--->(调试排错)稳定执行所以,在这个过程中,我们很容易 ...
UVA-10806 Dijkstra, Dijkstra. （最小费用流，网络流建模）
题目大意:给一张带权简单图,找出一条经过起点s和终点t的最小回路. 题目分析:建立网络,以s为源点,t为汇点,另每条边的容量为1,单位费用为边权值.求最小费用流,增广两次后的最小费用便是答案. 代码如 ...
oracle用户密码永不过期
ALTER PROFILE DEFAULT LIMIT PASSWORD_LIFE_TIME UNLIMITED;
Nginx安装配置详解
http://nginx.org/download/ 下载对应的Nginx 安装nginx之前需要安装依赖包 yum install gcc gcc-c++ zlib-devel pcre-devel ...
hadoop安装单机
java环境安装 http://www.cnblogs.com/zeze/p/5902124.html java 环境安装配置 etc/profile: export JAVA_HOME=/usr/j ...
iOS-----使用AddressBook添加联系人
使用AddressBook添加联系人添加联系人的步骤如下: 1 创建ABAddressBookRef,这就得到了对地址簿的引用. 2 调用ABPersonCreate()函数创建一个空的ABReco ...
2018-2019-1 20165212 《信息安全系统设计基础》第八周学习总结（pwd）
2018-2019-1 20165212 <信息安全系统设计基础>第八周学习总结一.知识点总结 1.三种并发方式构造并发程序的方法有三种: 进程线程 I/O多路复用进程:用内核来调 ...
Linux环境编程之同步(四)：Posix信号量
信号量是一种用于提供不同进程间或一个给定进程的不同线程间同步手段的原语.有三种类型:Posix有名信号量,使用Posix IPC名字标识.Posix基于内存的信号量,存放在共享内存区中:System ...
mysql having,group by查询去除重复记录
http://m.jb51.net/article/39302.htm 可以这样去理解group by和聚合函数 http://www.cnblogs.com/wuguanglei/p/4229938 ...

OpenCL 图像卷积 1

OpenCL 图像卷积 1的更多相关文章

随机推荐

热门专题