0_Simple__simpleTexture + 0_Simple__simpleTextureDrv

使用纹理引用来旋转图片，并分别给出了静态编译（Runtime API）和运行时编译（Driver API，加载 PTX）两种实现。

▶ 源代码：静态编译

 #include <stdio.h>

 #include <windows.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_functions.h>

 #include <helper_cuda.h>

 // Tolerance handed to compareData() when the result is checked against the reference image.
 #define MAX_EPSILON_ERROR 5e-3f

 // Rotation angle passed to transformKernel as `theta` (fed to sinf/cosf, i.e. radians).
 const float angle = 0.5f;

 // 2D texture reference sampled by transformKernel through tex2D().
 // The dimensionality argument must be 2 to match the two-coordinate fetch.
 texture<float, 2, cudaReadModeElementType> tex;

 // Rotates the image bound to the texture reference `tex` by `theta` radians
 // about its centre and writes the resampled pixels to `outputData`.
 //
 // outputData : device buffer of at least width * height floats, row-major.
 // width      : image width in pixels.
 // height     : image height in pixels.
 // theta      : rotation angle in radians.
 //
 // Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
 // that land outside the image return immediately, so the grid may overshoot.
 __global__ void transformKernel(float *outputData, int width, int height, float theta)
 {
     const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

     // Guard against grids that do not divide the image evenly.
     if (x >= (unsigned int)width || y >= (unsigned int)height)
         return;

     // Map the pixel to [0, 1) coordinates and centre them on the origin.
     const float u = x / (float)width - 0.5f;
     const float v = y / (float)height - 0.5f;

     // Evaluate each trigonometric term once instead of twice.
     const float c = cosf(theta);
     const float s = sinf(theta);

     // Rotate, then shift back by 0.5 before sampling the texture.
     outputData[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
 }

 // Loads the input and reference images, rotates the input on the GPU through
 // the texture reference `tex`, times one kernel launch, saves the result and
 // compares it with the reference image.
 //
 // Returns 0 after a completed run (the comparison result is only printed) and
 // 1 when an image cannot be loaded or the two images differ in size.
 // CUDA runtime failures are reported through checkCudaErrors().
 int main()
 {
     printf("\n\tStart.\n");

     // Load the images. The sdkFindFilePath() lookup of the original sample was
     // removed, so the paths are hard-coded.
     float *h_data = NULL, *h_dataRef = NULL;
     unsigned int width = 0, height = 0, refWidth = 0, refHeight = 0;
     if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height))
     {
         printf("\n\tFailed to load the input image.\n");
         return 1;
     }
     const size_t size = (size_t)width * height * sizeof(float);
     // Keep the reference dimensions separate so a mismatching reference image
     // cannot silently change the size used for the launch and the comparison.
     if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight)
         || refWidth != width || refHeight != height)
     {
         printf("\n\tFailed to load the reference image, or its size differs from the input.\n");
         free(h_data);
         free(h_dataRef);
         return 1;
     }
     printf("\n\tLoad input files, %u x %u pixels\n", width, height);

     // Allocate the output buffer and the CUDA array that backs the texture.
     float *d_data = NULL;
     checkCudaErrors(cudaMalloc((void **)&d_data, size));
     cudaArray *cuArray = NULL;
     // A single 32-bit floating-point channel.
     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
     checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
     // Unlike simpleSurfaceWrite, the host image is copied straight into the array.
     checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice));

     // Configure and bind the texture reference.
     tex.addressMode[0] = cudaAddressModeWrap;
     tex.addressMode[1] = cudaAddressModeWrap;
     tex.filterMode = cudaFilterModeLinear;
     tex.normalized = true;
     checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

     // The grid size uses truncating division, so only whole 8x8 tiles are
     // processed; the image dimensions are expected to be multiples of 8.
     dim3 dimBlock(8, 8, 1);
     dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

     // Warm-up launch.
     transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaDeviceSynchronize());

     // Timed launch.
     StopWatchInterface *timer = NULL;
     sdkCreateTimer(&timer);
     sdkStartTimer(&timer);
     transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaDeviceSynchronize());
     sdkStopTimer(&timer);
     const float elapsedMs = sdkGetTimerValue(&timer);
     printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
     sdkDeleteTimer(&timer);

     // Copy the result back, save it and check it against the reference.
     checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
     sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
     printf("\n\tSave output file.\n");
     printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.0f) ? "Passed" : "Failed");

     // Release device and host resources.
     checkCudaErrors(cudaUnbindTexture(tex));
     checkCudaErrors(cudaFree(d_data));
     checkCudaErrors(cudaFreeArray(cuArray));
     free(h_data);
     free(h_dataRef);

     getchar();   // keep the console window open until a key is pressed
     return 0;
 }

▶ 输出结果

    Start.

    Load input files, 512 x 512 pixels

    Cost time: 0.362788 ms, 722.58 Mpixels/sec

    Save output file.

    Finish, return Passed.

▶ 源代码：运行时编译

 // simpleTexture_kernel.cu

 #ifndef _SIMPLETEXTURE_KERNEL_H_

 #define _SIMPLETEXTURE_KERNEL_H_

 // 2D texture reference sampled by transformKernel through tex2D(); the host
 // looks it up by the name "tex". The dimensionality argument must be 2 to
 // match the two-coordinate fetch.
 texture<float, 2, cudaReadModeElementType> tex;

 // Rotates the image bound to the texture reference `tex` by `theta` radians
 // about its centre and writes the resampled pixels to `g_odata`.
 // Declared extern "C" so the host can find it by its unmangled name.
 //
 // g_odata : device buffer of at least width * height floats, row-major.
 // width   : image width in pixels.
 // height  : image height in pixels.
 // theta   : rotation angle in radians.
 //
 // Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
 // that land outside the image return immediately, so the grid may overshoot.
 extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta)
 {
     const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

     // Guard against grids that do not divide the image evenly.
     if (x >= (unsigned int)width || y >= (unsigned int)height)
         return;

     // Map the pixel to [0, 1) coordinates and centre them on the origin.
     const float u = x / (float)width - 0.5f;
     const float v = y / (float)height - 0.5f;

     // Evaluate each trigonometric term once instead of twice.
     const float c = cosf(theta);
     const float s = sinf(theta);

     // Rotate, then shift back by 0.5 before sampling the texture.
     g_odata[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
 }

 #endif

 // simpleTextureDrv.cpp

 #include <stdio.h>

 #include <iostream>

 #include <helper_functions.h>

 #include <cuda.h>

 // Tolerance handed to compareData() when the output is checked against the reference image.
 #define MAX_EPSILON_ERROR 5e-3f

 // Directory prefix (with trailing separator) prepended to the PTX and image file names.
 #define PATH "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTextureDrv\\data\\"

 using namespace std;

 // Rotation angle passed to the kernel as its `theta` argument.
 float angle = 0.5f;

 // Module handle: written by initCUDA(), read by main() to look up the texture reference.
 CUmodule cuModule;

 // Context handle: created in initCUDA(), destroyed at the end of main().
 CUcontext cuContext;

 CUfunction initCUDA()

 {

     CUfunction cuFunction = ;

     string module_path, ptx_source;

     cuInit();                      // 初始化设备，类似于 runtime 中的函数 cudaSetDevice()

     cuCtxCreate(&cuContext, , );  // 创建上下文，后两个参数分别是标志参数和设备号

     // 读取 .ptx 文件

     module_path = PATH"simpleTexture_kernel64.ptx";

     FILE *fp = fopen(module_path.c_str(), "rb");

     fseek(fp, , SEEK_END);

     int file_size = ftell(fp);

     char *buf = new char[file_size + ];

     fseek(fp, , SEEK_SET);

     fread(buf, sizeof(char), file_size, fp);

     fclose(fp);

     buf[file_size] = '\0';

     ptx_source = buf;

     delete[] buf;

     if (module_path.rfind("ptx") != string::npos)// 使用的是.ptx，需要运行时编译

     {

         // 设定编译参数，CUjit_option 放置参数名，jitOptVals 放置参数值

         const unsigned int jitNumOptions = ;

         CUjit_option *jitOptions = new CUjit_option[jitNumOptions];

         void **jitOptVals = new void *[jitNumOptions];

         // 编译日志长度

         jitOptions[] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;

         int jitLogBufferSize = ;

         jitOptVals[] = (void *)(size_t)jitLogBufferSize;

         // 编译日志内容

         jitOptions[] = CU_JIT_INFO_LOG_BUFFER;

         char *jitLogBuffer = new char[jitLogBufferSize];

         jitOptVals[] = jitLogBuffer;

         // 设定一个内核使用的寄存器数量

         jitOptions[] = CU_JIT_MAX_REGISTERS;

         int jitRegCount = ;

         jitOptVals[] = (void *)(size_t)jitRegCount;

         // 编译模块

         cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);

         //printf("\n\tPTX JIT log:\n%s\n", jitLogBuffer);// 输出编译日志

     }

     else// 使用的是 .cubin，不用编译（本例中不经过这个分支）

         cuModuleLoad(&cuModule, module_path.c_str());

     // 取出编译好的模块中的函数

     cuModuleGetFunction(&cuFunction, cuModule, "transformKernel");

     return cuFunction;// 删掉了错误检查，如果中间某一步出错，则应该先销毁上下文再退出

 }

 int main()

 {

     printf("\n\tStart.\n"); 

     // 初始化设备，编译 PTX

     CUfunction transform = initCUDA();

     // 读取图片数据

     float *h_data = NULL, *h_dataRef = NULL;

     unsigned int width, height, size;

     sdkLoadPGM(PATH"lena_bw.pgm", &h_data, &width, &height);// 删掉了用函数 sdkFindFilePath() 查找输入文件的过程

     size = width * height * sizeof(float);

     sdkLoadPGM(PATH"ref_rotated.pgm", &h_dataRef, &width, &height);

     printf("\n\tLoad input files, %d x %d pixels\n", width, height);

     // 申请设备内存

     CUdeviceptr d_data = (CUdeviceptr)NULL;

     cuMemAlloc(&d_data, size);

     CUarray cu_array;

     CUDA_ARRAY_DESCRIPTOR desc;

     desc.Format = CU_AD_FORMAT_FLOAT;

     desc.NumChannels = ;

     desc.Width = width;

     desc.Height = height;

     cuArrayCreate(&cu_array, &desc);

     CUDA_MEMCPY2D copyParam;

     memset(&copyParam, , sizeof(copyParam));

     copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;

     copyParam.dstArray = cu_array;

     copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;

     copyParam.srcHost = h_data;

     copyParam.srcPitch = width * sizeof(float);

     copyParam.WidthInBytes = copyParam.srcPitch;

     copyParam.Height = height;

     cuMemcpy2D(&copyParam);

     // 绑定纹理引用

     CUtexref cu_texref;

     cuModuleGetTexRef(&cu_texref, cuModule, "tex");

     cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT);

     cuTexRefSetAddressMode(cu_texref, , CU_TR_ADDRESS_MODE_WRAP);

     cuTexRefSetAddressMode(cu_texref, , CU_TR_ADDRESS_MODE_WRAP);

     cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_LINEAR);

     cuTexRefSetFlags(cu_texref, CU_TRSF_NORMALIZED_COORDINATES);

     cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_FLOAT, );

     cuParamSetTexRef(transform, CU_PARAM_TR_DEFAULT, cu_texref);

     int block_size = ;

     StopWatchInterface *timer = NULL;

     // 两种调用 Driver API 的方式

     if ()

     {

         void *args[] = {&d_data, &width, &height, &angle};

         // 预跑

         cuLaunchKernel(transform, (width / block_size), (height / block_size), , block_size, block_size, , , NULL, args, NULL);

         cuCtxSynchronize();

         // 再跑一次测试性能

         sdkCreateTimer(&timer);

         sdkStartTimer(&timer);

         cuLaunchKernel(transform, (width / block_size), (height / block_size), , block_size, block_size, , , NULL, args, NULL);

     }

     else

     {

         int offset = ;

         char argBuffer[];

         // 在一个 CUdeviceptr（unsigned long long）长度的空间里写入调用参数

         *((CUdeviceptr *)&argBuffer[offset]) = d_data;

         offset += sizeof(d_data);

         *((unsigned int *)&argBuffer[offset]) = width;

         offset += sizeof(width);

         *((unsigned int *)&argBuffer[offset]) = height;

         offset += sizeof(height);

         *((float *)&argBuffer[offset]) = angle;

         offset += sizeof(angle);

         void *kernel_launch_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END };

         // 预跑

         cuLaunchKernel(transform, (width / block_size), (height / block_size), ,block_size, block_size, ,,NULL, NULL, (void **)&kernel_launch_config);

         cuCtxSynchronize();

         // 再跑一次测试性能

         sdkCreateTimer(&timer);

         sdkStartTimer(&timer);

         cuLaunchKernel(transform, (width / block_size), (height / block_size), ,block_size, block_size, ,, ,NULL, (void **)&kernel_launch_config);

     }

     cuCtxSynchronize();

     sdkStopTimer(&timer);

     printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", sdkGetTimerValue(&timer), (width *height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);

     sdkDeleteTimer(&timer);

     // 结果回收、输出和检验

     cuMemcpyDtoH(h_data, d_data, size);

     sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);

     printf("\n\tSave output file.\n");

     printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.15f) ? "Passed" : "Failed");

     cuMemFree(d_data);

     cuArrayDestroy(cu_array);

     cuCtxDestroy(cuContext);

     getchar();

     return ;

 }

▶ 输出结果：

    Start.

    Load input files, 512 x 512 pixels

    Cost time: 0.355230 ms, 737.96 Mpixels/sec

    Save output file.

    Finish, return Passed.

▶ 涨姿势

● 一般，与 0_Simple__simpleSurfaceWrite 类似。

0_Simple__simpleTexture + 0_Simple__simpleTextureDrv的更多相关文章

随机推荐

UVA 10815：Andy's First Dictionary（STL）
题意:给出一段英文,里面包含一些单词,空格和标点,单词不区分大小写,默认都为小写.按照字典序输出这些单词(这些单词不能有重复,字母全部变成小写) stringstream:包含在头文件#include ...
Android和Linux下设备节点的创建笔记
1. Linux kernel创建的/dev/下的设备节点是不对的, 其实是kernel仅负责在/sys/(基于内存的虚拟文件系统)创建一大堆下目录和文件,而真正的设备节点是在用户空间程序创建的,应该 ...
SQL Support and Workarounds
此文章来自官方文档,说明了,对于不支持pg 标准的sql 查询的变通方法,实际使用的时候有很大的指导意义 As Citus provides distributed functionality by ...
yugabyte 集成JanusGraph测试
yugabyte 集成图数据库JanusGraph,原理比较简单就是yugabyte 内置Cassandra,配置好JanusGraph 的访问就可以了. 使用docker 模式部署创建yugaby ...
vue 知识点
Vue 中的 slot: 概念:槽/slot是组件在模板中为调用者预留的位置,使用<slot>元素声明一个槽.在最终的视图中,调用者模板中被调用组件的内容,将填充<slot> ...
php单例模式实现对象只被创建一次 mysql单例操作类
这是我在php面试题中遇到的一道试题,单例模式按字面来看就是某一个类只有一个实例,这样做的好处还是很大的,比如说数据库的连接,我们只需要实例化一次,不需要每次都去new了,这样极大的降低了资源的耗费. ...
PHP版本VC6与VC9/VC11/VC14、Thread Safe与None-Thread Safe等的区别
最近正好在弄一个PHP的程序,在这之前一直没有怎么以接触,发现对PHP版本知识了解不是很清楚,自己看了不少类似的文章,还是感觉不够明确和全面, 网上的结论又都是模棱两可,在此,给出最完整甚至武断的解释 ...
关闭IE 对剪切板访问的提示
在internet 选项-“安全”选项卡-自定义级别. 在“脚本”下面找到“允许对剪切板进行编程访问”,选择“启用”即可. -END
FPGA配置方式
FPGA有多种配置/加载方式.粗略可以分为主动和被动两种.主动加载是指由FPGA控制配置流程,被动加载是指FPGA仅仅被动接收配置数据. 最常见的被动配置模式就是JTAG下载bit文件.此模式下,主动 ...
volatile关键字的作用、原理
在只有双重检查锁,没有volatile的懒加载单例模式中,由于指令重排序的问题,我确实不会拿到两个不同的单例了,但我会拿到"半个"单例. 而发挥神奇作用的volatile,可以当之 ...

0_Simple__simpleTexture + 0_Simple__simpleTextureDrv

0_Simple__simpleTexture + 0_Simple__simpleTextureDrv的更多相关文章

随机推荐

热门专题