使用纹理引用来旋转图片,并分别在静态编译和运行时编译两种环境下实现。

▶ 源代码:静态编译

 #include <stdio.h>
#include <windows.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

// Tolerance handed to compareData() when the result is checked against the reference image.
#define MAX_EPSILON_ERROR 5e-3f

// Rotation angle in radians (it is fed straight to cosf()/sinf() in the kernel).
const float angle = 0.5f;

// 2D float texture reference the kernel samples from; main() binds it to a CUDA array.
// NOTE: texture references are a legacy API; this listing targets the CUDA 9.0 samples.
texture<float, 2, cudaReadModeElementType> tex;

/**
 * Fills outputData with the image bound to `tex`, sampled at coordinates rotated by `theta`.
 *
 * Each thread handles one output pixel (x, y). The pixel position is normalized to [0, 1),
 * shifted so the image center is the origin, rotated by theta, shifted back and used as the
 * texture coordinate, so `tex` must be configured for normalized coordinates.
 *
 * Launch layout: 2D grid of 2D blocks covering at least width x height threads; threads that
 * fall outside the image return without writing.
 *
 * @param outputData device buffer of width * height floats, row-major
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param theta      rotation angle in radians
 */
__global__ void transformKernel(float *outputData, int width, int height, float theta)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up, so edge blocks may contain threads beyond the image.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    const float u = x / (float)width - 0.5f;
    const float v = y / (float)height - 0.5f;
    const float cosTheta = cosf(theta);
    const float sinTheta = sinf(theta);

    outputData[y * width + x] = tex2D(tex, u * cosTheta - v * sinTheta + 0.5f, v * cosTheta + u * sinTheta + 0.5f);
}

/**
 * Loads a PGM image, rotates it on the GPU through a texture reference, times one kernel
 * launch, saves the result and compares it with a reference image.
 *
 * @return 0 on completion, EXIT_FAILURE if an input image cannot be loaded. CUDA API
 *         failures are reported and terminate the process through checkCudaErrors().
 */
int main()
{
    printf("\n\tStart.\n");

    // Load the input and reference images (the sdkFindFilePath() lookup from the original
    // sample was removed; paths are hard-coded).
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width = 0, height = 0;
    unsigned int refWidth = 0, refHeight = 0;
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height))
    {
        fprintf(stderr, "Failed to load the input image.\n");
        return EXIT_FAILURE;
    }
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
    {
        fprintf(stderr, "Failed to load the reference image.\n");
        free(h_data);
        return EXIT_FAILURE;
    }
    if (refWidth != width || refHeight != height)
    {
        // compareData() below walks width * height elements of both buffers.
        fprintf(stderr, "Input and reference images differ in size.\n");
        free(h_data);
        free(h_dataRef);
        return EXIT_FAILURE;
    }
    const size_t size = (size_t)width * height * sizeof(float);
    printf("\n\tLoad input files, %d x %d pixels\n", width, height);

    // Allocate the output buffer and the CUDA array that backs the texture.
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));
    cudaArray *cuArray = NULL;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // Unlike simpleSurfaceWrite, the host image is copied straight into the CUDA array.
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice));

    // Configure and bind the texture reference: wrap addressing on both axes, linear
    // filtering, normalized coordinates.
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;
    checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

    // 8 x 8 thread blocks; the grid is rounded up so images whose size is not a multiple
    // of the block size are still fully covered (the kernel bounds-checks).
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y, 1);

    // Warm-up launch, excluded from the timing.
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // Timed launch.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    const float elapsedMs = sdkGetTimerValue(&timer);
    printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Copy the result back (reusing h_data), save it and compare with the reference.
    checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    if (sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height))
        printf("\n\tSave output file.\n");
    else
        fprintf(stderr, "Failed to save the output image.\n");
    printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.0f) ? "Passed" : "Failed");

    // Release device and host resources.
    checkCudaErrors(cudaUnbindTexture(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(h_data);
    free(h_dataRef);

    getchar();
    return 0;
}

▶ 输出结果

    Start.

    Load input files, 512 x 512 pixels

    Cost time: 0.362788 ms, 722.58 Mpixels/sec

    Save output file.

    Finish, return Passed.

▶ 源代码:运行时编译

 // simpleTexture_kernel.cu
#ifndef _SIMPLETEXTURE_KERNEL_H_
#define _SIMPLETEXTURE_KERNEL_H_

// 2D float texture reference sampled by the kernel; the host program looks it up in the
// loaded module by the name "tex".
texture<float, 2, cudaReadModeElementType> tex;

/**
 * Fills g_odata with the image bound to `tex`, sampled at coordinates rotated by `theta`.
 *
 * Each thread handles one output pixel (x, y). The pixel position is normalized to [0, 1),
 * shifted so the image center is the origin, rotated by theta, shifted back and used as the
 * texture coordinate, so `tex` must be configured for normalized coordinates.
 *
 * Declared extern "C" so the symbol keeps the plain name "transformKernel" for lookup from
 * the host side.
 *
 * Launch layout: 2D grid of 2D blocks; threads outside width x height return without writing.
 *
 * @param g_odata device buffer of width * height floats, row-major
 * @param width   image width in pixels
 * @param height  image height in pixels
 * @param theta   rotation angle in radians
 */
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads that fall outside the image when the grid over-covers it.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    const float u = x / (float)width - 0.5f;
    const float v = y / (float)height - 0.5f;
    const float cosTheta = cosf(theta);
    const float sinTheta = sinf(theta);

    g_odata[y * width + x] = tex2D(tex, u * cosTheta - v * sinTheta + 0.5f, v * cosTheta + u * sinTheta + 0.5f);
}

#endif
 // simpleTextureDrv.cpp
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
#include <helper_functions.h>
#include <cuda.h>

// Tolerance handed to compareData() when the result is checked against the reference image.
#define MAX_EPSILON_ERROR 5e-3f
// Directory holding the PTX module and the input/reference images.
#define PATH "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTextureDrv\\data\\"

using namespace std;

// Rotation angle in radians, passed to the kernel as `theta`.
float angle = 0.5f;
// Module loaded by initCUDA(); main() also uses it to look up the texture reference.
CUmodule cuModule;
// Driver context created by initCUDA() and destroyed at the end of main().
CUcontext cuContext;

// Prints `message`, destroys the context if one exists and terminates the process.
static void fail(const char *message)
{
    fprintf(stderr, "%s\n", message);
    if (cuContext != NULL)
        cuCtxDestroy(cuContext);
    exit(EXIT_FAILURE);
}

// Reports a failed driver API call (with its source location) and terminates via the same
// clean-up path as fail(); does nothing when `status` is CUDA_SUCCESS.
static void checkCu(CUresult status, const char *expr, const char *file, int line)
{
    if (status == CUDA_SUCCESS)
        return;
    const char *message = NULL;
    cuGetErrorString(status, &message);
    fprintf(stderr, "CUDA driver error at %s:%d: %s failed: %s\n", file, line, expr, message ? message : "unknown error");
    if (cuContext != NULL)
        cuCtxDestroy(cuContext);
    exit(EXIT_FAILURE);
}
#define CU_CHECK(call) checkCu((call), #call, __FILE__, __LINE__)

// Launches `kernel` on a gridX x gridY grid of blockSize x blockSize blocks in the default
// stream with no dynamic shared memory. Exactly one of kernelParams / extra is non-NULL.
static void launchTransform(CUfunction kernel, unsigned int gridX, unsigned int gridY, unsigned int blockSize, void **kernelParams, void **extra)
{
    CU_CHECK(cuLaunchKernel(kernel, gridX, gridY, 1, blockSize, blockSize, 1, 0, NULL, kernelParams, extra));
}

/**
 * Initializes the driver API, creates a context on device 0, loads the kernel module and
 * returns a handle to "transformKernel".
 *
 * A module path containing "ptx" is read into memory and JIT-compiled with
 * cuModuleLoadDataEx(); any other path is loaded directly with cuModuleLoad().
 * Sets the globals cuContext and cuModule. On any failure the context is destroyed and the
 * process exits.
 *
 * @return the kernel function handle
 */
CUfunction initCUDA()
{
    CUfunction cuFunction = 0;
    CUdevice cuDevice = 0;

    CU_CHECK(cuInit(0));                            // initialize the driver API
    CU_CHECK(cuDeviceGet(&cuDevice, 0));            // device ordinal 0
    CU_CHECK(cuCtxCreate(&cuContext, 0, cuDevice)); // arguments: context, flags, device

    const string module_path = PATH "simpleTexture_kernel64.ptx";

    if (module_path.rfind("ptx") != string::npos) // .ptx input: needs run-time compilation
    {
        // Read the whole PTX file into a string.
        FILE *fp = fopen(module_path.c_str(), "rb");
        if (fp == NULL)
            fail("Failed to open the PTX file.");
        fseek(fp, 0, SEEK_END);
        const long file_size = ftell(fp);
        fseek(fp, 0, SEEK_SET);
        if (file_size <= 0)
        {
            fclose(fp);
            fail("The PTX file is empty or its size could not be determined.");
        }
        string ptx_source((size_t)file_size, '\0');
        const size_t bytes_read = fread(&ptx_source[0], sizeof(char), (size_t)file_size, fp);
        fclose(fp);
        if (bytes_read != (size_t)file_size)
            fail("Failed to read the PTX file.");

        // JIT options: jitOptions holds the option names, jitOptVals the matching values.
        const unsigned int jitNumOptions = 3;
        CUjit_option jitOptions[jitNumOptions];
        void *jitOptVals[jitNumOptions];

        // Size of the compilation log buffer.
        const int jitLogBufferSize = 1024;
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // Buffer receiving the compilation log.
        char jitLogBuffer[jitLogBufferSize];
        jitLogBuffer[0] = '\0';
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        jitOptVals[1] = jitLogBuffer;

        // Register limit for the compiled kernel.
        const int jitRegCount = 32;
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        jitOptVals[2] = (void *)(size_t)jitRegCount;

        // Compile and load the module.
        const CUresult loadStatus = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, jitOptVals);
        if (loadStatus != CUDA_SUCCESS)
            fprintf(stderr, "\n\tPTX JIT log:\n%s\n", jitLogBuffer);
        CU_CHECK(loadStatus);
    }
    else // .cubin input: no compilation needed (not taken in this example)
    {
        CU_CHECK(cuModuleLoad(&cuModule, module_path.c_str()));
    }

    // Fetch the kernel from the loaded module.
    CU_CHECK(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));
    return cuFunction;
}

/**
 * Loads a PGM image, rotates it on the GPU through a texture reference using the driver API,
 * times one kernel launch, saves the result and compares it with a reference image.
 *
 * @return 0 on completion; any failure terminates the process with EXIT_FAILURE.
 */
int main()
{
    printf("\n\tStart.\n");

    // Initialize the device and compile the PTX.
    CUfunction transform = initCUDA();

    // Load the input and reference images (the sdkFindFilePath() lookup from the original
    // sample was removed).
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width = 0, height = 0;
    unsigned int refWidth = 0, refHeight = 0;
    if (!sdkLoadPGM(PATH"lena_bw.pgm", &h_data, &width, &height))
        fail("Failed to load the input image.");
    if (!sdkLoadPGM(PATH"ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
        fail("Failed to load the reference image.");
    if (refWidth != width || refHeight != height)
        fail("Input and reference images differ in size."); // compareData() walks both buffers
    const size_t size = (size_t)width * height * sizeof(float);
    printf("\n\tLoad input files, %d x %d pixels\n", width, height);

    // Allocate the output buffer and the array that backs the texture.
    CUdeviceptr d_data = (CUdeviceptr)NULL;
    CU_CHECK(cuMemAlloc(&d_data, size));
    CUarray cu_array;
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width = width;
    desc.Height = height;
    CU_CHECK(cuArrayCreate(&cu_array, &desc));

    // Copy the host image into the array.
    CUDA_MEMCPY2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    copyParam.dstArray = cu_array;
    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParam.srcHost = h_data;
    copyParam.srcPitch = width * sizeof(float);
    copyParam.WidthInBytes = copyParam.srcPitch;
    copyParam.Height = height;
    CU_CHECK(cuMemcpy2D(&copyParam));

    // Bind the texture reference: wrap addressing on both axes, linear filtering,
    // normalized coordinates, one float channel.
    CUtexref cu_texref;
    CU_CHECK(cuModuleGetTexRef(&cu_texref, cuModule, "tex"));
    CU_CHECK(cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT));
    CU_CHECK(cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP));
    CU_CHECK(cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP));
    CU_CHECK(cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_LINEAR));
    CU_CHECK(cuTexRefSetFlags(cu_texref, CU_TRSF_NORMALIZED_COORDINATES));
    CU_CHECK(cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_FLOAT, 1));
    CU_CHECK(cuParamSetTexRef(transform, CU_PARAM_TR_DEFAULT, cu_texref));

    // Truncating division: remainder rows/columns are not processed when the image size is
    // not a multiple of block_size. Rounding up is only safe if the kernel bounds-checks.
    const unsigned int block_size = 8;
    const unsigned int gridX = width / block_size;
    const unsigned int gridY = height / block_size;

    // Two ways of passing kernel arguments through the driver API.
    const bool useKernelParams = true;
    void *args[] = {&d_data, &width, &height, &angle}; // one pointer per kernel parameter
    char argBuffer[256];                                // packed argument buffer
    size_t argBufferSize = 0;                           // the driver reads this as a size_t
    void *kernel_launch_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, CU_LAUNCH_PARAM_END};
    void **kernelParams = NULL;
    void **extra = NULL;
    if (useKernelParams)
    {
        kernelParams = args;
    }
    else
    {
        // Pack the arguments back to back, in parameter order, into argBuffer.
        memcpy(argBuffer + argBufferSize, &d_data, sizeof(d_data));
        argBufferSize += sizeof(d_data);
        memcpy(argBuffer + argBufferSize, &width, sizeof(width));
        argBufferSize += sizeof(width);
        memcpy(argBuffer + argBufferSize, &height, sizeof(height));
        argBufferSize += sizeof(height);
        memcpy(argBuffer + argBufferSize, &angle, sizeof(angle));
        argBufferSize += sizeof(angle);
        extra = kernel_launch_config;
    }

    // Warm-up launch, excluded from the timing.
    launchTransform(transform, gridX, gridY, block_size, kernelParams, extra);
    CU_CHECK(cuCtxSynchronize());

    // Timed launch.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    launchTransform(transform, gridX, gridY, block_size, kernelParams, extra);
    CU_CHECK(cuCtxSynchronize());
    sdkStopTimer(&timer);
    const float elapsedMs = sdkGetTimerValue(&timer);
    printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Copy the result back (reusing h_data), save it and compare with the reference.
    CU_CHECK(cuMemcpyDtoH(h_data, d_data, size));
    // NOTE(review): the output goes to a different directory than PATH - confirm this is intended.
    if (sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height))
        printf("\n\tSave output file.\n");
    else
        fprintf(stderr, "Failed to save the output image.\n");
    printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.15f) ? "Passed" : "Failed");

    // Release device and host resources.
    CU_CHECK(cuMemFree(d_data));
    CU_CHECK(cuArrayDestroy(cu_array));
    CUcontext contextToDestroy = cuContext;
    cuContext = NULL; // keeps checkCu() from destroying the context a second time
    CU_CHECK(cuCtxDestroy(contextToDestroy));
    free(h_data);
    free(h_dataRef);

    getchar();
    return 0;
}

▶ 输出结果:

    Start.

    Load input files, 512 x 512 pixels

    Cost time: 0.355230 ms, 737.96 Mpixels/sec

    Save output file.

    Finish, return Passed.

▶ 涨姿势

● 一般,与 0_Simple__simpleSurfaceWrite 类似。

0_Simple__simpleTexture + 0_Simple__simpleTextureDrv的更多相关文章

随机推荐

  1. UVA 10815:Andy's First Dictionary(STL)

    题意:给出一段英文,里面包含一些单词,空格和标点,单词不区分大小写,默认都为小写.按照字典序输出这些单词(这些单词不能有重复,字母全部变成小写) stringstream:包含在头文件#include ...

  2. Android和Linux下设备节点的创建笔记

    1. Linux kernel创建的/dev/下的设备节点是不对的, 其实是kernel仅负责在/sys/(基于内存的虚拟文件系统)创建一大堆下目录和文件,而真正的设备节点是在用户空间程序创建的,应该 ...

  3. SQL Support and Workarounds

    此文章来自官方文档,说明了,对于不支持pg 标准的sql 查询的变通方法,实际使用的时候有很大的指导意义 As Citus provides distributed functionality by ...

  4. yugabyte 集成JanusGraph测试

    yugabyte 集成图数据库JanusGraph,原理比较简单就是yugabyte 内置Cassandra,配置好JanusGraph 的访问就可以了. 使用docker 模式部署 创建yugaby ...

  5. vue 知识点

    Vue 中的 slot: 概念:槽/slot是组件在模板中为调用者预留的位置,使用<slot>元素声明一个 槽.在最终的视图中,调用者模板中被调用组件的内容,将填充<slot> ...

  6. php单例模式实现对象只被创建一次 mysql单例操作类

    这是我在php面试题中遇到的一道试题,单例模式按字面来看就是某一个类只有一个实例,这样做的好处还是很大的,比如说数据库的连接,我们只需要实例化一次,不需要每次都去new了,这样极大的降低了资源的耗费. ...

  7. PHP版本VC6与VC9/VC11/VC14、Thread Safe与None-Thread Safe等的区别

    最近正好在弄一个PHP的程序,在这之前一直没有怎么以接触,发现对PHP版本知识了解不是很清楚,自己看了不少类似的文章,还是感觉不够明确和全面, 网上的结论又都是模棱两可,在此,给出最完整甚至武断的解释 ...

  8. 关闭IE 对剪切板访问的提示

    在internet 选项-“安全”选项卡-自定义级别. 在“脚本”下面找到“允许对剪切板进行编程访问”,选择“启用”即可. -END

  9. FPGA配置方式

    FPGA有多种配置/加载方式.粗略可以分为主动和被动两种.主动加载是指由FPGA控制配置流程,被动加载是指FPGA仅仅被动接收配置数据. 最常见的被动配置模式就是JTAG下载bit文件.此模式下,主动 ...

  10. volatile关键字的作用、原理

    在只有双重检查锁,没有volatile的懒加载单例模式中,由于指令重排序的问题,我确实不会拿到两个不同的单例了,但我会拿到"半个"单例. 而发挥神奇作用的volatile,可以当之 ...