0_Simple__simpleTexture + 0_Simple__simpleTextureDrv
使用纹理引用来旋转图片,并分别给出静态编译(Runtime API)和运行时编译(Driver API,加载 PTX)两种环境下的实现。
▶ 源代码:静态编译
#include <stdio.h>
#include <windows.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

#define MAX_EPSILON_ERROR 5e-3f

// Rotation angle handed to the kernel as `theta` (used with sinf/cosf, i.e. radians).
const float angle = 0.5f;

// 2D texture reference over float texels, read in element-type mode.
// Its addressing/filtering state is configured and bound to a CUDA array in main().
texture<float, 2, cudaReadModeElementType> tex;

/**
 * Writes a rotated copy of the image bound to `tex` into `outputData`.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
 * that fall outside the width x height image return without writing, so the
 * grid may be rounded up.
 *
 * @param outputData device buffer of at least width * height floats (row-major)
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param theta      rotation angle passed to sinf/cosf
 */
__global__ void transformKernel(float *outputData, int width, int height, float theta)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the grid is rounded up to whole blocks.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Pixel index -> normalized coordinate centred on the image middle.
    const float u = x / (float)width - 0.5f;
    const float v = y / (float)height - 0.5f;

    // Rotate about the centre, then shift back by 0.5 before sampling the texture.
    const float c = cosf(theta);
    const float s = sinf(theta);
    outputData[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
}

/**
 * Loads an input image and a reference image, rotates the input on the GPU
 * through the texture reference, times a second kernel run, saves the result
 * and compares it against the reference.
 *
 * @return 0 on completion, 1 when an input image cannot be loaded or the two
 *         images differ in size. CUDA failures terminate via checkCudaErrors.
 */
int main()
{
    printf("\n\tStart.\n");

    // Load the input and reference images. The sdkFindFilePath() lookup of the
    // original sample was removed, so the paths are hard-coded.
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width, height, size;
    unsigned int refWidth, refHeight;
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height))
    {
        fprintf(stderr, "\n\tFailed to load the input image.\n");
        return 1;
    }
    size = width * height * sizeof(float);
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
    {
        fprintf(stderr, "\n\tFailed to load the reference image.\n");
        free(h_data);
        return 1;
    }
    // The comparison below walks width * height elements of both buffers.
    if (refWidth != width || refHeight != height)
    {
        fprintf(stderr, "\n\tInput and reference images differ in size.\n");
        free(h_data);
        free(h_dataRef);
        return 1;
    }
    printf("\n\tLoad input files, %d x %d pixels\n", width, height);

    // Allocate the output buffer and the CUDA array that backs the texture.
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));
    cudaArray *cuArray;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // Unlike simpleSurfaceWrite, the host data is copied straight into the CUDA array.
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice));

    // Configure and bind the texture reference.
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;   // the kernel samples with normalized coordinates
    checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

    // Round the grid up so images whose size is not a multiple of the block are fully covered.
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y, 1);

    // Warm-up run.
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    getLastCudaError("transformKernel warm-up launch failed");
    checkCudaErrors(cudaDeviceSynchronize());

    // Timed run.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    getLastCudaError("transformKernel launch failed");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);

    const float elapsedMs = sdkGetTimerValue(&timer);
    printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Copy the result back, save it and compare it with the reference.
    checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
    printf("\n\tSave output file.\n");
    printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.0f) ? "Passed" : "Failed");

    // Release device and host resources.
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(h_data);
    free(h_dataRef);
    getchar();
    return 0;
}
▶ 输出结果
Start. Load input files, 512 x 512 pixels Cost time: 0.362788 ms, 722.58 Mpixels/sec Save output file. Finish, return Passed.
▶ 源代码:运行时编译
// simpleTexture_kernel.cu
#ifndef _SIMPLETEXTURE_KERNEL_H_
#define _SIMPLETEXTURE_KERNEL_H_

// 2D texture reference over float texels, read in element-type mode.
// The host looks it up in the loaded module by the name "tex" and binds an array to it.
texture<float, 2, cudaReadModeElementType> tex;

/**
 * Writes a rotated copy of the image bound to `tex` into `g_odata`.
 *
 * Declared extern "C" so the host can look the entry point up by the
 * unmangled name "transformKernel".
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
 * outside the width x height image return without writing.
 *
 * @param g_odata device buffer of at least width * height floats (row-major)
 * @param width   image width in pixels
 * @param height  image height in pixels
 * @param theta   rotation angle passed to sinf/cosf
 */
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads beyond the image so any grid size is safe.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Pixel index -> normalized coordinate centred on the image middle.
    const float u = x / (float)width - 0.5f;
    const float v = y / (float)height - 0.5f;

    // Rotate about the centre, then shift back by 0.5 before sampling the texture.
    const float c = cosf(theta);
    const float s = sinf(theta);
    g_odata[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
}

#endif
// simpleTextureDrv.cpp
#include <stdio.h>
#include <iostream>
#include <helper_functions.h>
#include <cuda.h>

#define MAX_EPSILON_ERROR 5e-3f
#define PATH "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTextureDrv\\data\\"
using namespace std;

float angle = 0.5f;     // rotation angle handed to the kernel as `theta`
CUmodule cuModule;      // module holding the kernel and the "tex" texture reference
CUcontext cuContext;    // context created by initCUDA(), destroyed at the end of main()

// Reports a failed driver API call on stderr; returns true when `status` is CUDA_SUCCESS.
static bool cuSucceeded(CUresult status, const char *what)
{
    if (status == CUDA_SUCCESS)
        return true;
    fprintf(stderr, "\n\t%s failed with CUresult %d\n", what, (int)status);
    return false;
}

// For use inside main() only: reports a failed driver API call and leaves main with status 1.
#define CU_TRY(call) do { if (!cuSucceeded((call), #call)) return 1; } while (0)

/**
 * Initializes the driver API, creates a context on device 0, loads the kernel
 * module (JIT-compiling it when the file is PTX) and looks up "transformKernel".
 *
 * @return the kernel handle, or 0 on any failure. When a failure happens after
 *         the context was created, the context is destroyed before returning.
 */
CUfunction initCUDA()
{
    CUfunction cuFunction = 0;
    string module_path, ptx_source;

    // Driver-API counterpart of selecting a device in the runtime API.
    if (!cuSucceeded(cuInit(0), "cuInit"))
        return 0;
    CUdevice cuDevice;
    if (!cuSucceeded(cuDeviceGet(&cuDevice, 0), "cuDeviceGet"))
        return 0;
    // Arguments after the context pointer: creation flags, then the device.
    if (!cuSucceeded(cuCtxCreate(&cuContext, 0, cuDevice), "cuCtxCreate"))
        return 0;

    // Read the whole .ptx file into a NUL-terminated string.
    module_path = PATH "simpleTexture_kernel64.ptx";
    FILE *fp = fopen(module_path.c_str(), "rb");
    if (fp == NULL)
    {
        fprintf(stderr, "\n\tCannot open %s\n", module_path.c_str());
        cuCtxDestroy(cuContext);
        return 0;
    }
    fseek(fp, 0, SEEK_END);
    long file_size = ftell(fp);
    if (file_size < 0)
    {
        fprintf(stderr, "\n\tCannot determine the size of %s\n", module_path.c_str());
        fclose(fp);
        cuCtxDestroy(cuContext);
        return 0;
    }
    char *buf = new char[file_size + 1];
    fseek(fp, 0, SEEK_SET);
    size_t bytes_read = fread(buf, sizeof(char), file_size, fp);
    fclose(fp);
    buf[bytes_read] = '\0';   // terminate at what was actually read
    ptx_source = buf;
    delete[] buf;

    CUresult status;
    if (module_path.rfind("ptx") != string::npos)
    {
        // PTX input: needs JIT compilation. jitOptions holds the option names,
        // jitOptVals the matching values.
        const unsigned int jitNumOptions = 3;
        CUjit_option jitOptions[jitNumOptions];
        void *jitOptVals[jitNumOptions];

        // Size of the JIT info-log buffer.
        const int jitLogBufferSize = 1024;
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // The JIT info-log buffer itself.
        char jitLogBuffer[jitLogBufferSize] = "";
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        jitOptVals[1] = jitLogBuffer;

        // Register budget for the compiled kernel.
        const int jitRegCount = 32;
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        jitOptVals[2] = (void *)(size_t)jitRegCount;

        status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, jitOptVals);
        if (status != CUDA_SUCCESS)
            fprintf(stderr, "\n\tPTX JIT log:\n%s\n", jitLogBuffer);
    }
    else
    {
        // .cubin input: already compiled (this branch is not taken in this example).
        status = cuModuleLoad(&cuModule, module_path.c_str());
    }

    // Fetch the kernel from the loaded module.
    if (!cuSucceeded(status, "module load") ||
        !cuSucceeded(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"), "cuModuleGetFunction"))
    {
        cuCtxDestroy(cuContext);
        return 0;
    }
    return cuFunction;
}

/**
 * Driver-API version of the texture rotation sample: loads the images, uploads
 * the input into a CUDA array bound to the module's "tex" reference, runs the
 * kernel twice (warm-up + timed), saves the result and compares it with the
 * reference image.
 *
 * @return 0 on completion, 1 when initialization, an image load or a driver
 *         API call fails.
 */
int main()
{
    printf("\n\tStart.\n");

    // Initialize the device and compile the PTX.
    CUfunction transform = initCUDA();
    if (!transform)
        return 1;

    // Load the input and reference images. The sdkFindFilePath() lookup of the
    // original sample was removed, so the paths are hard-coded.
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width, height, size;
    unsigned int refWidth, refHeight;
    if (!sdkLoadPGM(PATH "lena_bw.pgm", &h_data, &width, &height))
    {
        fprintf(stderr, "\n\tFailed to load the input image.\n");
        cuCtxDestroy(cuContext);
        return 1;
    }
    size = width * height * sizeof(float);
    if (!sdkLoadPGM(PATH "ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
    {
        fprintf(stderr, "\n\tFailed to load the reference image.\n");
        free(h_data);
        cuCtxDestroy(cuContext);
        return 1;
    }
    // The comparison below walks width * height elements of both buffers.
    if (refWidth != width || refHeight != height)
    {
        fprintf(stderr, "\n\tInput and reference images differ in size.\n");
        free(h_data);
        free(h_dataRef);
        cuCtxDestroy(cuContext);
        return 1;
    }
    printf("\n\tLoad input files, %d x %d pixels\n", width, height);

    // Allocate the output buffer and the CUDA array, then upload the image.
    CUdeviceptr d_data = (CUdeviceptr)NULL;
    CU_TRY(cuMemAlloc(&d_data, size));
    CUarray cu_array;
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width = width;
    desc.Height = height;
    CU_TRY(cuArrayCreate(&cu_array, &desc));
    CUDA_MEMCPY2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    copyParam.dstArray = cu_array;
    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParam.srcHost = h_data;
    copyParam.srcPitch = width * sizeof(float);
    copyParam.WidthInBytes = copyParam.srcPitch;
    copyParam.Height = height;
    CU_TRY(cuMemcpy2D(&copyParam));

    // Bind the texture reference declared in the module.
    CUtexref cu_texref;
    CU_TRY(cuModuleGetTexRef(&cu_texref, cuModule, "tex"));
    CU_TRY(cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT));
    CU_TRY(cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP));
    CU_TRY(cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP));
    CU_TRY(cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_LINEAR));
    CU_TRY(cuTexRefSetFlags(cu_texref, CU_TRSF_NORMALIZED_COORDINATES));
    CU_TRY(cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_FLOAT, 1));
    CU_TRY(cuParamSetTexRef(transform, CU_PARAM_TR_DEFAULT, cu_texref));

    // Launch geometry. The division truncates, so the image dimensions are
    // assumed to be multiples of block_size (NOTE(review): rounding up is only
    // safe if the kernel in the PTX bounds-checks its pixel index).
    const int block_size = 8;
    const unsigned int grid_x = width / block_size;
    const unsigned int grid_y = height / block_size;

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    // Two ways of passing kernel arguments through the driver API.
    const bool useKernelParams = true;
    if (useKernelParams)
    {
        // Variant 1: an array of pointers to the individual arguments.
        void *args[] = {&d_data, &width, &height, &angle};
        // Warm-up run.
        CU_TRY(cuLaunchKernel(transform, grid_x, grid_y, 1, block_size, block_size, 1, 0, NULL, args, NULL));
        CU_TRY(cuCtxSynchronize());
        // Timed run.
        sdkStartTimer(&timer);
        CU_TRY(cuLaunchKernel(transform, grid_x, grid_y, 1, block_size, block_size, 1, 0, NULL, args, NULL));
    }
    else
    {
        // Variant 2: pack the arguments back to back into one buffer. The size
        // handed over via CU_LAUNCH_PARAM_BUFFER_SIZE is read as a size_t, so
        // `offset` must have that type.
        size_t offset = 0;
        char argBuffer[256];
        memcpy(argBuffer + offset, &d_data, sizeof(d_data));
        offset += sizeof(d_data);
        memcpy(argBuffer + offset, &width, sizeof(width));
        offset += sizeof(width);
        memcpy(argBuffer + offset, &height, sizeof(height));
        offset += sizeof(height);
        memcpy(argBuffer + offset, &angle, sizeof(angle));
        offset += sizeof(angle);
        void *kernel_launch_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
        // Warm-up run.
        CU_TRY(cuLaunchKernel(transform, grid_x, grid_y, 1, block_size, block_size, 1, 0, NULL, NULL, kernel_launch_config));
        CU_TRY(cuCtxSynchronize());
        // Timed run.
        sdkStartTimer(&timer);
        CU_TRY(cuLaunchKernel(transform, grid_x, grid_y, 1, block_size, block_size, 1, 0, NULL, NULL, kernel_launch_config));
    }
    CU_TRY(cuCtxSynchronize());
    sdkStopTimer(&timer);

    const float elapsedMs = sdkGetTimerValue(&timer);
    printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Copy the result back, save it and compare it with the reference.
    CU_TRY(cuMemcpyDtoH(h_data, d_data, size));
    sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
    printf("\n\tSave output file.\n");
    printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.15f) ? "Passed" : "Failed");

    // Release device and host resources.
    cuMemFree(d_data);
    cuArrayDestroy(cu_array);
    cuCtxDestroy(cuContext);
    free(h_data);
    free(h_dataRef);
    getchar();
    return 0;
}
▶ 输出结果:
Start. Load input files, 512 x 512 pixels Cost time: 0.355230 ms, 737.96 Mpixels/sec Save output file. Finish, return Passed.
▶ 涨姿势
● 本例没有太多新内容,整体流程与 0_Simple__simpleSurfaceWrite 类似。
0_Simple__simpleTexture + 0_Simple__simpleTextureDrv的更多相关文章
随机推荐
- NASSA’s Robot
NASSA的机器人降落到了火星,降落的地方可以用X-Y坐标表示.机器人最开始在(0, 0).由于传输问题,部分指令可能会混淆,现在给出确定的命令与未知命令,请帮忙确认机器人的X.Y坐标最小最大值分别是 ...
- 大家一起做训练 第一场 G CD
题目来源:UVA 624 题目的意思就是:我现在需要从 t 张CD中拿出一部分来,尽可能的凑出接近 N 这么久的音乐,但是不能超过 N. CD不超过20张,每张长度不超过 N ,不能重复选. 一个很简 ...
- 【MVC】使用MvcPager进行分页
1.添加引用: mvcPager 版本高的提供的功能也更多. 注:下载了第一个,但是里面的一些字段是只读的.(eg:PagedList<T> .TotalItemCount)这是不符合的. ...
- 【liunx】端口号的占用情况查看
Linux如何查看端口 1.lsof -i:端口号 用于查看某一端口的占用情况,比如查看8000端口使用情况,lsof -i:8000 # lsof -i:8000 COMMAND PID USER ...
- 部署tomcat到Linux
1. alt+p 放文件 2.解压到自定义 apps文件夹中 tar -zxvf apache-tomcat-7.0.68.tar.gz -C apps 3.进入文件启动tomcat/bin ./ ...
- proc文件系统漫谈
1. /proc/buddyinfo:/proc/buddyinfo是linuxbuddy系统管理物理内存的debug信息. 在linux中使用buddy算法解决物理内存的外碎片问题,其把所有空闲的内 ...
- nginx-opentracing 简单使用
nginx 的日常的应用开发中占比还是比较高的,opentracing 是一个分布式追踪标准 相应是开源实现也比较多.nginx-opentracing 是一个opentracing nginx 的插 ...
- 多线程安全问题之Lock显示锁
package com.hls.juc; import java.util.concurrent.locks.Lock;import java.util.concurrent.locks.Reentr ...
- 关于 Vue 方法前面的美元符号
关于 Vue 方法前面的美元符号 学到这一段,不明白什么意思,然后手贱把 $ 删除了,出现未定义方法. vm.$watch('counter', function(nval, oval) { aler ...
- JUC集合之 CopyOnWriteArrayList
CopyOnWriteArrayList介绍 它相当于线程安全的ArrayList.和ArrayList一样,它是个可变数组:但是和ArrayList不同的时,它具有以下特性: 它最适合于具有以下特征 ...