Using C++ templates

▶ Source code: static version

 // sharedmem.cuh
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_

// Generic SharedMemory wrapper. The unspecialized version must never be used:
// it calls an undeclared device function so any stray instantiation fails to link.
template <typename T> struct SharedMemory
{
    __device__ T *getPointer()
    {
        extern __device__ void error(void);
        error();
        return NULL;
    }
};

// Specializations of SharedMemory for the supported data types
template <> struct SharedMemory <int>
{
    __device__ int *getPointer()
    {
        extern __shared__ int s_int[];
        return s_int;
    }
};

template <> struct SharedMemory <unsigned int>
{
    __device__ unsigned int *getPointer()
    {
        extern __shared__ unsigned int s_uint[];
        return s_uint;
    }
};

template <> struct SharedMemory <char>
{
    __device__ char *getPointer()
    {
        extern __shared__ char s_char[];
        return s_char;
    }
};

template <> struct SharedMemory <unsigned char>
{
    __device__ unsigned char *getPointer()
    {
        extern __shared__ unsigned char s_uchar[];
        return s_uchar;
    }
};

template <> struct SharedMemory <short>
{
    __device__ short *getPointer()
    {
        extern __shared__ short s_short[];
        return s_short;
    }
};

template <> struct SharedMemory <unsigned short>
{
    __device__ unsigned short *getPointer()
    {
        extern __shared__ unsigned short s_ushort[];
        return s_ushort;
    }
};

template <> struct SharedMemory <long>
{
    __device__ long *getPointer()
    {
        extern __shared__ long s_long[];
        return s_long;
    }
};

template <> struct SharedMemory <unsigned long>
{
    __device__ unsigned long *getPointer()
    {
        extern __shared__ unsigned long s_ulong[];
        return s_ulong;
    }
};

template <> struct SharedMemory <bool>
{
    __device__ bool *getPointer()
    {
        extern __shared__ bool s_bool[];
        return s_bool;
    }
};

template <> struct SharedMemory <float>
{
    __device__ float *getPointer()
    {
        extern __shared__ float s_float[];
        return s_float;
    }
};

template <> struct SharedMemory <double>
{
    __device__ double *getPointer()
    {
        extern __shared__ double s_double[];
        return s_double;
    }
};

#endif
 // simpleTemplates.cu
#include <stdio.h>
#include <timer.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "sharedmem.cuh"

template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
{
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();
    // The two lines above are, taken together, equivalent to: extern __shared__ T sdata[];
    const unsigned int tid = threadIdx.x;
    sdata[tid] = g_idata[tid];
    __syncthreads();
    sdata[tid] = (T)blockDim.x * sdata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}

// Generate the reference (gold) result
template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len); // cast (const unsigned int -> T), with const added
    for (unsigned int i = 0; i < len; ++i)
        reference[i] = idata[i] * T_len;
}

// Generic ArrayComparator wrapper
template<class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// Specializations for int and float; compareData() is defined in helper_image.h
template<> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.0f);
    }
};

template<> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.15f);
    }
};

// Generic ArrayFileWriter wrapper
template<class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// Specializations for int and float; sdkWriteFile() is defined in helper_image.h
template<> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

template<> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

template<class T> bool test(int len)
{
    unsigned int mem_size = sizeof(T) * len;
    dim3 grid(1, 1, 1);
    dim3 threads(len, 1, 1);
    ArrayComparator<T> comparator;
    ArrayFileWriter<T> writer;
    cudaSetDevice(0); // device 0 assumed
    StartTimer();

    // Allocate memory
    T *h_idata, *h_odata, *d_idata, *d_odata;
    h_idata = (T *)malloc(mem_size);
    h_odata = (T *)malloc(mem_size);
    cudaMalloc((void **)&d_idata, mem_size);
    cudaMalloc((void **)&d_odata, mem_size);
    for (unsigned int i = 0; i < len; ++i)
        h_idata[i] = (T)i;
    cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);

    // Launch and time the kernel; mem_size is the dynamic shared memory size in bytes
    testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);
    cudaMemcpy(h_odata, d_odata, sizeof(T) * len, cudaMemcpyDeviceToHost);
    printf("\n\tProcessing time: %f ms\n", GetTimer());

    // Check the result (the reference is computed in place, overwriting h_idata)
    computeGold<T>(h_idata, h_idata, len);
    bool result = comparator.compare(h_idata, h_odata, len);
    //writer.write("./data/regression.dat", h_odata, len, 0.0f); // optional: dump results to file

    free(h_idata);
    free(h_odata);
    cudaFree(d_idata);
    cudaFree(d_odata);
    return result;
}

int main()
{
    printf("\n\tStart.\n");
    printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
    printf("\n\t> test<float, 64>, result: %s.\n", test<float>(64) ? "Passed" : "Failed");
    getchar();
    return 0;
}

▶ Output:

    Start.

    Processing time: 107.394216 ms

    > test<float, 32>, result: Passed.

    Processing time: 3.153182 ms

    > test<float, 64>, result: Passed.
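
● Why the SharedMemory<T> wrapper is needed at all: a template kernel cannot simply declare extern __shared__ T sdata[], because every instantiation would redeclare the same extern symbol with a different element type and the compiler rejects the conflicting declarations. The full specializations in sharedmem.cuh give each supported type its own uniquely named extern array. A minimal sketch follows; the kernel and launch below are illustrative, not part of the sample.

#include "sharedmem.cuh"

// Rejected once instantiated for more than one type: the extern symbol
// "sdata" would be declared with conflicting element types.
//
//   template <class T> __global__ void badKernel(T *g_odata)
//   {
//       extern __shared__ T sdata[];   // incompatible redeclaration across instantiations
//       ...
//   }

// With the wrapper, each instantiation uses the specialization whose extern
// array matches T; the third <<<...>>> launch parameter supplies the size of
// the dynamic shared memory region in bytes.
template <class T> __global__ void scaleKernel(T *g_idata, T *g_odata)
{
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();
    const unsigned int tid = threadIdx.x;
    sdata[tid] = g_idata[tid];
    __syncthreads();
    g_odata[tid] = (T)blockDim.x * sdata[tid];
}

// Example launch for float, 64 threads, 64 * sizeof(float) bytes of dynamic shared memory:
//   scaleKernel<float><<<1, 64, 64 * sizeof(float)>>>(d_in, d_out);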

▶ Source code: runtime compilation (NVRTC) version

 // sharedmem.cuh: identical to the static version
 // simpleTemplates_kernel.cu
#include "sharedmem.cuh"

// Here testKernel is a __device__ function template (a __global__ template could
// not be called from the wrappers below without a launch configuration); the
// extern "C" __global__ wrappers are the actual kernel entry points.
template<class T> __device__ void testKernel(T *g_idata, T *g_odata)
{
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();
    // The two lines above are, taken together, equivalent to: extern __shared__ T sdata[];
    const unsigned int tid = threadIdx.x;
    sdata[tid] = g_idata[tid];
    __syncthreads();
    sdata[tid] = (T)blockDim.x * sdata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}

// The extern "C" wrappers give each instantiation an unmangled name that the
// driver API can look up with cuModuleGetFunction.
extern "C" __global__ void testFloat(float *p1, float *p2) { testKernel<float>(p1, p2); }
extern "C" __global__ void testInt(int *p1, int *p2) { testKernel<int>(p1, p2); }
 // simpleTemplates.cpp
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <nvrtc_helper.h>
#include <timer.h>

// Generate the reference (gold) result
template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len); // cast (const unsigned int -> T), with const added
    for (unsigned int i = 0; i < len; ++i)
        reference[i] = idata[i] * T_len;
}

// Generic ArrayComparator wrapper
template<class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// Specializations for int and float; compareData() is defined in helper_image.h
template<> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.0f);
    }
};

template<> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.15f);
    }
};

// Generic ArrayFileWriter wrapper
template<class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// Specializations for int and float; sdkWriteFile() is defined in helper_image.h
template<> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

template<> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

// getKernel template: map the element type to the matching extern "C" kernel
template <typename T> CUfunction getKernel(CUmodule in);

template<> CUfunction getKernel<int>(CUmodule in)
{
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, in, "testInt");
    return kernel_addr;
}

template<> CUfunction getKernel<float>(CUmodule in)
{
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, in, "testFloat");
    return kernel_addr;
}

template<class T> bool test(int len)
{
    // Unlike the static version, compile the kernel to PTX at run time
    char *kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTemplates_nvrtc\\simpleTemplates_kernel.cu";
    char *ptx;
    size_t ptxSize;
    compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0); // 1 and NULL stand in for argc and argv; the trailing 0 (no extra headers) is assumed
    CUmodule module = loadPTX(ptx, 1, NULL);                   // 1 and NULL stand in for argc and argv; this call prints GPU information

    unsigned int mem_size = sizeof(T) * len;
    dim3 grid(1, 1, 1);
    dim3 threads(len, 1, 1);
    ArrayComparator<T> comparator;
    ArrayFileWriter<T> writer;
    StartTimer();

    // Allocate memory
    T *h_idata, *h_odata;
    CUdeviceptr d_idata, d_odata; // unlike the static version, device memory goes through the driver API
    h_idata = (T *)malloc(mem_size);
    h_odata = (T *)malloc(mem_size);
    cuMemAlloc(&d_idata, mem_size);
    cuMemAlloc(&d_odata, mem_size);
    for (unsigned int i = 0; i < len; ++i)
        h_idata[i] = (T)i;
    cuMemcpyHtoD(d_idata, h_idata, mem_size);

    // Launch and time the kernel through the driver API
    CUfunction kernel_addr = getKernel<T>(module);
    void *arr[] = { (void *)&d_idata, (void *)&d_odata };
    cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z,
                   mem_size, 0, &arr[0], 0); // mem_size bytes of dynamic shared memory, default stream
    cuCtxSynchronize(); // synchronize on the context
    cuMemcpyDtoH(h_odata, d_odata, sizeof(T) * len);
    printf("\n\tProcessing time: %f ms\n", GetTimer());

    // Check the result (the reference is computed in place, overwriting h_idata)
    computeGold<T>(h_idata, h_idata, len);
    bool result = comparator.compare(h_idata, h_odata, len);
    //writer.write("./data/regression.dat", h_odata, len, 0.0f); // optional: dump results to file

    free(h_idata);
    free(h_odata);
    cuMemFree(d_idata);
    cuMemFree(d_odata);
    return result;
}

int main()
{
    printf("\n\tStart.\n");
    printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
    printf("\n\t> test<int, 64>, result: %s.\n", test<int>(64) ? "Passed" : "Failed");
    getchar();
    return 0;
}

▶ Output:

    Start.

> Using CUDA Device []: GeForce GTX
> GPU Device has SM 6.1 compute capability

    Processing time: 0.699976 ms

    > test<float, 32>, result: Passed.

> Using CUDA Device []: GeForce GTX
> GPU Device has SM 6.1 compute capability

    Processing time: 0.665355 ms

    > test<int, 64>, result: Passed.
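
● Why the extern "C" wrappers: the driver API finds kernels by name through cuModuleGetFunction, and a template instantiation such as testKernel<float> only exists in the compiled module under its C++-mangled name, so testFloat and testInt expose each instantiation under a fixed, unmangled name that getKernel<T> can request. As a hedged alternative sketch (not what this sample does), NVRTC's name-expression API can hand back the mangled name directly, so no wrappers are needed; the helper below is illustrative, assumes the kernel source is available as a string, and assumes a CUDA context already exists. Error checking is omitted.

#include <cuda.h>
#include <nvrtc.h>

// Sketch: compile the template kernel with NVRTC and fetch testKernel<float>
// by its lowered (mangled) name instead of an extern "C" wrapper name.
CUfunction getFloatKernel(const char *kernelSource, CUmodule *moduleOut)
{
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, kernelSource, "simpleTemplates_kernel.cu", 0, NULL, NULL);
    nvrtcAddNameExpression(prog, "testKernel<float>");   // must be registered before compiling
    nvrtcCompileProgram(prog, 0, NULL);                  // compile options omitted

    const char *loweredName = NULL;
    nvrtcGetLoweredName(prog, "testKernel<float>", &loweredName);

    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    char *ptx = new char[ptxSize];
    nvrtcGetPTX(prog, ptx);

    CUmodule module;
    cuModuleLoadDataEx(&module, ptx, 0, NULL, NULL);     // requires a current CUDA context
    CUfunction kernel;
    cuModuleGetFunction(&kernel, module, loweredName);   // look up by mangled name

    nvrtcDestroyProgram(&prog);                          // loweredName is owned by prog, use it first
    delete[] ptx;
    *moduleOut = module;
    return kernel;
}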

▶ Takeaways

● The sample wraps three class templates, SharedMemory, ArrayComparator and ArrayFileWriter, and defines explicit specializations of each for the data types it needs; the unspecialized versions only report an error.
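
● A hedged alternative to writing one SharedMemory specialization per element type (not used by this sample): a single proxy function that reinterprets the raw dynamic shared memory region. The sketch below assumes the base of dynamic shared memory is suitably aligned for T, which holds for the fundamental types above.

// Minimal sketch of a generic shared-memory proxy.
template <typename T>
__device__ T *sharedMemoryProxy()
{
    extern __shared__ unsigned char s_raw[];  // same element type in every instantiation
    return reinterpret_cast<T *>(s_raw);
}

template <class T>
__global__ void testKernelProxy(T *g_idata, T *g_odata)
{
    T *sdata = sharedMemoryProxy<T>();        // plays the role of SharedMemory<T>::getPointer()
    const unsigned int tid = threadIdx.x;
    sdata[tid] = g_idata[tid];
    __syncthreads();
    g_odata[tid] = (T)blockDim.x * sdata[tid];
}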
