0_Simple__cppOverload

▶ 使用 cuda 内置结构 cudaFuncAttributes 来观察核函数的共享内存、寄存器数量

▶ 源代码

 // cppOverload_kernel.cu

 // Overload 1 of simple_kernel: single int input array.
 // Each thread stages one element of pIn in shared memory, then writes
 // pOut[g] = pIn[g] * a + g, where g is the thread's global index.
 // There is no bounds guard: the launch must supply exactly one thread per
 // element, and blockDim.x must not exceed THREAD_N (the shared array size).
 __global__ void simple_kernel(const int *pIn, int *pOut, int a)
 {
     __shared__ int staged[THREAD_N];

     const unsigned int lane = threadIdx.x;
     const int gid = blockIdx.x * blockDim.x + lane;

     staged[lane] = pIn[gid];
     __syncthreads();

     const int scaled = staged[lane] * a;
     pOut[gid] = scaled + gid;
 }

 // Overload 2 of simple_kernel: int2 input array.
 // Each thread stages one int2 of pIn in shared memory, then writes
 // pOut[g] = (pIn[g].x + pIn[g].y) * a + g, where g is the thread's global index.
 // There is no bounds guard: the launch must supply exactly one thread per
 // element, and blockDim.x must not exceed THREAD_N (the shared array size).
 __global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
 {
     __shared__ int2 staged[THREAD_N];

     const unsigned int lane = threadIdx.x;
     const int gid = blockIdx.x * blockDim.x + lane;

     staged[lane] = pIn[gid];
     __syncthreads();

     const int pairSum = staged[lane].x + staged[lane].y;
     pOut[gid] = pairSum * a + gid;
 }

 // Overload 3 of simple_kernel: two int input arrays.
 // Each thread stages one element from each input in shared memory, then writes
 // pOut[g] = (pIn1[g] + pIn2[g]) * a + g, where g is the thread's global index.
 // There is no bounds guard: the launch must supply exactly one thread per
 // element, and blockDim.x must not exceed THREAD_N (the shared array size).
 __global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
 {
     __shared__ int stagedA[THREAD_N];
     __shared__ int stagedB[THREAD_N];

     const unsigned int lane = threadIdx.x;
     const int gid = blockIdx.x * blockDim.x + lane;

     stagedA[lane] = pIn1[gid];
     stagedB[lane] = pIn2[gid];
     __syncthreads();

     const int pairSum = stagedA[lane] + stagedB[lane];
     pOut[gid] = pairSum * a + gid;
 }

 // cppOverload.cu

 #include <stdio.h>

 #include <helper_cuda.h>

 #include <helper_math.h>

 #include <helper_string.h>

 #define THREAD_N            256

 #include "cppOverload_kernel.cu"                                            // 源代码文件中使用了 THREAD_N，必须先定义

 #define N                   1024

 #define DIV_UP(a, b)        (((a) + (b) - 1) / (b))

 // Print every field of the cudaFuncAttributes value `attr` to stdout, one per line.
 // The body is wrapped in do { ... } while (0) so that `OUTPUT_ATTR(x);` expands to
 // exactly one statement and stays correct inside an unbraced if/else.
 // The size_t fields are narrowed to int to match the %d conversions.
 // Every continued line must end in a backslash with nothing after it.
 #define OUTPUT_ATTR(attr)                                                     \
     do                                                                        \
     {                                                                         \
         printf("Shared Size:           %d\n", (int)(attr).sharedSizeBytes);   \
         printf("Constant Size:         %d\n", (int)(attr).constSizeBytes);    \
         printf("Local Size:            %d\n", (int)(attr).localSizeBytes);    \
         printf("Max Threads Per Block: %d\n", (attr).maxThreadsPerBlock);     \
         printf("Number of Registers:   %d\n", (attr).numRegs);                \
         printf("PTX Version:           %d\n", (attr).ptxVersion);             \
         printf("Binary Version:        %d\n", (attr).binaryVersion);          \
     } while (0)

 // Host-side reference check for simple_kernel(const int *, int *, int).
 // hInput  : host copy of the kernel input, at least N ints.
 // hOutput : host copy of the kernel output, at least N ints.
 // a       : multiplier that was passed to the kernel.
 // Returns true when hOutput[i] == hInput[i] * a + i for every i in [0, N),
 // false at the first mismatch.
 bool check_func1(int *hInput, int *hOutput, int a)
 {
     for (int i = 0; i < N; ++i)  // start at element 0 (initializer was missing)
     {
         const int cpuRes = hInput[i] * a + i;
         if (hOutput[i] != cpuRes)
             return false;
     }
     return true;
 }

 // Host-side reference check for simple_kernel(const int2 *, int *, int).
 // hInput  : host copy of the kernel input viewed as int2, at least N elements.
 // hOutput : host copy of the kernel output, at least N ints.
 // a       : multiplier that was passed to the kernel.
 // Returns true when hOutput[i] == (hInput[i].x + hInput[i].y) * a + i for every
 // i in [0, N), false at the first mismatch.
 bool check_func2(int2 *hInput, int *hOutput, int a)
 {
     for (int i = 0; i < N; i++)  // start at element 0 (initializer was missing)
     {
         const int cpuRes = (hInput[i].x + hInput[i].y) * a + i;
         if (hOutput[i] != cpuRes)
             return false;
     }
     return true;
 }

 // Host-side reference check for simple_kernel(const int *, const int *, int *, int).
 // hInput1, hInput2 : host copies of the two kernel inputs, at least N ints each.
 // hOutput          : host copy of the kernel output, at least N ints.
 // a                : multiplier that was passed to the kernel.
 // Returns true when hOutput[i] == (hInput1[i] + hInput2[i]) * a + i for every
 // i in [0, N), false at the first mismatch.
 bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
 {
     for (int i = 0; i < N; i++)  // start at element 0 (initializer was missing)
     {
         const int cpuRes = (hInput1[i] + hInput2[i]) * a + i;
         if (hOutput[i] != cpuRes)
             return false;
     }
     return true;
 }

 // Host driver for the three simple_kernel overloads.
 // For each overload it: selects the overload through a typed function pointer,
 // requests the shared-memory-preferring cache configuration, queries and prints
 // the kernel's cudaFuncAttributes, launches the kernel over N elements, copies
 // the result back and compares it with the matching host check_func*.
 // Every CUDA runtime call is checked with checkCudaErrors / getLastCudaError
 // (both provided by helper_cuda.h, which this file already includes).
 // Returns 0 when all three checks pass, 1 otherwise.
 int main(int argc, const char *argv[])
 {
     checkCudaErrors(cudaSetDevice(0));  // use the first CUDA device

     // The input buffer holds 2 * N ints: the int2 overload reads N int2 values
     // and the two-array overload reads [0, N) and [N, 2N).
     const size_t inBytes = sizeof(int) * N * 2;
     const size_t outBytes = sizeof(int) * N;

     int *hInput = NULL, *hOutput = NULL, *dInput = NULL, *dOutput = NULL;
     checkCudaErrors(cudaMalloc(&dInput, inBytes));
     checkCudaErrors(cudaMalloc(&dOutput, outBytes));
     checkCudaErrors(cudaMallocHost(&hInput, inBytes));
     checkCudaErrors(cudaMallocHost(&hOutput, outBytes));

     for (int i = 0; i < N * 2; i++)
         hInput[i] = i;
     checkCudaErrors(cudaMemcpy(dInput, hInput, inBytes, cudaMemcpyHostToDevice));

     // NOTE(review): the original literal was lost from this listing; any int
     // works because the host checks use the same value. 1 is an assumption.
     const int a = 1;

     // Typed function pointers pick one overload each.
     void(*func1)(const int *, int *, int) = simple_kernel;
     void(*func2)(const int2 *, int *, int) = simple_kernel;
     void(*func3)(const int *, const int *, int *, int) = simple_kernel;

     struct cudaFuncAttributes attr;
     bool allPassed = true;
     bool passed = false;

     // function 1
     memset(&attr, 0, sizeof(attr));
     checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));  // inspect resource usage before running
     checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));
     OUTPUT_ATTR(attr);
     (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
     getLastCudaError("simple_kernel(const int *, int *, int) launch failed");
     checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(hOutput, dOutput, outBytes, cudaMemcpyDeviceToHost));
     passed = check_func1(hInput, hOutput, a);
     allPassed = allPassed && passed;
     printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", passed ? "PASSED" : "FAILED");

     // function 2
     memset(&attr, 0, sizeof(attr));
     checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
     checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));
     OUTPUT_ATTR(attr);
     // The int buffer is reinterpreted as int2; the original author notes it is suitably aligned.
     (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
     getLastCudaError("simple_kernel(const int2 *, int *, int) launch failed");
     checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(hOutput, dOutput, outBytes, cudaMemcpyDeviceToHost));
     passed = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);
     allPassed = allPassed && passed;
     printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", passed ? "PASSED" : "FAILED");

     // function 3
     memset(&attr, 0, sizeof(attr));
     checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
     checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));
     OUTPUT_ATTR(attr);
     (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);
     getLastCudaError("simple_kernel(const int *, const int *, int *, int) launch failed");
     checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(hOutput, dOutput, outBytes, cudaMemcpyDeviceToHost));
     passed = check_func3(&hInput[0], &hInput[N], hOutput, a);
     allPassed = allPassed && passed;
     printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", passed ? "PASSED" : "FAILED");

     checkCudaErrors(cudaFree(dInput));
     checkCudaErrors(cudaFree(dOutput));
     checkCudaErrors(cudaFreeHost(hOutput));
     checkCudaErrors(cudaFreeHost(hInput));

     getchar();  // wait for a key press before the process exits
     return allPassed ? 0 : 1;
 }

● 输出结果：

Shared Size:

Constant Size:

Local Size:

Max Threads Per Block:

Number of Registers:

PTX Version:

Binary Version:

simple_kernel(const int *pIn, int *pOut, int a) PASSED

Shared Size:

Constant Size:

Local Size:

Max Threads Per Block:

Number of Registers:

PTX Version:

Binary Version:

simple_kernel(const int2 *pIn, int *pOut, int a) PASSED

Shared Size:

Constant Size:

Local Size:

Max Threads Per Block:

Number of Registers:

PTX Version:

Binary Version:

simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED

▶ 涨姿势：

● cuda 使用扩展名为 .cuh 的头文件

● cuda内置结构 cudaFuncAttributes 的定义：

 // Built-in CUDA structure describing the resources of one kernel.
 // It is the output argument of cudaFuncGetAttributes().
 struct __device_builtin__ cudaFuncAttributes
 {
     size_t sharedSizeBytes; // shared memory size, in bytes
     size_t constSizeBytes;  // constant memory size, in bytes
     size_t localSizeBytes;  // local memory size, in bytes
     int maxThreadsPerBlock; // maximum number of threads per block
     int numRegs;            // number of registers
     int ptxVersion;         // PTX version number
     int binaryVersion;      // binary (machine code) version number
     int cacheModeCA;        // whether the compile option -Xptxas --dlcm=ca was used
 };

● 通过使用cuda的内置结构和函数来查看核函数使用的共享内存与寄存器数量

 // Query the resource usage of a kernel; `function` is a pointer to that kernel.
 struct cudaFuncAttributes attr;
 memset(&attr, 0, sizeof(attr));                                  // zero the struct before the query (fill value was missing)
 cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);    // request the shared-memory-preferring cache configuration
 cudaFuncGetAttributes(&attr, *function);                         // fill attr with the kernel's attributes

■ 涉及的函数

 // Host-side runtime API declaration: takes a pointer to a device function and
 // a cudaFuncCache value, and reports the outcome as a cudaError_t.
 extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

 // Device-side weak definition of cudaFuncGetAttributes; as written here it
 // ignores both arguments and always returns cudaErrorUnknown.
 // NOTE(review): the host code in this article calls cudaFuncGetAttributes from
 // host code, so this looks like a device-runtime stub rather than the host
 // declaration that is actually used - confirm against the CUDA runtime headers.
 __device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)

 {

     return cudaErrorUnknown;

 }

 // Print every field of the cudaFuncAttributes value `attr` to stdout, one per line.
 // The body is wrapped in do { ... } while (0) so that `OUTPUT_ATTR(x);` expands to
 // exactly one statement and stays correct inside an unbraced if/else.
 // The size_t fields are narrowed to int to match the %d conversions.
 // Every continued line must end in a backslash with nothing after it.
 #define OUTPUT_ATTR(attr)                                               \
     do                                                                  \
     {                                                                   \
         printf("Shared Size:   %d\n", (int)(attr).sharedSizeBytes);     \
         printf("Constant Size: %d\n", (int)(attr).constSizeBytes);      \
         printf("Local Size:    %d\n", (int)(attr).localSizeBytes);      \
         printf("Max Threads Per Block: %d\n", (attr).maxThreadsPerBlock); \
         printf("Number of Registers: %d\n", (attr).numRegs);            \
         printf("PTX Version: %d\n", (attr).ptxVersion);                 \
         printf("Binary Version: %d\n", (attr).binaryVersion);           \
     } while (0)

0_Simple__cppOverload的更多相关文章

随机推荐

SpringBoot开发案例之mail中文附件乱码
前一段时间做过一个邮件发送的服务,以前大体都测试过,文本.图片.附件都是没有问题的,可有同事反应发送的附件名称有中文乱码,类似如下截图展示: 咋一看不像乱码,抱着试试看的态度,为MimeMessage ...
Postman高级应用——流程控制、调试、公共函数、外部数据文件
postman客户端下载地址:https://www.getpostman.com/apps 目录流程控制调试公共函数外部数据文件流程控制流程控制简言之就是设置接口的执行顺序,流程控制只有 ...
python 库之lxml安装坑一个
error: command 'C:\\Users\\Admin\\AppData\\Local\\Programs\\Common\\Microsoft\\Visual C++ for Python ...
Ubuntu16.04 Using Note
I meet lots of problems when i installed and use ubuntu 16.04.below is my using note: (my operating ...
Quartz源码——JobStore保存JonDetail和Trigger源码分析（一）
我都是分析的jobStore 方式为jdbc的SimpleTrigger!RAM的方式类似分析方式! {0} :表的前缀 ,如表qrtz_trigger ,{0}== qrtz_ {1}:quartz ...
java基础——java.util.ConcurrentModificationException
在编写代码的时候,有时候会遇到List里有符合条件的的对象,就移除改对象! 但是这种操作如:使用了 List 的remove,会导致一些很严重的问题! 如下这段代码使用ArrayList: @Test ...
You Are the One DP
You Are the One Time Limit:1000MS Memory Limit:32768KB 64bit IO Format:%I64d & %I64u Sub ...
Apache shiro的简单介绍与使用(与spring整合使用）
apache shiro框架简介 Apache Shiro是一个强大而灵活的开源安全框架,它能够干净利落地处理身份认证,授权,企业会话管理和加密.现在,使用Apache Shiro的人越来越多,因为它 ...
zoj3321 circle floyd 最小生成树
Circle 断一个图是否是一个环. 思路:必有m==n,那么我们用n-1条边能够生成一棵树(即是所有点联通,则用floyd即可),然后看最后一条边的两个点是否是单边(度为一)即可 . #includ ...
zabbix灵活使用userparameters
userparameters介绍官网文献:https://www.zabbix.com/documentation/2.0/manual/config/items/userparameters 当我 ...

0_Simple__cppOverload

0_Simple__cppOverload的更多相关文章

随机推荐

热门专题