计算核函数调用时的占用率,并尝试使用 runtime 函数自动优化线程块尺寸,以便提高占用率。

▶ 源代码。

 #include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_cuda.h> const int manualBlockSize = ; // 核函数,输入数组的每个元素平方后放回
// Squares each element of `array` in place: array[i] = array[i] * array[i].
// The dynamic shared-memory declaration is not used by the computation;
// it exists only so a dynamic smem size passed at launch is accounted
// for in the occupancy calculations.
__global__ void square(int *array, int arrayCount)
{
    extern __shared__ int dynamicSmem[];

    int globalIdx = blockDim.x * blockIdx.x + threadIdx.x;
    if (globalIdx < arrayCount)
    {
        array[globalIdx] = array[globalIdx] * array[globalIdx];
    }
}

// Launches the kernel, times it, and decides whether to let a runtime
// function optimize the block size.
// Launches `square` over `size` elements, times the launch with CUDA
// events, and reports the theoretical occupancy of the configuration.
// When `automatic` is true the block size is chosen by
// cudaOccupancyMaxPotentialBlockSize; otherwise manualBlockSize is used.
// Returns 0 on success.
static int launchConfig(int *data, int size, bool automatic)
{
    int blockSize;
    int numBlocks;
    int gridSize;
    int minGridSize;
    float elapsedTime;
    double potentialOccupancy;

    // The kernel needs no dynamic shared memory.
    size_t dynamicSMemUsage = 0;

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    cudaEvent_t start;
    cudaEvent_t end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    if (automatic) // true: let the runtime pick the block size for maximum occupancy
    {
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size);
        printf("\n\tSuggested block size: %d, minimum grid size for maximum occupancy: %d\n", blockSize, minGridSize);
    }
    else
    {
        blockSize = manualBlockSize;
    }

    gridSize = (size + blockSize - 1) / blockSize; // ceiling division

    cudaEventRecord(start);
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
    cudaEventRecord(end);
    cudaDeviceSynchronize();

    cudaEventElapsedTime(&elapsedTime, start, end);
    printf("\n\tElapsed time: %4.2f ms\n", elapsedTime);

    // Occupancy computed by thread count; dividing numerator and denominator
    // by prop.warpSize gives the same ratio in units of active warps.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, dynamicSMemUsage);
    potentialOccupancy = (double)(numBlocks * blockSize) / (prop.maxThreadsPerMultiProcessor);
    printf("\n\tPotential occupancy: %4.2f %%\n", potentialOccupancy * 100);

    // Release timing events (the original version leaked them).
    cudaEventDestroy(start);
    cudaEventDestroy(end);
    return 0;
}

// Handles memory setup/teardown around the kernel call and checks results.
// Allocates and initializes an array of `count` ints (h_data[i] = i),
// runs launchConfig on a device copy, and verifies every element was
// squared. Returns 0 on success, -1 on the first mismatch.
// NOTE(review): default count value restored from the SDK sample; for
// i > 46340, i*i wraps in 32-bit int identically on host and device,
// so the comparison remains consistent.
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
    int size = count * sizeof(int);
    int *h_data = (int *)malloc(size);
    for (int i = 0; i < count; i++)
        h_data[i] = i;

    int *d_data;
    cudaMalloc(&d_data, size);
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
    // Clear the host copy so the verification below cannot pass by accident.
    memset(h_data, 0, size);

    launchConfig(d_data, count, automaticLaunchConfig);

    cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < count; i++)
    {
        if (h_data[i] != i * i)
        {
            printf("\n\tError at %d, d_data = %d\n", i, h_data[i]);
            // Free resources on the error path too (original leaked here).
            free(h_data);
            cudaFree(d_data);
            return -1;
        }
    }

    free(h_data);
    cudaFree(d_data);
    return 0;
}

int main()
{
    // Run the manual configuration first, then the automatic one, and
    // report overall pass/fail. (Removed the unused local `status`.)
    printf("\n\tStart.\n");

    printf("\n\tManual configuration test, BlockSize = %d\n", manualBlockSize);
    if (test(false))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tAutomatic configuration\n");
    if (test(true))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tTest PASSED\n");
    getchar(); // keep the console window open
    return 0;
}

▶ 输出结果

    Start.

    Manual configuration test, BlockSize = 32

    Elapsed time: 0.13 ms

    Potential occupancy: 50.00 %

    Automatic configuration

    Suggested block size: , minimum grid size for maximum occupancy: 

    Elapsed time: 0.12 ms

    Potential occupancy: 100.00 %

    Test PASSED

▶ 涨姿势

● 用到的几个 runtime 函数及其相互关系。

// driver_types.h
// Flags accepted by the block-size-optimizing occupancy functions.
#define cudaOccupancyDefault 0x00 // default behavior
#define cudaOccupancyDisableCachingOverride 0x01 // assume global caching is on and cannot be disabled

// cuda_device_runtime_api.h
// Device-side weak stub sharing its name with the host function in
// cuda_runtime.h; apparently not exercised in this sample.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}

// Device-side weak stub called by cudaOccupancyMaxActiveBlocksPerMultiprocessor
// and by cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}

// cuda_runtime.h
// Host-side template wrapper: forwards to the WithFlags variant using
// the default flag.
template<class T>
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
}

template<typename UnaryFunction, class T>
// Searches candidate block sizes (descending, warp-size granularity) for
// the one that maximizes per-SM occupancy, letting the dynamic
// shared-memory requirement vary with the block size via the supplied
// unary functor. Outputs: *minGridSize (smallest grid achieving that
// occupancy device-wide) and *blockSize (the best block size found).
// Stripped numeric literals restored to the CUDA header's values.
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
(
    int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
)
{
    cudaError_t status;

    // Device and kernel-function properties
    int device;
    struct cudaFuncAttributes attr;
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int occupancyLimit;
    int granularity;

    // Best configuration seen so far
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Per-iteration temporaries
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    // Validate inputs
    if (!minGridSize || !blockSize || !func)
        return cudaErrorInvalidValue;

    // Fetch device and kernel attributes
    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
    if (status != cudaSuccess)
        return status;
    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess)
        return status;

    // Try block sizes
    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    // 0 means "no caller-imposed limit"; also clamp to device and kernel maxima.
    if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
        blockSizeLimit = devMaxThreadsPerBlock;
    if (blockSizeLimit > attr.maxThreadsPerBlock)
        blockSizeLimit = attr.maxThreadsPerBlock;

    // blockSizeLimit is rounded up to a multiple of warpSize and decreased
    // in warpSize steps; if blockSizeLimit is below the aligned value the
    // first (and then only meaningful) trial uses blockSizeLimit itself.
    for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
    {
        blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;
        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        // Core of the occupancy calculation
        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
        if (status != cudaSuccess)
            return status;

        // Record the best valid result
        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
        if (occupancyInThreads > maxOccupancy)
        {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // 100% occupancy already reached; stop searching.
        if (occupancyLimit == maxOccupancy)
            break;
    }

    // Return the best result
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;
    return status;
}

class __cudaOccupancyB2DHelper
{
size_t n;
public:
// Wraps a constant dynamic-shared-memory size so it can be passed where
// a (block size -> smem size) unary functor is expected.
inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
// Returns the stored size regardless of the block-size argument.
inline __host__ CUDART_DEVICE size_t operator()(int)
{
return n;
}
};

// The runtime function that optimizes the block size.
// Parameters: minGridSize (out) minimum grid size for maximum occupancy,
// blockSize (out) suggested block size, func the kernel, dynamicSMemSize
// dynamic shared memory per block, blockSizeLimit cap on threads per
// block (0 = no limit). Default-argument literals restored to the CUDA
// header's values (both 0).
template<class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
(
    int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
)
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
}

0_Simple__simpleOccupancy的更多相关文章

随机推荐

  1. POJ 2407:Relatives(欧拉函数模板)

    Relatives AC代码 Relatives Time Limit: 1000MS   Memory Limit: 65536K Total Submissions: 16186   Accept ...

  2. HDU 4681 string 求最长公共子序列的简单DP+暴力枚举

    先预处理,用求最长公共子序列的DP顺着处理一遍,再逆着处理一遍. 再预处理串a和b中包含串c的子序列,当然,为了使这子序列尽可能短,会以c 串的第一个字符开始 ,c 串的最后一个字符结束 将这些起始位 ...

  3. 《DSP using MATLAB》Problem 3.21

    模拟信号经过不同的采样率进行采样后,得到不同的数字角频率,如下: 三种Fs,采样后的信号的谱 重建模拟信号,这里只显示由第1种Fs=0.01采样后序列进行重建,采用zoh.foh和spline三种方法 ...

  4. 【spring data jpa】好文储备

    [spring data jpa]带有条件的查询后分页和不带条件查询后分页实现  :  https://blog.csdn.net/lihuapiao/article/details/48782843 ...

  5. Django FBV和CBV -

    一.FBV和CBV 在Python菜鸟之路:Django 路由.模板.Model(ORM)一节中,已经介绍了几种路由的写法及对应关系,那种写法可以称之为FBV: function base view ...

  6. JSON字符串-赋张最初接触后台从map转json的方法

    **************************************** json数组: *************************************************** ...

  7. JSON与JAVA数据的转换-----从3,23到现在5.25才过去2个月,感觉时间过得那么漫长

    从3月23号去报到,期间经历了清明节,毕业论文答辩,从万达搬到东兴,五一节,毕业照,从东兴的一边搬到另外一个房间中去 2个月的时间过得如此的快啊!白驹过隙! 不要着急,不要和别人比,小龙哥写过3年代码 ...

  8. PHP com组件的使用 (环境搭建 以及测试)

    COM 组件在实际当前的软件开发中依然是比较重要,包括对于串口开发的人员,软件插件使用的人员,PHP 已经为我们添加了对于 COM的支持,可以很好的解决我们在开发中可能碰到的一些问题.一下是开发环境的 ...

  9. 解决首次访问jenkins,输入初始化默认密码之后,一直卡住问题,无法进行jenkins工具安装

    参考网址:http://www.cnblogs.com/520playboy/p/6244257.html 简介   安装系统:centos6.5 安装方式:在官网中下载jenkins.war,放到t ...

  10. bzoj 4556 [Tjoi2016&Heoi2016]字符串——后缀数组+主席树

    题目:https://www.lydsy.com/JudgeOnline/problem.php?id=4556 本来只要查 ht[ ] 数组上的前驱和后继就行,但有长度的限制.可以二分答案解决!然后 ...