0_Simple__simpleOccupancy
计算核函数调用时的占用率,并尝试使用 runtime 函数自动优化线程块尺寸,以便提高占用率。
▶ 源代码。
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_cuda.h>

// Block size for the manual (non-automatic) configuration path.
// NOTE(review): the scraped source dropped the literal; the original NVIDIA
// simpleOccupancy sample uses 32 — deliberately small so the manual
// configuration under-occupies the device, contrasting with the automatic one.
const int manualBlockSize = 32;
// Kernel: squares each element of the array in place.
// The dynamic shared memory declaration is not used by the computation; it is
// kept so the occupancy calculations can account for dynamic shared memory.
__global__ void square(int *array, int arrayCount)
{
    extern __shared__ int dynamicSmem[];

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < arrayCount)
    {
        array[idx] *= array[idx];
    }
}

// Launches the kernel, times it, and optionally uses the runtime occupancy
// API to pick the block size.
// Configures and launches the square kernel, times the run with CUDA events,
// and reports the theoretical occupancy achieved by the chosen block size.
//
// data:      device pointer to the array to square
// size:      number of elements in the array
// automatic: when true, let cudaOccupancyMaxPotentialBlockSize pick the block
//            size; otherwise use the fixed manualBlockSize.
// Returns 0 on success.
static int launchConfig(int *data, int size, bool automatic)
{
    int blockSize;
    int numBlocks;
    int gridSize;
    int minGridSize;
    float elapsedTime;
    double potentialOccupancy;
    size_t dynamicSMemUsage = 0;  // the kernel requests no dynamic shared memory

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    cudaEvent_t start;
    cudaEvent_t end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    if (automatic)  // let the runtime suggest the block size for maximum occupancy
    {
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size);
        printf("\n\tSuggested block size: %d, minimum grid size for maximum occupancy: %d\n", blockSize, minGridSize);
    }
    else
    {
        blockSize = manualBlockSize;
    }

    // Ceil-div so every element is covered even when size % blockSize != 0.
    gridSize = (size + blockSize - 1) / blockSize;

    cudaEventRecord(start);
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
    cudaEventRecord(end);
    cudaDeviceSynchronize();  // also makes the 'end' event safe to query below

    cudaEventElapsedTime(&elapsedTime, start, end);
    printf("\n\tElapsed time: %4.2f ms\n", elapsedTime);

    // Occupancy computed in threads; dividing numerator and denominator by
    // prop.warpSize would give the equivalent ratio in active warps.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, dynamicSMemUsage);
    potentialOccupancy = (double)(numBlocks * blockSize) / (prop.maxThreadsPerMultiProcessor);
    printf("\n\tPotential occupancy: %4.2f %%\n", potentialOccupancy * 100);

    // Fix: the original leaked both events.
    cudaEventDestroy(start);
    cudaEventDestroy(end);
    return 0;
}
// Handles memory setup/teardown around the kernel call and verifies results.
// Allocates host/device buffers, fills the input with 0..count-1, runs the
// launch, and verifies that every element came back squared.
// automaticLaunchConfig selects the block-size strategy; count is the number
// of elements (default matches the NVIDIA sample).
// Returns 0 on success, 1 on a mismatch.
// NOTE(review): i * i overflows int for i >= 46341, but it overflows
// identically on host and device, so the comparison still matches the sample.
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
    int size = count * sizeof(int);
    int *h_data = (int *)malloc(size);

    for (int i = 0; i < count; i++)
    {
        h_data[i] = i;
    }

    int *d_data;
    cudaMalloc(&d_data, size);
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
    memset(h_data, 0, size);  // so a failed copy-back cannot pass the check

    launchConfig(d_data, count, automaticLaunchConfig);

    cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

    int status = 0;
    for (int i = 0; i < count; i += 1)
    {
        if (h_data[i] != i * i)
        {
            printf("\n\tError at %d, d_data = %d\n", i, h_data[i]);
            status = 1;  // fix: the original returned here, leaking h_data and d_data
            break;
        }
    }

    free(h_data);
    cudaFree(d_data);
    return status;
}

int main()
{
    // Run the manual configuration first, then the automatic one, so the two
    // occupancy reports can be compared side by side.
    printf("\n\tStart.\n");

    printf("\n\tManual configuration test, BlockSize = %d\n", manualBlockSize);
    if (test(false))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tAutomatic configuration\n");
    if (test(true))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tTest PASSED\n");
    getchar();  // keep the console window open (fix: removed unused local 'status')
    return 0;
}
▶ 输出结果
Start.
Manual configuration test, BlockSize = 32
Elapsed time: 0.13 ms
Potential occupancy: 50.00 %
Automatic configuration
Suggested block size: , minimum grid size for maximum occupancy:
Elapsed time: 0.12 ms
Potential occupancy: 100.00 %
Test PASSED
▶ 涨姿势
● 用到的几个 runtime 函数及其相互关系。
// driver_types.h
// Flags accepted by the block-size-optimizing occupancy functions
#define cudaOccupancyDefault 0x00 // default flag
#define cudaOccupancyDisableCachingOverride 0x01 // assume global caching is enabled and cannot be disabled // cuda_device_runtime_api.h
// Device-side weak stub with the same name as the function in cuda_runtime.h;
// seemingly unused here?
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
} // Called by cudaOccupancyMaxActiveBlocksPerMultiprocessor and by cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
// Device-side weak stub; the real implementation lives in the runtime.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
} // cuda_runtime.h
// Host-side template overload from cuda_runtime.h: forwards to the WithFlags
// variant using the default occupancy flag.
template<class T>
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
} template<typename UnaryFunction, class T>
// Searches over candidate block sizes (one warp at a time, largest first) and
// returns the block size that maximizes per-SM thread occupancy, plus the
// minimum grid size needed to reach that occupancy.
// blockSizeToDynamicSMemSize maps a candidate block size to the dynamic
// shared-memory requirement for that size.
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
(
    int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
)
{
    cudaError_t status;

    // Device and function properties
    int device;
    struct cudaFuncAttributes attr;
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int occupancyLimit;
    int granularity;

    // Best configuration found so far
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Temporaries
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    // Validate inputs
    if (!minGridSize || !blockSize || !func)
        return cudaErrorInvalidValue;

    // Query device and kernel attributes
    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
    if (status != cudaSuccess)
        return status;
    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess)
        return status;

    // Try candidate block sizes
    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    // blockSizeLimit == 0 means "no caller-imposed limit"; clamp to what the
    // device and the kernel itself allow.
    if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
        blockSizeLimit = devMaxThreadsPerBlock;
    if (blockSizeLimit > attr.maxThreadsPerBlock)
        blockSizeLimit = attr.maxThreadsPerBlock;

    // Round blockSizeLimit up to a multiple of warpSize, then step down one
    // warp at a time. If blockSizeLimit is below the aligned value, the first
    // trial uses blockSizeLimit itself (only one iteration differs).
    for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
    {
        blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;

        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        // Core occupancy query for this candidate
        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
        if (status != cudaSuccess)
            return status;

        // Keep the configuration with the highest thread occupancy
        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
        if (occupancyInThreads > maxOccupancy)
        {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // 100% occupancy reached — stop early
        if (occupancyLimit == maxOccupancy)
            break;
    }

    // Return the best configuration found
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;
    return status;
}

class __cudaOccupancyB2DHelper
{
size_t n;
public:
// Wraps a constant dynamic-shared-memory size so it can be passed where a
// "block size -> smem size" functor is expected.
inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
inline __host__ CUDART_DEVICE size_t operator()(int)
{
return n;
}
}; // Runtime function that optimizes the block size
// Runtime helper that suggests a block size maximizing occupancy.
// Parameters: out minGridSize (smallest grid achieving maximum occupancy),
// out blockSize, kernel func, dynamic shared memory size dynamicSMemSize,
// and blockSizeLimit — the largest block size to consider (0 = no limit
// beyond device/kernel caps; the original comment mislabeled this as the
// total thread count).
template<class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
(
    int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
)
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
}
0_Simple__simpleOccupancy的更多相关文章
随机推荐
- C#/.NET 使用 CommandLineParser 来标准化地解析命令行
CommandLineParser 是一款用于解析命令行参数的 NuGet 包.你只需要关注你的业务,而命令行解析只需要极少量的配置代码. 本文将介绍如何使用 CommandLineParser 高效 ...
- 《DSP using MATLAB》Problem 3.17
用差分方程两边进行z变换,再变量带换得到频率响应函数(或转移函数,即LTI系统脉冲响应的DTFT). 代码: %% ------------------------------------------ ...
- python list 去掉重复元素
貌似用遍历最方便. http://www.cnblogs.com/tudas/p/python-delete-duplicate-element-from-list.html
- android BSP移植综述
将android移植到特定硬件平台上,其核心是bsp的搭建工作,bsp是板级支持包,并不是特定某个文件,而是从功能上理解的一种硬件适配软件包,它的核心就是: 1. linux内核硬件相关部分(主要是l ...
- statik golang 静态资源嵌入二进制文件工具使用(docker 构建)
将静态资源打包进二进制文件有好多方便的地方 方便客户演示 代码简单加密 运行方便 statik 就是一款在golang 中用的比较多,nodejs 有一款pkg (oclif 就推荐使用此工具) ...
- adnanh webhook 框架execute-command 以及参数传递处理
adnanh webhook是一个很不错的webhook 实现,方便灵活. adnanh webhook 支持以下功能: 接收请求 解析header 以及负载以及查询变量 规则检查 执行命令 下面 ...
- javabean(实体类)转Map类型
javabean(实体类)转Map类型 从网上"風亦飞"的导出EXCEL的源代码提取出来的.认为非常好用.分享一下给大家,主要看beanToMap方法就OK了 /*以下是从poi导 ...
- 引用,引用形參,指针形參与指向指针的引用形參,内存泄露及free相关
(另:关于"引用"更具体的讨论.见此.) 由做UVa133引发的一系列问题及讨论 1.引用类型 C++ Primer P51 引用就是对象的还有一个名字,使用多个变量名指向同 ...
- MySQL中UNSIGNED和ZEROFILL的介绍
UNSIGNED: 将数字类型无符号化,这与C和C++这些程序语言的unsigned含义相同. INT的类型范围-2 147 483 648~2 147 483 647 INT UNSIGNED范围0 ...
- mysql的一些 参数查询
1 查询 事务 超时时间: SHOW GLOBAL VARIABLES LIKE 'innodb_lock_wait_timeout'; (默认innodb引擎事务的超时时间) 2 查询事务隔离级别 ...