计算核函数调用的占用率，并尝试使用 runtime 函数自动优化线程块尺寸，以便提高占用率。

▶ 源代码。

 #include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_cuda.h>

// Manually chosen block size for the non-automatic test path.
// NOTE(review): the literal was lost in extraction; the original
// simpleOccupancy sample uses 32 — confirm against the sample.
const int manualBlockSize = 32;

// Kernel: squares each element of the input array in place.
__global__ void square(int *array, int arrayCount)
{
    // Declared only so a dynamic-shared-memory size can take part in the
    // occupancy experiment; the buffer itself is never touched.
    extern __shared__ int dynamicSmem[];

    const int gid = blockDim.x * blockIdx.x + threadIdx.x;
    if (gid < arrayCount)
    {
        array[gid] = array[gid] * array[gid];
    }
}
// Host helper: launches the kernel, times it, and optionally lets the
// runtime choose the block size (see launchConfig).
// Launches the square kernel, times it with CUDA events, and reports the
// potential occupancy of the chosen launch configuration.
//
// data      : device pointer to the array to be squared
// size      : number of elements in the array
// automatic : true  -> let cudaOccupancyMaxPotentialBlockSize pick blockSize
//             false -> use the hard-coded manualBlockSize
// Returns 0 (the sample reports errors only via printed output).
static int launchConfig(int *data, int size, bool automatic)
{
    int blockSize;
    int numBlocks;
    int gridSize;
    int minGridSize;
    float elapsedTime;
    double potentialOccupancy;

    // The kernel uses no dynamic shared memory.
    size_t dynamicSMemUsage = 0;

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query device 0

    cudaEvent_t start;
    cudaEvent_t end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    if (automatic)  // use the runtime to auto-tune the block size
    {
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size);
        printf("\n\tSuggested block size: %d, minimum grid size for maximum occupancy: %d\n", blockSize, minGridSize);
    }
    else
    {
        blockSize = manualBlockSize;
    }

    // Round the grid size up so every element is covered.
    gridSize = (size + blockSize - 1) / blockSize;

    cudaEventRecord(start);
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
    cudaEventRecord(end);
    cudaDeviceSynchronize();  // ensure the end event has completed before reading the timer

    cudaEventElapsedTime(&elapsedTime, start, end);
    printf("\n\tElapsed time: %4.2f ms\n", elapsedTime);

    // Occupancy computed in threads; dividing numerator and denominator by
    // prop.warpSize gives the same ratio in active warps — they are equivalent.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, dynamicSMemUsage);
    potentialOccupancy = (double)(numBlocks * blockSize) / prop.maxThreadsPerMultiProcessor;
    printf("\n\tPotential occupancy: %4.2f %%\n", potentialOccupancy * 100);

    // Release the timing events (the original sample leaked them).
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    return 0;
}
// Handles memory setup/teardown around the kernel call and checks the result.
// Allocates and fills host/device buffers, runs launchConfig, then verifies
// that every element was squared.
//
// automaticLaunchConfig : forwarded to launchConfig
// count                 : number of int elements (default from the sample;
//                         i*i wraps for i > 46340, but host and device wrap
//                         identically, so the comparison still holds)
// Returns 0 on success, 1 on a verification failure.
static int test(bool automaticLaunchConfig, const int count = 100000)
{
    int size = count * sizeof(int);
    int *h_data = (int *)malloc(size);

    for (int i = 0; i < count; i++)
        h_data[i] = i;

    int *d_data;
    cudaMalloc(&d_data, size);
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

    // Clear the host copy so the check below cannot pass by accident.
    memset(h_data, 0, size);

    launchConfig(d_data, count, automaticLaunchConfig);

    cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < count; i += 1)
    {
        if (h_data[i] != i * i)
        {
            printf("\n\tError at %d, d_data = %d\n", i, h_data[i]);
            // Release resources on the failure path too (the original leaked).
            free(h_data);
            cudaFree(d_data);
            return 1;
        }
    }

    free(h_data);
    cudaFree(d_data);
    return 0;
}

// Runs the manual configuration first, then the automatic one.
int main()
{
    printf("\n\tStart.\n");

    printf("\n\tManual configuration test, BlockSize = %d\n", manualBlockSize);
    if (test(false))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tAutomatic configuration\n");
    if (test(true))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tTest PASSED\n");
    getchar();  // keep the console window open
    return 0;
}

▶ 输出结果

    Start.

    Manual configuration test, BlockSize = 

    Elapsed time: 0.13 ms

    Potential occupancy: 50.00 %

    Automatic configuration

    Suggested block size: , minimum grid size for maximum occupancy: 

    Elapsed time: 0.12 ms

    Potential occupancy: 100.00 %

    Test PASSED

▶ 涨姿势

● 用到的几个 runtime 函数及其相互关系。

 // driver_types.h
// Flags accepted by the block-size-optimizing occupancy functions
#define cudaOccupancyDefault 0x00 // default flag
#define cudaOccupancyDisableCachingOverride 0x01 // keep global caching on; it cannot be disabled // cuda_device_runtime_api.h
// Device-side function with the same name as the one in cuda_runtime.h; apparently not used here?
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
} // called by cudaOccupancyMaxActiveBlocksPerMultiprocessor and cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
// Device-side weak stub; __NV_WEAK__ suggests the real implementation is
// linked in elsewhere by the runtime — this fallback just reports failure.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
} // cuda_runtime.h
// Host-side template overload: forwards to the *WithFlags entry point with
// the default flag.
template<class T>
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
}

// Tries each warp-aligned block size from the limit downward, computes its
// occupancy, and returns the block size that maximizes occupancy plus the
// matching minimum grid size. (Stripped literals restored to the header's
// values: zero-initialized defaults and bests, warp-rounding with warpSize-1.)
template<typename UnaryFunction, class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
(
    int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
)
{
    cudaError_t status;

    // Device and function properties
    int device;
    struct cudaFuncAttributes attr;
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int occupancyLimit;
    int granularity;

    // Best result found so far
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Per-iteration temporaries
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    // Validate input
    if (!minGridSize || !blockSize || !func)
        return cudaErrorInvalidValue;

    // Query device and kernel attributes
    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
    if (status != cudaSuccess)
        return status;
    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess)
        return status;

    // Try candidate block sizes
    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    // 0 means "no caller-imposed limit"; also clamp to the device and kernel maxima.
    if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
        blockSizeLimit = devMaxThreadsPerBlock;
    if (blockSizeLimit > attr.maxThreadsPerBlock)
        blockSizeLimit = attr.maxThreadsPerBlock;

    // blockSizeLimit is rounded UP to a multiple of warpSize, then decreased a
    // warp at a time; if blockSizeLimit is below the aligned value, the first
    // iteration clamps down to blockSizeLimit (only that one iteration differs).
    for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
    {
        blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;
        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        // Core of the occupancy computation
        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
        if (status != cudaSuccess)
            return status;

        // Record an improved result
        if ((occupancyInThreads = blockSizeToTry * occupancyInBlocks) > maxOccupancy)
        {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // 100% occupancy reached — stop searching
        if (occupancyLimit == maxOccupancy)
            break;
    }

    // Return the best configuration found
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;
    return status;
}

// Adapter turning a constant dynamic-shared-memory size into the unary
// "block size -> smem size" functor expected by the search above.
class __cudaOccupancyB2DHelper
{
    size_t n;
public:
    inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
    inline __host__ CUDART_DEVICE size_t operator()(int)
    {
        return n;
    }
};
// The runtime entry point used to optimize the block size (below).
// Parameters: minGridSize (out) minimum grid size for maximum occupancy,
// blockSize (out) suggested block size, func kernel, dynamicSMemSize dynamic
// shared memory per block, blockSizeLimit cap on threads per block
// (the header's defaults are 0, i.e. "no smem / no extra limit").
template<class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
(
    int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
)
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
}

0_Simple__simpleOccupancy的更多相关文章

随机推荐

  1. Executor 框架详解

    框架最核心的类是ThreadPoolExecutor,它是Java线程池的实现类,通过Executors工具类,可以创建3种类型的ThreadPoolExecutor: 首先附上ThreadPoolE ...

  2. Complete the Word

    ZS the Coder loves to read the dictionary. He thinks that a word is nice if there exists a substring ...

  3. ES中Module的使用

    Module 1.ES6在语言标准的层面上,实现了模块功能,成为浏览器和服务器通用的模块解决方案,完全可以取代 CommonJS 和 AMD 规范,基本特点如下: 每一个模块只加载一次, 每一个JS只 ...

  4. 实习第一天:try和catch的使用

    package wo;public class wowo{ public static void main(String[] args){ try{ // int i = 1/0; 是没有语法错误的, ...

  5. 在Win7下新建库并修改图标

    win7中在库中添加桌面方法详解 1.在空白处,鼠标右键选择新建——库. 2.命名为桌面,然后选择桌面. 3.鼠标右键选择属性. 4.点击包括文件夹. 5.选择桌面,点击包括文件夹按钮. 6.点击确定 ...

  6. WebApi_基于Token的身份验证——JWT

    JWT是啥? JWT就是一个字符串,经过加密处理与校验处理的字符串,形式为: A.B.C A由JWT头部信息header加密得到B由JWT用到的身份验证信息json数据加密得到C由A和B加密得到,是校 ...

  7. hadoop 配置文件简析

    文件名称            格式                     描述 hadoop-env.sh      bash脚本            记录hadoop要用的环境变量 core- ...

  8. 教你如何阅读Oracle数据库官方文档

    < Ask Oracle官方原创 > Oracle 官方文档 数量庞大,而且往往没有侧重点,让oracle新手看起来很费力.但是,仍有很多Oracle使用者认为任何oracle学习资料都比 ...

  9. 机器学习Hands On Lab

    fetch_data fetch_mldata默认路径是在scikit_learn_data路径下,mnist的mat文件其实直接放置到scikit_lean/mldata下面即可通过fetch_ml ...

  10. 多线程执行顺序诡异现象谈,你不知道的pthread_create

    引文:学而时习之,不亦说乎.总是忙于具体项目,业务功能的实现:关于编程本身的技能都要有些生疏了,于是就选择了几个专题做了一次温习,重点放在了多线程和多进程上,跑了一个实例,居然有新的发现: (1)多个 ...