0_Simple__simpleOccupancy
计算核函数调用时的占用率,并尝试使用 runtime 函数自动优化线程块尺寸,以便提高占用率。
▶ 源代码。
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_cuda.h>

// Block size used for the manual (non-automatic) launch configuration.
// 32 threads per block deliberately under-utilizes each SM, so the contrast
// with the runtime-suggested configuration is visible in the occupancy report.
const int manualBlockSize = 32;

// Kernel: squares each element of the input array in place.
__global__ void square(int *array, int arrayCount)
{
    // Declared only so a dynamic shared-memory size can be passed at launch
    // and accounted for in the occupancy calculation; never read or written.
    extern __shared__ int dynamicSmem[];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < arrayCount)
    {
        array[tid] *= array[tid];
    }
}

// Drives one kernel launch: picks the block size (manually or via the runtime),
// times the run, and reports the resulting theoretical occupancy.
// Launches the kernel, times it, and optionally lets the runtime suggest a
// block size that maximizes occupancy.
// data      - device pointer to the input/output array
// size      - element count of the array
// automatic - true: use cudaOccupancyMaxPotentialBlockSize; false: manualBlockSize
// Returns 0 on success.
static int launchConfig(int *data, int size, bool automatic)
{
    int blockSize;
    int numBlocks;
    int gridSize;
    int minGridSize;
    float elapsedTime;
    double potentialOccupancy;

    // No dynamic shared memory is requested; the same value feeds both the
    // launch and the occupancy queries so they stay consistent.
    size_t dynamicSMemUsage = 0;

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    cudaEvent_t start;
    cudaEvent_t end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    if (automatic) // let the runtime pick an occupancy-maximizing block size
    {
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size);
        printf("\n\tSuggested block size: %d, minimum grid size for maximum occupancy: %d\n", blockSize, minGridSize);
    }
    else
    {
        blockSize = manualBlockSize;
    }

    // Ceiling division so the grid covers every element.
    gridSize = (size + blockSize - 1) / blockSize;

    cudaEventRecord(start);
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
    cudaEventRecord(end);
    cudaDeviceSynchronize(); // wait for the kernel and the end event before reading the timer

    cudaEventElapsedTime(&elapsedTime, start, end);
    printf("\n\tElapsed time: %4.2f ms\n", elapsedTime);

    // Occupancy expressed in threads; dividing numerator and denominator by
    // prop.warpSize would give the identical ratio in active warps.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, dynamicSMemUsage);
    potentialOccupancy = (double)(numBlocks * blockSize) / (prop.maxThreadsPerMultiProcessor);
    printf("\n\tPotential occupancy: %4.2f %%\n", potentialOccupancy * 100);

    // The scraped original leaked these events; release them explicitly.
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    return 0;
}

// Manages host/device memory around the kernel call and checks the result.
// Allocates and initializes the data, runs the configured launch, and verifies
// that every element was squared on the device.
// automaticLaunchConfig - forwarded to launchConfig
// count                 - number of elements (default matches the NVIDIA sample)
// Returns 0 on success, 1 on a verification mismatch.
static int test(bool automaticLaunchConfig, const int count = 100000)
{
    int size = count * sizeof(int);

    int *h_data = (int *)malloc(size);
    for (int i = 0; i < count; i++)
    {
        h_data[i] = i;
    }

    int *d_data;
    cudaMalloc(&d_data, size);
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

    // Clear the host copy so the check below cannot pass on stale data.
    memset(h_data, 0, size);

    launchConfig(d_data, count, automaticLaunchConfig);

    cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

    for (int i = 0; i < count; i++)
    {
        if (h_data[i] != i * i)
        {
            printf("\n\tError at %d, d_data = %d\n", i, h_data[i]);
            free(h_data); // release resources on the failure path as well
            cudaFree(d_data);
            return 1;
        }
    }

    free(h_data);
    cudaFree(d_data);
    return 0;
} int main()
{
    printf("\n\tStart.\n");

    printf("\n\tManual configuration test, BlockSize = %d\n", manualBlockSize);
    if (test(false))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tAutomatic configuration\n");
    if (test(true))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tTest PASSED\n");
    getchar(); // keep the console window open until a key is pressed
    return 0;
}
▶ 输出结果
Start.
Manual configuration test, BlockSize = 32
Elapsed time: 0.13 ms
Potential occupancy: 50.00 %
Automatic configuration
Suggested block size: , minimum grid size for maximum occupancy:
Elapsed time: 0.12 ms
Potential occupancy: 100.00 %
Test PASSED
▶ 涨姿势
● 用到的几个 runtime 函数及其相互关系。
// driver_types.h
// 用于优化线程块尺寸的函数中的标志
#define cudaOccupancyDefault 0x00 // 默认标志
#define cudaOccupancyDisableCachingOverride 0x01 // 开启全局缓存,且不能被禁用 // cuda_device_runtime_api.h
// Device-side functions with the same names as those in cuda_runtime.h;
// apparently not used by this sample?
// Weak device-side fallback: returns an error when invoked from device code.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
} // Called by cudaOccupancyMaxActiveBlocksPerMultiprocessor and cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
} // cuda_runtime.h
// Host-side template overload: forwards to the WithFlags variant using the
// default occupancy flag.
template<class T>
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
} template<typename UnaryFunction, class T>
// Searches candidate block sizes (one warp at a time, from the largest legal
// size downward) for the one that maximizes theoretical occupancy.
// blockSizeToDynamicSMemSize maps a candidate block size to its dynamic
// shared-memory requirement; blockSizeLimit of 0 means "no caller limit".
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
(
    int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
)
{
    cudaError_t status;

    // Device and function properties
    int device;
    struct cudaFuncAttributes attr;
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int occupancyLimit;
    int granularity;

    // Best configuration found so far
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Per-candidate temporaries
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    // Validate inputs
    if (!minGridSize || !blockSize || !func)
        return cudaErrorInvalidValue;

    // Query device and kernel attributes
    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
    if (status != cudaSuccess)
        return status;
    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess)
        return status;

    // Try candidate block sizes
    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
        blockSizeLimit = devMaxThreadsPerBlock;
    if (blockSizeLimit > attr.maxThreadsPerBlock)
        blockSizeLimit = attr.maxThreadsPerBlock;

    // Round blockSizeLimit up to a multiple of warpSize and step down one warp
    // at a time; if blockSizeLimit starts below the aligned value, the first
    // iteration clamps to blockSizeLimit (only one meaningful candidate).
    for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
    {
        blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;
        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        // Core occupancy query for this candidate
        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
        if (status != cudaSuccess)
            return status;

        // Record the best result seen so far
        if ((occupancyInThreads = blockSizeToTry * occupancyInBlocks) > maxOccupancy)
        {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // 100% occupancy reached; no larger value is possible, so stop
        if (occupancyLimit == maxOccupancy)
            break;
    }

    // Return the best configuration
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;
    return status;
}

// Functor wrapping a constant dynamic shared-memory size, so the fixed-size
// entry point below can reuse the variable-size implementation above.
class __cudaOccupancyB2DHelper
{
size_t n;
public:
inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
inline __host__ CUDART_DEVICE size_t operator()(int)
{
return n;
}
}; // Runtime function that suggests an occupancy-maximizing block size
// Suggests a launch configuration that maximizes occupancy.
// minGridSize     - out: smallest grid size achieving maximum occupancy
// blockSize       - out: suggested block size
// func            - the kernel to configure
// dynamicSMemSize - per-block dynamic shared-memory usage (constant)
// blockSizeLimit  - upper bound on the block size (0 = no caller limit)
template<class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
(
    int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
)
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
}
0_Simple__simpleOccupancy的更多相关文章
随机推荐
- ostringstream的用法
使用stringstream对象简化类型转换C++标准库中的<sstream>提供了比ANSI C的<stdio.h>更高级的一些功能,即单纯性.类型安全和可扩展性.在本文中, ...
- nginx+keepalived实现负载均衡nginx的高可用
准备四台服务器 两台做主备,另外两台做访问 192.168.1.120 master 192.168.1.121 backup 192.168.1.122 nginx 192.168.1.123 ng ...
- js代码中碰到的函数
第一个--->字符串的截取substring()方法 substring(a,b)--->[a,b)区间截取字符.下标从0开始.从a下标开始,截取到b下标的前一个字符.返回一个新的字符串 ...
- leetcode:Valid Palindrome【Python版】
1.注意空字符串的处理: 2.注意是alphanumeric字符: 3.字符串添加字符直接用+就可以: class Solution: # @param s, a string # @return a ...
- C#中DateTime的缺陷 ---- 代替品DateTimeOffset
C#中的DateTime在逻辑上有个非常严重的缺陷: > var d = DateTime.Now; > var d2 = d.ToUniversalTime(); > d == d ...
- TweenMax.allTo
需要多个MC进行相同的缓动.比如下面这个游戏菜单.三个按钮的缓动是相同的,都缓动到同一个x坐标位置.然后同时有缓动出舞台. 如果有TweenLite实现的话,需要 if (is ...
- php脚本输出js代码不执行的解决办法和原理。
<?phpecho "<script>alert('我弹出来了')</script>";?> 很简单你一句话就可以在PHP里面输出JS脚本让浏览 ...
- JVM(下)
持久代:不会被 gc 给轻易回收的,创建后一直存在,持久代在堆内存里面,但是不归 java 程序使用.持久代是 动态 load 的那些 class,局部变量,去 gc 其实也 gc 不了啥 1.8 之 ...
- Jmeter参数跨线程组传递
1.利用BeanShell, 请求==>后置==>beanshellpostprocessorScripts内写:props.put("user_name"," ...
- 单变量微积分笔记20——三角替换1(sin和cos)
sin和cos的常用公式 基本公式: 半角公式: 微分公式: 积分公式: 三角替换 示例1 根据微分公式,cosxdx = dsinx 示例2 示例3 半角公式 示例1 示例2 解法1: 解法2: 综 ...