0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc
介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。
▶ 源代码:静态
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU

// Warp vote "any": each thread stores a non-zero value in result[tx] when at
// least one thread of its warp supplied a non-zero input[tx].
// Layout: indexed by threadIdx.x only, so it is meant for a single 1-D block
// with one input/result element per thread.
// Precondition: blockDim.x is a multiple of 32, because the full-warp mask
// 0xffffffff names all 32 lanes as participants.
__global__ void voteAny(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    result[tx] = __any_sync(mask, input[tx]);
}

// Warp vote "all": each thread stores a non-zero value in result[tx] only when
// every thread of its warp supplied a non-zero input[tx].
// Same layout and precondition as voteAny.
__global__ void voteAll(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    result[tx] = __all_sync(mask, input[tx]);
}

// Cross-warp check. Every thread owns three consecutive bools of `info`
// (info must hold 3 * blockDim.x elements), which shows that a vote result can
// be scattered to arbitrary addresses. The predicate is the same for all three
// slots: tx >= warp_size * 3 / 2, i.e. "thread lies in the upper half of the
// second warp or beyond".
//   slot 0: any-vote of the predicate  -> false for warp 0, true for warps 1 and 2
//   slot 1: the thread's own predicate -> true from the middle of warp 1 onwards
//   slot 2: all-vote of the predicate  -> true for warp 2 only
// Expected triples (slot0 slot1 slot2) for a launch of 3 warps of 32 threads:
//   lane:    0 .. 15 | 16 .. 31
//   warp 0:    000   |   000
//   warp 1:    100   |   110
//   warp 2:    111   |   111
__global__ void vote3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);

    *offs = __any_sync(mask, tx >= warp_size * 3 / 2);
    *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;
    // The legacy mask-less all() is deprecated since CUDA 9; use the _sync form.
    *(offs + 2) = __all_sync(mask, tx >= warp_size * 3 / 2) ? true : false;
}

#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleVote_kernel.cuh"

#define WARP_SIZE 32
#define GROUP 4

// Fills VOTE_PATTERN[0 .. size) with the input for the any/all vote tests.
// The array is split into four quarters:
//   1st quarter: all zero
//   2nd quarter: odd indices hold their own index, even indices hold 0
//   3rd quarter: even indices hold their own index, odd indices hold 0
//   4th quarter: all 0xffffffff
// Example: size == 8 gives {0, 0, 0, 3, 4, 0, 0xffffffff, 0xffffffff}.
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    for (int i = 0; i < size / 4; i++)
        VOTE_PATTERN[i] = 0x00000000;

    for (int i = size / 4; i < size / 2; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;

    for (int i = size / 2; i < 3 * size / 4; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;

    for (int i = 3 * size / 4; i < size; i++)
        VOTE_PATTERN[i] = 0xffffffff;
}
// Checks one slice h_result[start .. end) of a vote result.
//   type == true : the slice must be all zero (error when the sum is non-zero)
//   type == false: the slice must sum to WARP_SIZE, i.e. every one of the
//                  WARP_SIZE threads wrote 1 (error otherwise)
// On error the offending slice is printed, labelled with `name`.
// Returns 1 when the slice is wrong, 0 when it is as expected.
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
    unsigned int sum = 0;
    for (int i = start; i < end; i++)
        sum += h_result[i];

    const bool failed = type ? (sum > 0) : (sum != WARP_SIZE);
    if (failed)
    {
        printf("\n\t<%s>[%d - %d]:", name, start, end - 1);
        for (int i = start; i < end; i++)
            printf("%u,", h_result[i]);
        printf("\b\n"); // backspace over the trailing comma
    }
    return failed ? 1 : 0;
}

// Verifies the result of voteAny / voteAll quarter by quarter.
//   type == true  (any test): expects the quarters to be (zero, ones, ones, ones),
//                             i.e. checkErrors is called with types (1, 0, 0, 0)
//   type == false (all test): expects the quarters to be (zero, zero, zero, ones),
//                             i.e. checkErrors is called with types (1, 1, 1, 0)
// Prints "Passed" or "Failed" and returns the number of wrong quarters.
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
    const char *name = type ? "Vote.Any" : "Vote.All";
    int error_count = 0;

    error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, true, name);
    error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, !type, name);
    error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, !type, name);
    error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, false, name);

    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
// Verifies the output of vote3. hinfo holds three bools per thread and
// totalThread is the number of threads that ran the kernel. Element i belongs
// to thread i / 3, slot i % 3:
//   slot 0 must be true exactly from the second warp on  (i >= totalThread)
//   slot 1 must be true exactly from the middle of the second warp on
//                                                       (i >= totalThread * 3 / 2)
//   slot 2 must be true exactly for the third warp       (i >= totalThread * 2)
// Prints "Passed" or "Failed" and returns the number of mismatching elements.
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
    int error_count = 0;
    for (int i = 0; i < totalThread * 3; i++)
    {
        switch (i % 3)
        {
        case 0:
            if (hinfo[i] != (i >= totalThread * 1))
                error_count++;
            break;
        case 1:
            if (hinfo[i] != (i >= totalThread * 3 / 2))
                error_count++;
            break;
        case 2:
            if (hinfo[i] != (i >= totalThread * 2))
                error_count++;
            break;
        }
    }
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}

// Runs the three vote tests on device 0 and returns EXIT_SUCCESS only when
// all of them pass. Every CUDA runtime call is checked with checkCudaErrors
// (from helper_cuda.h) so a failure is reported where it happens.
int main()
{
    printf("\n\tStart.\n");
    int totalElement;
    unsigned int *h_input, *h_result;
    unsigned int *d_input, *d_result;
    bool *dinfo = NULL, *hinfo = NULL;
    int error_count[] = { 0, 0, 0 };
    checkCudaErrors(cudaSetDevice(0));

    // Tests 1 and 2 use an array of GROUP (4) warps: one warp per pattern
    // quarter (all zero, odd non-zero, even non-zero, all non-zero).
    totalElement = WARP_SIZE * GROUP;
    h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    if (h_input == NULL || h_result == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(h_input);
        free(h_result);
        return EXIT_FAILURE;
    }
    checkCudaErrors(cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int)));
    checkCudaErrors(cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int)));
    genVoteTestPattern(h_input, totalElement);
    checkCudaErrors(cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice));

    // Test 1: any
    printf("\n\tTest 1: ");
    voteAny<<<dim3(1, 1), dim3(totalElement, 1)>>>(d_input, d_result);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);

    // Test 2: all
    printf("\n\tTest 2: ");
    voteAll<<<dim3(1, 1), dim3(totalElement, 1)>>>(d_input, d_result);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);

    // Test 3: the array is 9 warps long, but the kernel runs with only 3 warps
    // of threads; each thread writes 3 elements.
    printf("\n\tTest 3: ");
    totalElement = WARP_SIZE * 3 * 3;
    hinfo = (bool *)calloc(totalElement, sizeof(bool));
    if (hinfo == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        cudaFree(d_input);
        cudaFree(d_result);
        free(h_input);
        free(h_result);
        return EXIT_FAILURE;
    }
    checkCudaErrors(cudaMalloc((void **)&dinfo, totalElement * sizeof(bool)));
    checkCudaErrors(cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice));
    vote3<<<1, totalElement / 3>>>(dinfo, WARP_SIZE);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost));
    error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);

    // Cleanup
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_result));
    free(h_input);
    free(h_result);
    free(hinfo);
    checkCudaErrors(cudaFree(dinfo));
    printf("\t\nFinish.\n");
    getchar(); // keep the console window open until a key is pressed
    return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start.
Test 1: Passed
Test 2: Passed
Test 3: Passed
Finish.
▶ 源代码:运行时编译(删掉了相同的注释)
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU

// The kernels are declared extern "C" so that the host can look them up by
// their plain names with cuModuleGetFunction after runtime compilation.

// Warp vote "any": result[tx] is non-zero when at least one thread of the
// warp supplied a non-zero input[tx]. Indexed by threadIdx.x only (single
// 1-D block); blockDim.x must be a multiple of 32 for the full-warp mask.
extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    result[tx] = __any_sync(mask, input[tx]);
}

// Warp vote "all": result[tx] is non-zero only when every thread of the warp
// supplied a non-zero input[tx]. Same layout and precondition as voteAny.
extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    result[tx] = __all_sync(mask, input[tx]);
}

// Cross-warp check: every thread writes three consecutive bools of `info`
// (3 * blockDim.x elements in total), all derived from the predicate
// tx >= warp_size * 3 / 2:
//   slot 0: any-vote of the predicate, slot 1: the thread's own predicate,
//   slot 2: all-vote of the predicate.
extern "C" __global__ void vote3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);

    *offs = __any_sync(mask, tx >= warp_size * 3 / 2);
    *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;
    // The legacy mask-less all() is deprecated since CUDA 9; use the _sync form.
    *(offs + 2) = __all_sync(mask, tx >= warp_size * 3 / 2) ? true : false;
}

#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h>

#define WARP_SIZE 32
#define GROUP 4

// Fills VOTE_PATTERN[0 .. size) with the vote test input, one pattern per
// quarter: all zero / odd indices non-zero / even indices non-zero /
// all 0xffffffff. size == 8 gives {0, 0, 0, 3, 4, 0, 0xffffffff, 0xffffffff}.
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    for (int i = 0; i < size / 4; i++)
        VOTE_PATTERN[i] = 0x00000000;

    for (int i = size / 4; i < size / 2; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;

    for (int i = size / 2; i < 3 * size / 4; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;

    for (int i = 3 * size / 4; i < size; i++)
        VOTE_PATTERN[i] = 0xffffffff;
}

// Checks one slice h_result[start .. end) of a vote result.
//   type == true : the slice must be all zero
//   type == false: the slice must sum to WARP_SIZE (every thread wrote 1)
// Prints the slice, labelled with `name`, when it is wrong.
// Returns 1 on error, 0 otherwise.
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
    unsigned int sum = 0;
    for (int i = start; i < end; i++)
        sum += h_result[i];

    const bool failed = type ? (sum > 0) : (sum != WARP_SIZE);
    if (failed)
    {
        printf("\n\t<%s>[%d - %d]:", name, start, end - 1);
        for (int i = start; i < end; i++)
            printf("%u,", h_result[i]);
        printf("\b\n"); // backspace over the trailing comma
    }
    return failed ? 1 : 0;
}

// Verifies the result of voteAny / voteAll quarter by quarter.
//   type == true  (any test): quarters must be (zero, ones, ones, ones)
//   type == false (all test): quarters must be (zero, zero, zero, ones)
// Prints "Passed" or "Failed" and returns the number of wrong quarters.
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
    const char *name = type ? "Vote.Any" : "Vote.All";
    int error_count = 0;

    error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, true, name);
    error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, !type, name);
    error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, !type, name);
    error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, false, name);

    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
// Verifies the output of vote3. hinfo holds three bools per thread and
// totalThread is the number of threads that ran the kernel. Element i belongs
// to thread i / 3, slot i % 3:
//   slot 0 must be true exactly when i >= totalThread
//   slot 1 must be true exactly when i >= totalThread * 3 / 2
//   slot 2 must be true exactly when i >= totalThread * 2
// Prints "Passed" or "Failed" and returns the number of mismatching elements.
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
    int error_count = 0;
    for (int i = 0; i < totalThread * 3; i++)
    {
        switch (i % 3)
        {
        case 0:
            if (hinfo[i] != (i >= totalThread * 1))
                error_count++;
            break;
        case 1:
            if (hinfo[i] != (i >= totalThread * 3 / 2))
                error_count++;
            break;
        case 2:
            if (hinfo[i] != (i >= totalThread * 2))
                error_count++;
            break;
        }
    }
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}

// Compiles the kernel file to PTX at run time, loads it as a module and runs
// the three vote tests through the driver API. Returns EXIT_SUCCESS only when
// all tests pass.
// NOTE(review): checkCudaErrors is assumed to be the CUresult-checking macro
// made available through nvrtc_helper.h - confirm against the samples' headers.
int main()
{
    printf("\n\tStart.\n");
    int totalElement;
    unsigned int *h_input, *h_result;
    CUdeviceptr d_input, d_result;
    bool *hinfo = NULL;
    CUdeviceptr dinfo;
    int error_count[] = { 0, 0, 0 };

    // Compile to PTX. (1, NULL) stand in for main's argc/argv.
    // NOTE(review): the trailing 0 is presumably the "requires cooperative
    // groups headers" flag of compileFileToPTX - confirm against nvrtc_helper.h.
    char *ptx;
    size_t ptxSize;
    // A writable array, because the helper takes a non-const char *.
    char kernel_file[] = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";
    compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);
    CUmodule module = loadPTX(ptx, 1, NULL);

    totalElement = WARP_SIZE * GROUP;
    h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    if (h_input == NULL || h_result == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(h_input);
        free(h_result);
        return EXIT_FAILURE;
    }
    checkCudaErrors(cuMemAlloc(&d_input, totalElement * sizeof(unsigned int)));
    checkCudaErrors(cuMemAlloc(&d_result, totalElement * sizeof(unsigned int)));
    genVoteTestPattern(h_input, totalElement);
    checkCudaErrors(cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int)));

    // Test 1: any
    printf("\n\tTest 1: ");
    dim3 gridBlock(1, 1);
    dim3 threadBlock(totalElement, 1);
    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "voteAny"));
    void *arr1[] = { (void *)&d_input, (void *)&d_result };
    // Arguments after the block dimensions: shared memory bytes, stream,
    // kernel parameters, extra options.
    checkCudaErrors(cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0));
    checkCudaErrors(cuCtxSynchronize());
    checkCudaErrors(cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int)));
    error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);

    // Test 2: all
    printf("\n\tTest 2: ");
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "voteAll"));
    checkCudaErrors(cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0));
    checkCudaErrors(cuCtxSynchronize());
    checkCudaErrors(cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int)));
    error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);

    // Test 3: 9 warps of elements, 3 warps of threads (3 elements per thread)
    printf("\n\tTest 3: ");
    totalElement = WARP_SIZE * 3 * 3;
    hinfo = (bool *)calloc(totalElement, sizeof(bool));
    if (hinfo == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        cuMemFree(d_input);
        cuMemFree(d_result);
        free(h_input);
        free(h_result);
        return EXIT_FAILURE;
    }
    checkCudaErrors(cuMemAlloc(&dinfo, totalElement * sizeof(bool)));
    checkCudaErrors(cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool)));
    threadBlock = dim3(totalElement / 3, 1); // change the thread block size
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vote3"));
    int size = WARP_SIZE;
    void *arr2[] = { (void *)&dinfo, (void *)&size };
    checkCudaErrors(cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr2[0], 0));
    checkCudaErrors(cuCtxSynchronize());
    checkCudaErrors(cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool)));
    error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);

    // Cleanup
    checkCudaErrors(cuMemFree(d_input));
    checkCudaErrors(cuMemFree(d_result));
    free(h_input);
    free(h_result);
    free(hinfo);
    checkCudaErrors(cuMemFree(dinfo));
    printf("\t\nFinish.\n");
    getchar(); // keep the console window open until a key is pressed
    return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start.
> Using CUDA Device [0]: GeForce GTX
> GPU Device has SM 6.1 compute capability
Test 1: Passed
Test 2: Passed
Test 3: Passed
Finish.
▶ 涨姿势
● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。
0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章
随机推荐
- IIS7中Ajax.AjaxMethod无效的原因及解决方法
使用Ajax.AjaxMethod方法在asp.net的服务器下一切正常,用iis的时候,js中总是cs类找不到,具体的解决方法如下,遇到类似情况的朋友可以参考下 最近做用Ajax.AjaxMetho ...
- 20155207 2006-2007-2 《Java程序设计》第5周学习总结
20155207 2006-2007-2 <Java程序设计>第5周学习总结 教材学习内容总结 第八章 语法与继承架构 Java中的错误以对象方式呈现为 java.lang.Throwab ...
- The Suspects 并查集
Severe acute respiratory syndrome (SARS), an atypical pneumonia of unknown aetiology, was recognized ...
- 【分形】【洛谷P1498】
https://www.luogu.org/problemnew/show/P1498 题目描述 自从到了南蛮之地,孔明不仅把孟获收拾的服服帖帖,而且还发现了不少少数民族的智慧,他发现少数民族的图腾往 ...
- stardog graphql 简单操作
预备环境: 下载stardog 软件包 graphql 查询地址 创建一个简单数据库 ./stardog-admin db create -nstarwars graphql 查询方式 http 地址 ...
- 编辑文章 - 博客频道 - CSDN.NET
站点连接 :http://www.gaoshou.me/uid/19125624 不用不知道,一用吓一跳. 每一个月的手机话费不用愁了. 仅限苹果手机 1.同步请求能够从因特网请求数据.一旦发送 ...
- ResourceBundle介绍
介绍: ResourceBundle类主要作用是读取属性文件,读取属性文件时可以直接指定属性文件的名称(指定名称时不需要文件的后缀),也可以根据Locale所指定的区域码来选取指定的资源文件: Res ...
- RAW+ASM 的RAC 安装文档
实验平台:Oracle 10gR2 RAC + RHEL 4.0 +VMWare GSX 3.2.0 安装步骤: 1.安装前准备及OS安装配置 2.安装Oracle 10gR2 clusterware ...
- bzoj1072排列
题目:https://www.lydsy.com/JudgeOnline/problem.php?id=1072 好像是这方面的裸题. 整除k 要想转移需要记录下 达到模k所有余数 的方案数. 为了生 ...
- Oracle 11G 单机asm安装
http://sugarlovecxq.blog.51cto.com/6707742/1702092/