0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc
介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。
▶ 源代码:静态
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU

// Warp-vote demo kernels. Each kernel expects a single 1-D block made of whole
// warps and votes with a full-warp participation mask.
// NOTE: CUDA 9.0 replaced the legacy __any/__all with __any_sync/__all_sync,
// which take an explicit lane mask.

// Every lane gets non-zero iff ANY lane in its warp read a non-zero input.
__global__ void voteAny(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __any_sync(mask, input[tx]);
}

// Every lane gets non-zero iff ALL lanes in its warp read a non-zero input.
__global__ void voteAll(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __all_sync(mask, input[tx]);
}

// Cross-warp check: each thread writes 3 bools at info[tx*3 .. tx*3+2],
// showing that a vote result can be fanned out to arbitrary locations.
// Expects blockDim.x == 3 * warp_size (three warps: tx in [0,32), [32,64), [64,96)).
// The shared predicate is tx >= warp_size*3/2 (true for the upper half of threads):
//   slot 0: any()  -> false for warp 0, true for warps 1 and 2
//   slot 1: raw    -> true only for tx >= warp_size*3/2
//   slot 2: all()  -> true only for warp 2, where every lane satisfies it
__global__ void vote3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);  // each thread owns 3 consecutive slots
    *offs = __any_sync(mask, tx >= warp_size * 3 / 2);
    *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;
    *(offs + 2) = __all_sync(mask, tx >= warp_size * 3 / 2) ? true : false;
}
#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleVote_kernel.cuh" #define WARP_SIZE 32
#define GROUP 4

// Builds the vote test input in four equal quarters
// (size == 8 yields {0, 0, 0, 3, 4, 0, 0xffffffff, 0xffffffff}):
//   quarter 1: all zero              -> any() == 0, all() == 0
//   quarter 2: odd indices non-zero  -> any() == 1, all() == 0
//   quarter 3: even indices non-zero -> any() == 1, all() == 0
//   quarter 4: all non-zero          -> any() == 1, all() == 1
// Assumes size is a multiple of 4 (here WARP_SIZE * GROUP).
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    for (int i = 0; i < size / 4; i++)
        VOTE_PATTERN[i] = 0x00000000;
    for (int i = size / 4; i < size / 2; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
    for (int i = size / 2; i < 3 * size / 4; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
    for (int i = 3 * size / 4; i < size; i++)
        VOTE_PATTERN[i] = 0xffffffff;
}
// Checks one warp-sized quarter of the vote result.
// type == 1: every element must be 0 (the sum over [start, end) must be 0).
// type == 0: every element must be 1 (the sum must equal WARP_SIZE).
// On mismatch, dumps the offending range; returns 1 on failure, 0 on success.
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
    int i, sum;
    for (sum = 0, i = start; i < end; i++)
        sum += h_result[i];
    if ((type && sum > 0) || (!type && sum != WARP_SIZE))
    {
        printf("\n\t<%s>[%d - %d]:", name, start, end - 1);
        for (i = start; i < end; i++)
            printf("%d,", h_result[i]);
        printf("\b\n");
    }
    return type ? (sum > 0) : (sum != WARP_SIZE);
}
// Dispatcher below: type == 1 checks the quarters with checkErrors types
// (1,0,0,0) (the voteAny pattern); type == 0 uses (1,1,1,0) (the voteAll pattern).
// Dispatches checkErrors over the four quarters of the result array.
// type == 1 expects the voteAny pattern (zero, one, one, one),
// type == 0 expects the voteAll pattern (zero, zero, zero, one).
// Prints Passed/Failed and returns the number of failing quarters.
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
    int error_count = 0;
    // Quarter 1 is all-zero input: both any() and all() must yield 0.
    error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, 1, "Vote.Any");
    // Quarters 2 and 3 are partially non-zero: any() yields 1, all() yields 0.
    error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    // Quarter 4 is all non-zero: both any() and all() must yield 1.
    error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, 0, "Vote.Any");
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
// Checks the vote3 output: hinfo holds 3 bools per thread, for totalThread
// threads (totalThread == 3 * WARP_SIZE, so hinfo element i belongs to thread
// i / 3, slot i % 3). For each slot the expected value follows the kernel:
//   slot 0 (any): set iff the thread is in warp 1 or 2  -> i >= totalThread
//   slot 1 (raw): set iff the thread is in the top half -> i >= totalThread * 3 / 2
//   slot 2 (all): set iff the thread is in the last warp-> i >= totalThread * 2
// Prints Passed/Failed and returns the number of mismatching elements.
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
    int i, error_count;
    for (i = error_count = 0; i < totalThread * 3; i++)
    {
        switch (i % 3)
        {
        case 0:  // any(): i = 3*tx, so i >= totalThread <=> tx >= WARP_SIZE
            if (hinfo[i] != (i >= totalThread * 1))
                error_count++;
            break;
        case 1:  // raw predicate: i = 3*tx+1, threshold lands at tx >= WARP_SIZE*3/2
            if (hinfo[i] != (i >= totalThread * 3 / 2))
                error_count++;
            break;
        case 2:  // all(): i = 3*tx+2, threshold lands at tx >= WARP_SIZE*2
            if (hinfo[i] != (i >= totalThread * 2))
                error_count++;
            break;
        }
    }
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
int main()
{
    printf("\n\tStart.\n");
    int totalElement;
    unsigned int *h_input, *h_result;
    unsigned int *d_input, *d_result;
    bool *dinfo = NULL, *hinfo = NULL;
    int error_count[3] = { 0, 0, 0 };
    cudaSetDevice(0);

    // Tests 1 & 2: an array of 4 warps, split into the 4 vote groups built by
    // genVoteTestPattern (all zero / odd non-zero / even non-zero / all non-zero).
    totalElement = WARP_SIZE * GROUP;
    h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int));
    cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));
    genVoteTestPattern(h_input, totalElement);
    cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice);

    // Test 1: __any_sync — expect the (0,1,1,1) quarter pattern.
    printf("\n\tTest 1: ");
    voteAny << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result);
    cudaDeviceSynchronize();
    cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);

    // Test 2: __all_sync — expect the (0,0,0,1) quarter pattern.
    printf("\n\tTest 2: ");
    voteAll << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result);
    cudaDeviceSynchronize();
    cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);

    // Test 3: a bool array worth 9 warps of elements, written by 3 warps of
    // threads — each thread fans its vote results out to 3 elements.
    printf("\n\tTest 3: ");
    totalElement = WARP_SIZE * 3 * 3;
    hinfo = (bool *)calloc(totalElement, sizeof(bool));
    cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));
    cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);
    vote3 << <1, totalElement / 3 >> > (dinfo, WARP_SIZE);
    cudaDeviceSynchronize();
    cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);
    error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);

    // Cleanup.
    cudaFree(d_input);
    cudaFree(d_result);
    free(h_input);
    free(h_result);
    free(hinfo);
    cudaFree(dinfo);
    printf("\t\nFinish.\n");
    getchar();
    return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start.
Test 1: Passed
Test 2: Passed
Test 3: Passed
Finish.
▶ 源代码:运行时编译(删掉了相同的注释)
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU

// Runtime-compiled (NVRTC) variants of the warp-vote kernels. extern "C"
// prevents name mangling so cuModuleGetFunction can locate them by name.

// Every lane gets non-zero iff ANY lane in its warp read a non-zero input.
extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __any_sync(mask, input[tx]);
}

// Every lane gets non-zero iff ALL lanes in its warp read a non-zero input.
extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __all_sync(mask, input[tx]);
}

// Cross-warp check: each thread writes 3 bools at info[tx*3 .. tx*3+2].
// Expects blockDim.x == 3 * warp_size; predicate is tx >= warp_size*3/2.
extern "C" __global__ void vote3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);  // each thread owns 3 consecutive slots
    *offs = __any_sync(mask, tx >= warp_size * 3 / 2);
    *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;
    *(offs + 2) = __all_sync(mask, tx >= warp_size * 3 / 2) ? true : false;
}
#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h> #define WARP_SIZE 32
#define GROUP 4

// Builds the vote test input in four equal quarters
// (size == 8 yields {0, 0, 0, 3, 4, 0, 0xffffffff, 0xffffffff}):
//   quarter 1: all zero; quarter 2: odd indices non-zero;
//   quarter 3: even indices non-zero; quarter 4: all non-zero.
// Assumes size is a multiple of 4 (here WARP_SIZE * GROUP).
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    for (int i = 0; i < size / 4; i++)
        VOTE_PATTERN[i] = 0x00000000;
    for (int i = size / 4; i < size / 2; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
    for (int i = size / 2; i < 3 * size / 4; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
    for (int i = 3 * size / 4; i < size; i++)
        VOTE_PATTERN[i] = 0xffffffff;
}

// Checks one warp-sized quarter of the vote result; type == 1 expects all
// zeros, type == 0 expects all ones (sum == WARP_SIZE). Returns 1 on failure.
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
    // Sum the range; type == 1 demands sum == 0 (every element 0),
    // type == 0 demands sum == WARP_SIZE (every element 1).
    int i, sum;
    for (sum = 0, i = start; i < end; i++)
        sum += h_result[i];
    if ((type && sum > 0) || (!type && sum != WARP_SIZE))
    {
        printf("\n\t<%s>[%d - %d]:", name, start, end - 1);
        for (i = start; i < end; i++)
            printf("%d,", h_result[i]);
        printf("\b\n");
    }
    return type ? (sum > 0) : (sum != WARP_SIZE);
}

// Dispatches checkErrors over the four quarters: type == 1 expects the
// voteAny pattern (1,0,0,0), type == 0 the voteAll pattern (1,1,1,0).
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
    int error_count = 0;
    // Quarter 1 is all-zero input: both any() and all() must yield 0.
    error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, 1, "Vote.Any");
    // Quarters 2 and 3 are partially non-zero: any() yields 1, all() yields 0.
    error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    // Quarter 4 is all non-zero: both any() and all() must yield 1.
    error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, 0, "Vote.Any");
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
// Checks the vote3 output: hinfo holds 3 bools per thread for totalThread
// threads (element i belongs to thread i / 3, slot i % 3):
//   slot 0 (any): set iff i >= totalThread        (warps 1 and 2)
//   slot 1 (raw): set iff i >= totalThread * 3 / 2 (upper half of threads)
//   slot 2 (all): set iff i >= totalThread * 2    (last warp only)
// Prints Passed/Failed and returns the number of mismatching elements.
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
    int i, error_count;
    for (i = error_count = 0; i < totalThread * 3; i++)
    {
        switch (i % 3)
        {
        case 0:  // any(): i = 3*tx, so i >= totalThread <=> tx >= WARP_SIZE
            if (hinfo[i] != (i >= totalThread * 1))
                error_count++;
            break;
        case 1:  // raw predicate: threshold lands at tx >= WARP_SIZE*3/2
            if (hinfo[i] != (i >= totalThread * 3 / 2))
                error_count++;
            break;
        case 2:  // all(): threshold lands at tx >= WARP_SIZE*2
            if (hinfo[i] != (i >= totalThread * 2))
                error_count++;
            break;
        }
    }
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
int main()
{
    printf("\n\tStart.\n");
    int totalElement;
    unsigned int *h_input, *h_result;
    CUdeviceptr d_input, d_result;  // driver-API device handles (unsigned long long)
    bool *hinfo = NULL;
    CUdeviceptr dinfo;
    int error_count[3] = { 0, 0, 0 };

    // Compile the kernel source to PTX with NVRTC and load it as a module
    // (loadPTX also creates the context, so no cudaSetDevice call is needed).
    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";
    compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);  // (1, NULL): argc/argv forwarded to the helper
    CUmodule module = loadPTX(ptx, 1, NULL);

    // Tests 1 & 2: an array of 4 warps, split into the 4 vote groups.
    totalElement = WARP_SIZE * GROUP;
    h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));
    cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));
    genVoteTestPattern(h_input, totalElement);
    cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int));

    // Test 1: __any_sync — expect the (0,1,1,1) quarter pattern.
    printf("\n\tTest 1: ");
    dim3 gridBlock(1, 1);
    dim3 threadBlock(totalElement, 1);
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, module, "voteAny");
    void *arr1[] = { (void *)&d_input, (void *)&d_result };
    cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
    error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);

    // Test 2: __all_sync — expect the (0,0,0,1) quarter pattern.
    printf("\n\tTest 2: ");
    cuModuleGetFunction(&kernel_addr, module, "voteAll");
    cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
    error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);

    // Test 3: 3 warps of threads, each fanning its votes out to 3 bools.
    printf("\n\tTest 3: ");
    totalElement = WARP_SIZE * 3 * 3;
    hinfo = (bool *)calloc(totalElement, sizeof(bool));
    cuMemAlloc(&dinfo, totalElement * sizeof(bool));
    cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));
    threadBlock = dim3(totalElement / 3, 1);  // shrink the block to 3 warps
    cuModuleGetFunction(&kernel_addr, module, "vote3");
    int size = WARP_SIZE;
    void *arr2[] = { (void *)&dinfo, (void *)&size };
    cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr2[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));
    error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);

    // Cleanup.
    cuMemFree(d_input);
    cuMemFree(d_result);
    free(h_input);
    free(h_result);
    free(hinfo);
    cuMemFree(dinfo);
    printf("\t\nFinish.\n");
    getchar();
    return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start.
> Using CUDA Device [0]: GeForce GTX
> GPU Device has SM 6.1 compute capability
Test 1: Passed
Test 2: Passed
Test 3: Passed
Finish.
▶ 涨姿势
● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。
0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章
随机推荐
- 代理模式及Spring AOP (一)
一.代理模式 在不更改源码的前提下,加入新功能,通常需要用到代理设计模式. 代理设计模式分类: 静态代理 动态代理 jdk动态代理 cglib动态代理 其中spring AOP的底层用的是动态代理.其 ...
- Vue中的“混合”——mixins使用方法
混合是一种灵活的分布式复用 Vue 组件的方式.混合对象可以包含任意组件选项.以组件使用混合对象时,所有混合对象的选项将被混入该组件本身的选项.当组件和混合对象含有同名选项时,这些选项将以恰当的方式混 ...
- Android USB gadget框架学习笔记
一 Gadget框架结构 kernel/drivers/usb/gadget,这个目录是android下usbgadget的主要目录. Gadget功能组织单元:主要文件android.c,usb g ...
- oracle 以及 sql server mysql 空值默认值修改
在SQL Server Oracle MySQL当数据库中查出某值为NULL怎么办? 1.MSSQL: ISNULL() 语法 ISNULL ( check_expression , replacem ...
- 02.将uboot,kernel,rootfs下载到开发板上
转载,侵删 将uboot,kernel,rootfs下载到开发板上 1.为什么要下载 所谓下载,也称烧录,部署. 1.1.什么是u-boot Hi3518EV200 单板的 Bootloader 采用 ...
- vuex 知识点
Action 类似于 mutation,不同在于: 1.Action 提交的是 mutation,而不是直接变更状态. 2.Action 可以包含任意异步操作. mutation是同步的,当需要异步操 ...
- HBase常用指令
disable 'smsFlow'drop 'smsFlow'create 'smsFlow','info','partition'count 'smsFlow'scan 'smsFlow' trun ...
- 【python】重定向输出
重定向的理解:就是把所要输出的结果输出到其他的地方.常用方法:"print >>",(若有其他方法后续补充) 举个例子: __author__ = 'paulwinfl ...
- 【python】class之类的内建函数
- css-inline-block和float的布局二者择其一?
几个月前,带着不甘和忐忑毅然决然的在亚马逊离职了,当时不知道对我来说是好是坏,现在看来,当初的选择还是蛮不错的.感觉在亚马逊的几个月貌似接触最多的就是wiki和tt了,怀着对技术热忱离开,拒绝了腾讯, ...