介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。

▶ 源代码:静态

 // simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]);
} __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]);
} __global__ void vote3(bool *info, int warp_size)// 跨线程束检查
{
int tx = threadIdx.x;
unsigned int mask = 0xffffffff;
bool *offs = info + (tx * );// 将每个线程指向等距间隔的元素,表明表决函数的运算结果可以进行分发 // 第一组 “下标模 3 得 0” 的元素为 0,第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素
*offs = __any_sync(mask, tx >= warp_size * / );
// 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0,第二组后半段和第三组 “下标模 3 得 1” 的元素为 1
*(offs + ) = (tx >= warp_size * / )? true: false;
// 第一组和第二组 “下标模 3 得 2” 的元素为 0,第三组 “下标模 3 得 2” 的元素为 1
*(offs + ) = all(tx >= warp_size * / ) ? true : false;
// 最终结果应该是:
// 1 2 3 4 15 16 17 18 30 31 32
// 000 000 000 000 ... 000 000 000 000 ... 000 000 000
// 100 100 100 100 ... 100 100 110 110 ... 110 110 110
// 111 111 111 111 ... 111 111 111 111 ... 111 111 111
}
#endif
 // simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleVote_kernel.cuh" #define WARP_SIZE 32
#define GROUP 4 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组,size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff}
{
for (int i = ; i < size / ; i++)
VOTE_PATTERN[i] = 0x00000000; for (int i = size / ; i < size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? i : ; for (int i = size / ; i < * size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? : i; for (int i = * size / ; i < size; i++)
VOTE_PATTERN[i] = 0xffffffff;
}
// 数组检查函数,type == 1:把数组元素全部加起来,结果非零就报错;type == 0:把数组元素全部加起来,结果不等于 WARP_SIZE 就报错
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
int i, sum;
for (sum = , i = start; i < end; i++)
sum += h_result[i];
if (type&&sum > || !type&& sum != WARP_SIZE)
{
printf("\n\t<%s>[%d - %d]:", name, start, end-);
for (i = start; i < end; i++)
printf("%d,", h_result[i]);
printf("\b\n");
}
return type?(sum > ):(sum != WARP_SIZE);
} // 数组检查的中间函数,type == 1:使用(1,0,0,0)的模式调用数组检查函数;type == 0:使用(1,1,1,0)的模式调用数组检查函数
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
int error_count = ; error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type?:,"Vote.Any"); printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
}
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
int i, error_count;
for (i = error_count = ; i < totalThread * ; i++)
{
switch (i % )
{
case :
if (hinfo[i] != (i >= totalThread * )) // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1)
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * / )) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1)
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * )) // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1)
error_count++;
break;
}
}
printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
} int main()
{
printf("\n\tStart.\n");
int totalElement;
unsigned int *h_input, *h_result;
unsigned int *d_input, *d_result;
bool *dinfo = NULL, *hinfo = NULL;
int error_count[] = { , , };
cudaSetDevice(); //使用长度为 4 个线程束的数组,刚好分为 4 个组(全零,后交替非零,前交替非零,全非零)进行表决
totalElement = WARP_SIZE * GROUP;
h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int));
cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));
genVoteTestPattern(h_input, totalElement);
cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice); //测试一,any
printf("\n\tTest 1: ");
voteAny << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);
cudaDeviceSynchronize();
cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试二,all
printf("\n\tTest 2: ");
voteAll << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);
cudaDeviceSynchronize();
cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试三,使用长度为 9 个线程束的数组,但调用内核时只使用数量为 3 个线程束的线程,即分为 3 组,每组 WARP_SIZE * 3 个元素
printf("\n\tTest 3: ");
totalElement = WARP_SIZE * * ;
hinfo = (bool *)calloc(totalElement, sizeof(bool));
cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));
cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);
vote3 << <, totalElement / >> > (dinfo, WARP_SIZE);
cudaDeviceSynchronize();
cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);
error_count[] = checkResultsVoteKernel(hinfo, totalElement / ); // 清理工作
cudaFree(d_input);
cudaFree(d_result);
free(h_input);
free(h_result);
free(hinfo);
cudaFree(dinfo);
printf("\t\nFinish.\n");
getchar();
return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;
}

▶ 输出结果:

    Start.

    Test : Passed

    Test : Passed

    Test : Passed

    Finish.

▶ 源代码:运行时编译(删掉了相同的注释)

 // simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]);
} extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
{
int tx = threadIdx.x;
int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]);
} extern "C" __global__ void vote3(bool *info, int warp_size)
{
int tx = threadIdx.x;
unsigned int mask = 0xffffffff;
bool *offs = info + (tx * );
*offs = __any_sync(mask, tx >= warp_size * / );
*(offs + ) = (tx >= warp_size * / ) ? true : false;
*(offs + ) = all(tx >= warp_size * / ) ? true : false;
}
#endif
 // simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h> #define WARP_SIZE 32
#define GROUP 4 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
for (int i = ; i < size / ; i++)
VOTE_PATTERN[i] = 0x00000000; for (int i = size / ; i < size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? i : ; for (int i = size / ; i < * size / ; i++)
VOTE_PATTERN[i] = (i & 0x01) ? : i; for (int i = * size / ; i < size; i++)
VOTE_PATTERN[i] = 0xffffffff;
} int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
int i, sum;
for (sum = , i = start; i < end; i++)
sum += h_result[i];
if (type&&sum > || !type&& sum != WARP_SIZE)
{
printf("\n\t<%s>[%d - %d]:", name, start, end - );
for (i = start; i < end; i++)
printf("%d,", h_result[i]);
printf("\b\n");
}
return type ? (sum > ) : (sum != WARP_SIZE);
} int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
int error_count = ; error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any");
error_count += checkErrors(h_result, * totalElement / , * totalElement / , type ? : , "Vote.Any"); printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
}
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
int i, error_count;
for (i = error_count = ; i < totalThread * ; i++)
{
switch (i % )
{
case :
if (hinfo[i] != (i >= totalThread * ))
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * / ))
error_count++;
break;
case :
if (hinfo[i] != (i >= totalThread * ))
error_count++;
break;
}
}
printf("%s\n", !error_count ? "Passed" : "Failed");
return error_count;
} int main()
{
printf("\n\tStart.\n");
int totalElement;
unsigned int *h_input, *h_result;
CUdeviceptr d_input, d_result;// unsigned long long
bool *hinfo = NULL;
CUdeviceptr dinfo;
int error_count[] = { , , };
//cudaSetDevice(0); // 编译 PTX
char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize, );// (1, NULL) 为主函数接受的参数个数和参数
CUmodule module = loadPTX(ptx, , NULL); totalElement = WARP_SIZE * GROUP;
h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));
cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));
genVoteTestPattern(h_input, totalElement);
cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int)); //测试一,any
printf("\n\tTest 1: ");
dim3 gridBlock(, );
dim3 threadBlock(totalElement, );
CUfunction kernel_addr;
cuModuleGetFunction(&kernel_addr, module, "voteAny");
void *arr1[] = { (void *)&d_input, (void *)&d_result };
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );
cuCtxSynchronize();
cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试二,all
printf("\n\tTest 2: ");
cuModuleGetFunction(&kernel_addr, module, "voteAll");
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );
cuCtxSynchronize();
cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
error_count[] += checkResultsVoteKernel(h_result, totalElement, ); // 测试三
printf("\n\tTest 3: ");
totalElement = WARP_SIZE * * ;
hinfo = (bool *)calloc(totalElement, sizeof(bool));
cuMemAlloc(&dinfo, totalElement * sizeof(bool));
cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));
threadBlock = dim3(totalElement / , ); // 改变线程块尺寸
cuModuleGetFunction(&kernel_addr, module, "vote3");
int size = WARP_SIZE;
void *arr2[] = { (void *)&dinfo, (void *)&size };
cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr2[], );
cuCtxSynchronize();
cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));
error_count[] = checkResultsVoteKernel(hinfo, totalElement / ); // 清理工作
cuMemFree(d_input);
cuMemFree(d_result);
free(h_input);
free(h_result);
free(hinfo);
cuMemFree(dinfo);
printf("\t\nFinish.\n");
getchar();
return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;
}

▶ 输出结果:

        Start.
> Using CUDA Device []: GeForce GTX
> GPU Device has SM 6.1 compute capability Test : Passed Test : Passed Test : Passed Finish.

▶ 涨姿势

● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章

随机推荐

  1. Pma模块详解,对用户登录linux等进行限制,密码修改限制等

    PAM详细介绍 2014-04-02 09:26:41 标签:PAM 原创作品,允许转载,转载时请务必以超链接形式标明文章 原始出处 .作者信息和本声明.否则将追究法律责任.http://lidefu ...

  2. 关于jdbc的面试题

    什么是JDBC,在什么时候会用到它? JDBC的全称是Java DataBase Connection,也就是Java数据库连接,我们可以用它来操作关系型数据库.JDBC接口及相关类在java.sql ...

  3. grandstack graphql 开发模型

    当前grandstack 支持两类开发方式 js (使用Neo4j-graphql-js) 插件模型 js 模型 参考https://github.com/rongfengliang/grand-st ...

  4. GOOGLE高级搜索的秘籍

    一.摘要 本文内容来源自互联网,全面的介绍Google搜索的各种功能和技巧. 二.GOOGLE简介 Google(http://www.google.com/)是一个搜索引擎,由两个斯坦福大学博士生L ...

  5. uml建模工具介绍

    应用最广的由两种种1. Rational Rose,它是ibm的 .2.Microsoft的 Microsoft Office Visio® 2003 3.Enterprise Architect.还 ...

  6. 罗技 M558 鼠标维修记录

    罗技 M558 鼠标维修记录 故障现象 按键不灵敏 拆机内部图 前进键 后退键 左键 右键 中键 自定义功能键 使用的是 OMRON 按键,好东西,质量可以. 但毕竟是机械的东西,还是有老化,用万用表 ...

  7. Arrays、ArrayUtils 区别

    Arrays java.util 包提供的静态类:java.util.Arrays 此静态类专门用来操作array ,提供搜索.排序.复制等静态方法. ArrayUtils apache 提供的类:o ...

  8. 箭头函数中的 this

    JS 每一个 function 都有自己独立的运行上下文,但箭头函数不属于普通的 function,所以没有独立的上下文. 所以在箭头函数里写的 this 其实是包含该箭头函数最近的一个 functi ...

  9. EditorConfig知识点

    .editorconfig 该文件定义项目的编码规范,编辑器的行为会与.editorconfig 文件中定义的一致,并且其优先级比编辑器自身的设置要高,这在多人合作开发项目时十分有用而且必要. 在哪里 ...

  10. python下爬某个网页的图片

    #coding=utf-8 import re import urllib def getHtml(url): #获取url对应得源码 page = urllib.urlopen(url) html ...