0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics

介绍了线程束表决函数的实例（其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html），并在静态和运行时编译两种条件下进行使用。

▶ 源代码：静态

 // simpleVote_kernel.cuh

 #ifndef SIMPLEVOTE_KERNEL_CU

 #define SIMPLEVOTE_KERNEL_CU

 __global__ void voteAny(unsigned int *input, unsigned int *result)// 任意一个线程抛出非零值则函数返回非零值

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __any_sync(mask, input[tx]);

 }

 __global__ void voteAll(unsigned int *input, unsigned int *result)// 当且仅当所有线程抛出非零值函数才返回非零值

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __all_sync(mask, input[tx]);

 }

 __global__ void vote3(bool *info, int warp_size)// 跨线程束检查

 {

     int tx = threadIdx.x;

     unsigned int mask = 0xffffffff;

     bool *offs = info + (tx * );// 将每个线程指向等距间隔的元素，表明表决函数的运算结果可以进行分发

     // 第一组 “下标模 3 得 0” 的元素为 0，第二组和第三组 “下标模 3 得 0” 的元素为 1。“一组” 为 warp_size * 3 个元素

     *offs = __any_sync(mask, tx >= warp_size *  / );

     // 第一组和第二组前半段 “下标模 3 得 1” 的元素为 0，第二组后半段和第三组 “下标模 3 得 1” 的元素为 1

     *(offs + ) = (tx >= warp_size *  / )? true: false;

     // 第一组和第二组 “下标模 3 得 2” 的元素为 0，第三组 “下标模 3 得 2” 的元素为 1

     *(offs + ) = all(tx >= warp_size *  / ) ? true : false;

     // 最终结果应该是：

     //   1   2   3   4      15  16  17  18      30  31  32

     // 000 000 000 000 ... 000 000 000 000 ... 000 000 000

     // 100 100 100 100 ... 100 100 110 110 ... 110 110 110

     // 111 111 111 111 ... 111 111 111 111 ... 111 111 111

 }

 #endif

 // simpleVoteIntrinsics.cu

 #include <stdio.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_functions.h>

 #include <helper_cuda.h>

 #include "simpleVote_kernel.cuh"

 #define WARP_SIZE   32

 #define GROUP       4

 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)// 构建原数组，size == 8 时结果为{0,0,0,3,4,0,ffffffff,ffffffff}

 {

     for (int i = ; i < size / ; i++)

         VOTE_PATTERN[i] = 0x00000000; 

     for (int i = size / ; i < size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ? i : ; 

     for (int i = size / ; i <  * size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ?  : i; 

     for (int i =  * size / ; i < size; i++)

         VOTE_PATTERN[i] = 0xffffffff;

 }

 // 数组检查函数，type == 1：把数组元素全部加起来，结果非零就报错；type == 0：把数组元素全部加起来，结果不等于 WARP_SIZE 就报错

 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)

 {

     int i, sum;

     for (sum = , i = start; i < end; i++)

         sum += h_result[i];

     if (type&&sum >  || !type&& sum != WARP_SIZE)

     {

         printf("\n\t<%s>[%d - %d]:", name, start, end-);

         for (i = start; i < end; i++)

             printf("%d,", h_result[i]);

         printf("\b\n");

     }

     return type?(sum > ):(sum != WARP_SIZE);

 }

 // 数组检查的中间函数，type == 1：使用(1,0,0,0)的模式调用数组检查函数；type == 0：使用(1,1,1,0)的模式调用数组检查函数

 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)

 {

     int error_count = ;

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type?:,"Vote.Any");

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int checkResultsVoteKernel(bool *hinfo, int totalThread)

 {

     int i, error_count;

     for (i = error_count = ; i < totalThread * ; i++)

     {

         switch (i % )

         {

             case :

                 if (hinfo[i] != (i >= totalThread * ))     // 等价于 if (i < totalThread && hinfo[i] == 0 || i >= totalThread && hinfo == 1)

                     error_count++;

                 break;

             case :

                 if (hinfo[i] != (i >= totalThread *  / )) // 等价于 if (i < totalThread * 3 / 2 && hinfo[i] == 0 || i >= totalThread * 3 / 2 && hinfo == 1)

                     error_count++;

                 break;

             case :

                 if (hinfo[i] != (i >= totalThread * ))     // 等价于 if (i < totalThread * 2 && hinfo[i] == 0 || i >= totalThread * 2 && hinfo == 1)

                     error_count++;

                 break;

         }

     }

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int main()

 {

     printf("\n\tStart.\n");

     int totalElement;

     unsigned int *h_input, *h_result;

     unsigned int *d_input, *d_result;

     bool *dinfo = NULL, *hinfo = NULL;

     int error_count[] = { , ,  };

     cudaSetDevice();

     //使用长度为 4 个线程束的数组，刚好分为 4 个组（全零，后交替非零，前交替非零，全非零）进行表决

     totalElement = WARP_SIZE * GROUP;

     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int));

     cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));

     genVoteTestPattern(h_input, totalElement);

     cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice);

     //测试一，any

     printf("\n\tTest 1: ");

     voteAny << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);

     cudaDeviceSynchronize();

     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试二，all

     printf("\n\tTest 2: ");

     voteAll << <dim3(, ), dim3(totalElement, ) >> > (d_input, d_result);

     cudaDeviceSynchronize();

     cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试三，使用长度为 9 个线程束的数组，但调用内核时只使用数量为 3 个线程束的线程，即分为 3 组，每组 WARP_SIZE * 3 个元素

     printf("\n\tTest 3: ");

     totalElement = WARP_SIZE *  * ;

     hinfo = (bool *)calloc(totalElement, sizeof(bool));

     cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));

     cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);

     vote3 << <, totalElement /  >> > (dinfo, WARP_SIZE);

     cudaDeviceSynchronize();

     cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);

     error_count[] = checkResultsVoteKernel(hinfo, totalElement / );

     // 清理工作

     cudaFree(d_input);

     cudaFree(d_result);

     free(h_input);

     free(h_result);

     free(hinfo);

     cudaFree(dinfo);

     printf("\t\nFinish.\n");

     getchar();

     return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;

 }

▶ 输出结果：

    Start.

    Test : Passed

    Test : Passed

    Test : Passed

    Finish.

▶ 源代码：运行时编译（删掉了相同的注释）

 // simpleVote_kernel.cuh

 #ifndef SIMPLEVOTE_KERNEL_CU

 #define SIMPLEVOTE_KERNEL_CU

 extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __any_sync(mask, input[tx]);

 }

 extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)

 {

     int tx = threadIdx.x;

     int mask = 0xffffffff;

     result[tx] = __all_sync(mask, input[tx]);

 }

 extern "C" __global__ void vote3(bool *info, int warp_size)

 {

     int tx = threadIdx.x;

     unsigned int mask = 0xffffffff;

     bool *offs = info + (tx * );

     *offs = __any_sync(mask, tx >= warp_size *  / );

     *(offs + ) = (tx >= warp_size *  / ) ? true : false;

     *(offs + ) = all(tx >= warp_size *  / ) ? true : false;

 }

 #endif

 // simpleVoteIntrinsics.cu

 #include <stdio.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include "nvrtc_helper.h"

 #include <helper_functions.h>

 #define WARP_SIZE   32

 #define GROUP       4

 void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)

 {

     for (int i = ; i < size / ; i++)

         VOTE_PATTERN[i] = 0x00000000;

     for (int i = size / ; i < size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ? i : ;

     for (int i = size / ; i <  * size / ; i++)

         VOTE_PATTERN[i] = (i & 0x01) ?  : i;

     for (int i =  * size / ; i < size; i++)

         VOTE_PATTERN[i] = 0xffffffff;

 }

 int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)

 {

     int i, sum;

     for (sum = , i = start; i < end; i++)

         sum += h_result[i];

     if (type&&sum >  || !type&& sum != WARP_SIZE)

     {

         printf("\n\t<%s>[%d - %d]:", name, start, end - );

         for (i = start; i < end; i++)

             printf("%d,", h_result[i]);

         printf("\b\n");

     }

     return type ? (sum > ) : (sum != WARP_SIZE);

 }

 int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)

 {

     int error_count = ;

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     error_count += checkErrors(h_result,  * totalElement / ,  * totalElement / , type ?  : , "Vote.Any");

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int checkResultsVoteKernel(bool *hinfo, int totalThread)

 {

     int i, error_count;

     for (i = error_count = ; i < totalThread * ; i++)

     {

         switch (i % )

         {

         case :

             if (hinfo[i] != (i >= totalThread * ))

                 error_count++;

             break;

         case :

             if (hinfo[i] != (i >= totalThread *  / ))

                 error_count++;

             break;

         case :

             if (hinfo[i] != (i >= totalThread * ))

                 error_count++;

             break;

         }

     }

     printf("%s\n", !error_count ? "Passed" : "Failed");

     return error_count;

 }

 int main()

 {

     printf("\n\tStart.\n");

     int totalElement;

     unsigned int *h_input, *h_result;

     CUdeviceptr d_input, d_result;// unsigned long long

     bool *hinfo = NULL;

     CUdeviceptr dinfo;

     int error_count[] = { , ,  };

     //cudaSetDevice(0); 

     // 编译 PTX

     char *ptx, *kernel_file;

     size_t ptxSize;

     kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";

     compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize, );// (1, NULL) 为主函数接受的参数个数和参数

     CUmodule module = loadPTX(ptx, , NULL);

     totalElement = WARP_SIZE * GROUP;

     h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));

     cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));

     cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));

     genVoteTestPattern(h_input, totalElement);

     cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int));

     //测试一，any

     printf("\n\tTest 1: ");

     dim3 gridBlock(, );

     dim3 threadBlock(totalElement, );

     CUfunction kernel_addr;

     cuModuleGetFunction(&kernel_addr, module, "voteAny");

     void *arr1[] = { (void *)&d_input, (void *)&d_result };

     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );

     cuCtxSynchronize();

     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试二，all

     printf("\n\tTest 2: ");

     cuModuleGetFunction(&kernel_addr, module, "voteAll");

     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr1[], );

     cuCtxSynchronize();

     cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));

     error_count[] += checkResultsVoteKernel(h_result, totalElement, );

     // 测试三

     printf("\n\tTest 3: ");

     totalElement = WARP_SIZE *  * ;

     hinfo = (bool *)calloc(totalElement, sizeof(bool));

     cuMemAlloc(&dinfo, totalElement * sizeof(bool));

     cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));

     threadBlock = dim3(totalElement / , );                    // 改变线程块尺寸

     cuModuleGetFunction(&kernel_addr, module, "vote3");

     int size = WARP_SIZE;

     void *arr2[] = { (void *)&dinfo, (void *)&size };

     cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, , , &arr2[], );

     cuCtxSynchronize();

     cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));

     error_count[] = checkResultsVoteKernel(hinfo, totalElement / );

     // 清理工作

     cuMemFree(d_input);

     cuMemFree(d_result);

     free(h_input);

     free(h_result);

     free(hinfo);

     cuMemFree(dinfo);

     printf("\t\nFinish.\n");

     getchar();

     return (error_count[] || error_count[] || error_count[]) ? EXIT_FAILURE : EXIT_SUCCESS;

 }

▶ 输出结果：

        Start.

> Using CUDA Device []: GeForce GTX

> GPU Device has SM 6.1 compute capability

        Test : Passed

        Test : Passed

        Test : Passed

Finish.

▶ 涨姿势

● 线程表决函数见另一篇博客，注意 CUDA9.0 改进了部分函数，废弃了旧的部分函数。

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章

随机推荐

IIS7中Ajax.AjaxMethod无效的原因及解决方法
使用Ajax.AjaxMethod方法在asp.net的服务器下一切正常,用iis的时候,js中总是cs类找不到,具体的解决方法如下,遇到类似情况的朋友可以参考下最近做用Ajax.AjaxMetho ...
20155207 2006-2007-2 《Java程序设计》第5周学习总结
20155207 2006-2007-2 <Java程序设计>第5周学习总结教材学习内容总结第八章语法与继承架构 Java中的错误以对象方式呈现为 java.lang.Throwab ...
The Suspects 并查集
Severe acute respiratory syndrome (SARS), an atypical pneumonia of unknown aetiology, was recognized ...
【分形】【洛谷P1498】
https://www.luogu.org/problemnew/show/P1498 题目描述自从到了南蛮之地,孔明不仅把孟获收拾的服服帖帖,而且还发现了不少少数民族的智慧,他发现少数民族的图腾往 ...
stardog graphql 简单操作
预备环境: 下载stardog 软件包 graphql 查询地址创建一个简单数据库 ./stardog-admin db create -nstarwars graphql 查询方式 http 地址 ...
编辑文章 - 博客频道 - CSDN.NET
站点连接 :http://www.gaoshou.me/uid/19125624 不用不知道,一用吓一跳. 每一个月的手机话费不用愁了. 仅限苹果手机 1.同步请求能够从因特网请求数据.一旦发送 ...
ResourceBundle介绍
介绍: ResourceBundle类主要作用是读取属性文件,读取属性文件时可以直接指定属性文件的名称(指定名称时不需要文件的后缀),也可以根据Locale所指定的区域码来选取指定的资源文件: Res ...
RAW+ASM 的RAC 安装文档
实验平台:Oracle 10gR2 RAC + RHEL 4.0 +VMWare GSX 3.2.0 安装步骤: 1.安装前准备及OS安装配置 2.安装Oracle 10gR2 clusterware ...
bzoj1072排列
题目:https://www.lydsy.com/JudgeOnline/problem.php?id=1072 好像是这方面的裸题. 整除k 要想转移需要记录下达到模k所有余数的方案数. 为了生 ...
Oracle 11G 单机asm安装
http://sugarlovecxq.blog.51cto.com/6707742/1702092/

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc

0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章

随机推荐

热门专题