0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc
介绍了线程束表决函数的实例(其概念介绍见 http://www.cnblogs.com/cuancuancuanhao/p/7841512.html),并在静态和运行时编译两种条件下进行使用。
▶ 源代码:静态
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU

// Warp-vote demo kernels. Each kernel expects a single 1-D block made of whole
// warps and votes with a full-warp participation mask.
// NOTE: CUDA 9.0 replaced the legacy __any/__all with __any_sync/__all_sync,
// which take an explicit lane mask.

// Every lane gets non-zero iff ANY lane in its warp read a non-zero input.
__global__ void voteAny(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __any_sync(mask, input[tx]);
}

// Every lane gets non-zero iff ALL lanes in its warp read a non-zero input.
__global__ void voteAll(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __all_sync(mask, input[tx]);
}

// Cross-warp check: each thread writes 3 bools at info[tx*3 .. tx*3+2],
// showing that a vote result can be fanned out to arbitrary locations.
// Expects blockDim.x == 3 * warp_size (three warps: tx in [0,32), [32,64), [64,96)).
// The shared predicate is tx >= warp_size*3/2 (true for the upper half of threads):
//   slot 0: any()  -> false for warp 0, true for warps 1 and 2
//   slot 1: raw    -> true only for tx >= warp_size*3/2
//   slot 2: all()  -> true only for warp 2, where every lane satisfies it
__global__ void vote3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);  // each thread owns 3 consecutive slots
    *offs = __any_sync(mask, tx >= warp_size * 3 / 2);
    *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;
    *(offs + 2) = __all_sync(mask, tx >= warp_size * 3 / 2) ? true : false;
}
#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleVote_kernel.cuh" #define WARP_SIZE 32
#define GROUP 4

// Builds the vote test input in four equal quarters
// (size == 8 yields {0, 0, 0, 3, 4, 0, 0xffffffff, 0xffffffff}):
//   quarter 1: all zero              -> any() == 0, all() == 0
//   quarter 2: odd indices non-zero  -> any() == 1, all() == 0
//   quarter 3: even indices non-zero -> any() == 1, all() == 0
//   quarter 4: all non-zero          -> any() == 1, all() == 1
// Assumes size is a multiple of 4 (here WARP_SIZE * GROUP).
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    for (int i = 0; i < size / 4; i++)
        VOTE_PATTERN[i] = 0x00000000;
    for (int i = size / 4; i < size / 2; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
    for (int i = size / 2; i < 3 * size / 4; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
    for (int i = 3 * size / 4; i < size; i++)
        VOTE_PATTERN[i] = 0xffffffff;
}
// Checks one warp-sized quarter of the vote result.
// type == 1: every element must be 0 (the sum over [start, end) must be 0).
// type == 0: every element must be 1 (the sum must equal WARP_SIZE).
// On mismatch, dumps the offending range; returns 1 on failure, 0 on success.
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
    int i, sum;
    for (sum = 0, i = start; i < end; i++)
        sum += h_result[i];
    if ((type && sum > 0) || (!type && sum != WARP_SIZE))
    {
        printf("\n\t<%s>[%d - %d]:", name, start, end - 1);
        for (i = start; i < end; i++)
            printf("%d,", h_result[i]);
        printf("\b\n");
    }
    return type ? (sum > 0) : (sum != WARP_SIZE);
}
// Dispatcher below: type == 1 checks the quarters with checkErrors types
// (1,0,0,0) (the voteAny pattern); type == 0 uses (1,1,1,0) (the voteAll pattern).
// Dispatches checkErrors over the four quarters of the result array.
// type == 1 expects the voteAny pattern (zero, one, one, one),
// type == 0 expects the voteAll pattern (zero, zero, zero, one).
// Prints Passed/Failed and returns the number of failing quarters.
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
    int error_count = 0;
    // Quarter 1 is all-zero input: both any() and all() must yield 0.
    error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, 1, "Vote.Any");
    // Quarters 2 and 3 are partially non-zero: any() yields 1, all() yields 0.
    error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    // Quarter 4 is all non-zero: both any() and all() must yield 1.
    error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, 0, "Vote.Any");
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
// Checks the vote3 output: hinfo holds 3 bools per thread, for totalThread
// threads (totalThread == 3 * WARP_SIZE, so hinfo element i belongs to thread
// i / 3, slot i % 3). For each slot the expected value follows the kernel:
//   slot 0 (any): set iff the thread is in warp 1 or 2  -> i >= totalThread
//   slot 1 (raw): set iff the thread is in the top half -> i >= totalThread * 3 / 2
//   slot 2 (all): set iff the thread is in the last warp-> i >= totalThread * 2
// Prints Passed/Failed and returns the number of mismatching elements.
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
    int i, error_count;
    for (i = error_count = 0; i < totalThread * 3; i++)
    {
        switch (i % 3)
        {
        case 0:  // any(): i = 3*tx, so i >= totalThread <=> tx >= WARP_SIZE
            if (hinfo[i] != (i >= totalThread * 1))
                error_count++;
            break;
        case 1:  // raw predicate: i = 3*tx+1, threshold lands at tx >= WARP_SIZE*3/2
            if (hinfo[i] != (i >= totalThread * 3 / 2))
                error_count++;
            break;
        case 2:  // all(): i = 3*tx+2, threshold lands at tx >= WARP_SIZE*2
            if (hinfo[i] != (i >= totalThread * 2))
                error_count++;
            break;
        }
    }
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
int main()
{
    printf("\n\tStart.\n");
    int totalElement;
    unsigned int *h_input, *h_result;
    unsigned int *d_input, *d_result;
    bool *dinfo = NULL, *hinfo = NULL;
    int error_count[3] = { 0, 0, 0 };
    cudaSetDevice(0);

    // Tests 1 & 2: an array of 4 warps, split into the 4 vote groups built by
    // genVoteTestPattern (all zero / odd non-zero / even non-zero / all non-zero).
    totalElement = WARP_SIZE * GROUP;
    h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    cudaMalloc((void **)&d_input, totalElement * sizeof(unsigned int));
    cudaMalloc((void **)&d_result, totalElement * sizeof(unsigned int));
    genVoteTestPattern(h_input, totalElement);
    cudaMemcpy(d_input, h_input, totalElement * sizeof(unsigned int), cudaMemcpyHostToDevice);

    // Test 1: __any_sync — expect the (0,1,1,1) quarter pattern.
    printf("\n\tTest 1: ");
    voteAny << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result);
    cudaDeviceSynchronize();
    cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);

    // Test 2: __all_sync — expect the (0,0,0,1) quarter pattern.
    printf("\n\tTest 2: ");
    voteAll << <dim3(1, 1), dim3(totalElement, 1) >> > (d_input, d_result);
    cudaDeviceSynchronize();
    cudaMemcpy(h_result, d_result, totalElement * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);

    // Test 3: a bool array worth 9 warps of elements, written by 3 warps of
    // threads — each thread fans its vote results out to 3 elements.
    printf("\n\tTest 3: ");
    totalElement = WARP_SIZE * 3 * 3;
    hinfo = (bool *)calloc(totalElement, sizeof(bool));
    cudaMalloc((void **)&dinfo, totalElement * sizeof(bool));
    cudaMemcpy(dinfo, hinfo, totalElement * sizeof(bool), cudaMemcpyHostToDevice);
    vote3 << <1, totalElement / 3 >> > (dinfo, WARP_SIZE);
    cudaDeviceSynchronize();
    cudaMemcpy(hinfo, dinfo, totalElement * sizeof(bool), cudaMemcpyDeviceToHost);
    error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);

    // Cleanup.
    cudaFree(d_input);
    cudaFree(d_result);
    free(h_input);
    free(h_result);
    free(hinfo);
    cudaFree(dinfo);
    printf("\t\nFinish.\n");
    getchar();
    return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start.
Test 1: Passed
Test 2: Passed
Test 3: Passed
Finish.
▶ 源代码:运行时编译(删掉了相同的注释)
// simpleVote_kernel.cuh
#ifndef SIMPLEVOTE_KERNEL_CU
#define SIMPLEVOTE_KERNEL_CU

// Runtime-compiled (NVRTC) variants of the warp-vote kernels. extern "C"
// prevents name mangling so cuModuleGetFunction can locate them by name.

// Every lane gets non-zero iff ANY lane in its warp read a non-zero input.
extern "C" __global__ void voteAny(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __any_sync(mask, input[tx]);
}

// Every lane gets non-zero iff ALL lanes in its warp read a non-zero input.
extern "C" __global__ void voteAll(unsigned int *input, unsigned int *result)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;  // all 32 lanes participate
    result[tx] = __all_sync(mask, input[tx]);
}

// Cross-warp check: each thread writes 3 bools at info[tx*3 .. tx*3+2].
// Expects blockDim.x == 3 * warp_size; predicate is tx >= warp_size*3/2.
extern "C" __global__ void vote3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);  // each thread owns 3 consecutive slots
    *offs = __any_sync(mask, tx >= warp_size * 3 / 2);
    *(offs + 1) = (tx >= warp_size * 3 / 2) ? true : false;
    *(offs + 2) = __all_sync(mask, tx >= warp_size * 3 / 2) ? true : false;
}
#endif
// simpleVoteIntrinsics.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h> #define WARP_SIZE 32
#define GROUP 4

// Builds the vote test input in four equal quarters
// (size == 8 yields {0, 0, 0, 3, 4, 0, 0xffffffff, 0xffffffff}):
//   quarter 1: all zero; quarter 2: odd indices non-zero;
//   quarter 3: even indices non-zero; quarter 4: all non-zero.
// Assumes size is a multiple of 4 (here WARP_SIZE * GROUP).
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    for (int i = 0; i < size / 4; i++)
        VOTE_PATTERN[i] = 0x00000000;
    for (int i = size / 4; i < size / 2; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
    for (int i = size / 2; i < 3 * size / 4; i++)
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
    for (int i = 3 * size / 4; i < size; i++)
        VOTE_PATTERN[i] = 0xffffffff;
}

// Checks one warp-sized quarter of the vote result; type == 1 expects all
// zeros, type == 0 expects all ones (sum == WARP_SIZE). Returns 1 on failure.
int checkErrors(unsigned int *h_result, int start, int end, bool type, const char * name)
{
    // Sum the range; type == 1 demands sum == 0 (every element 0),
    // type == 0 demands sum == WARP_SIZE (every element 1).
    int i, sum;
    for (sum = 0, i = start; i < end; i++)
        sum += h_result[i];
    if ((type && sum > 0) || (!type && sum != WARP_SIZE))
    {
        printf("\n\t<%s>[%d - %d]:", name, start, end - 1);
        for (i = start; i < end; i++)
            printf("%d,", h_result[i]);
        printf("\b\n");
    }
    return type ? (sum > 0) : (sum != WARP_SIZE);
}

// Dispatches checkErrors over the four quarters: type == 1 expects the
// voteAny pattern (1,0,0,0), type == 0 the voteAll pattern (1,1,1,0).
int checkResultsVoteKernel(unsigned int *h_result, int totalElement, bool type)
{
    int error_count = 0;
    // Quarter 1 is all-zero input: both any() and all() must yield 0.
    error_count += checkErrors(h_result, 0 * totalElement / 4, 1 * totalElement / 4, 1, "Vote.Any");
    // Quarters 2 and 3 are partially non-zero: any() yields 1, all() yields 0.
    error_count += checkErrors(h_result, 1 * totalElement / 4, 2 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    error_count += checkErrors(h_result, 2 * totalElement / 4, 3 * totalElement / 4, type ? 0 : 1, "Vote.Any");
    // Quarter 4 is all non-zero: both any() and all() must yield 1.
    error_count += checkErrors(h_result, 3 * totalElement / 4, 4 * totalElement / 4, 0, "Vote.Any");
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
// Checks the vote3 output: hinfo holds 3 bools per thread for totalThread
// threads (element i belongs to thread i / 3, slot i % 3):
//   slot 0 (any): set iff i >= totalThread        (warps 1 and 2)
//   slot 1 (raw): set iff i >= totalThread * 3 / 2 (upper half of threads)
//   slot 2 (all): set iff i >= totalThread * 2    (last warp only)
// Prints Passed/Failed and returns the number of mismatching elements.
int checkResultsVoteKernel(bool *hinfo, int totalThread)
{
    int i, error_count;
    for (i = error_count = 0; i < totalThread * 3; i++)
    {
        switch (i % 3)
        {
        case 0:  // any(): i = 3*tx, so i >= totalThread <=> tx >= WARP_SIZE
            if (hinfo[i] != (i >= totalThread * 1))
                error_count++;
            break;
        case 1:  // raw predicate: threshold lands at tx >= WARP_SIZE*3/2
            if (hinfo[i] != (i >= totalThread * 3 / 2))
                error_count++;
            break;
        case 2:  // all(): threshold lands at tx >= WARP_SIZE*2
            if (hinfo[i] != (i >= totalThread * 2))
                error_count++;
            break;
        }
    }
    printf("%s\n", !error_count ? "Passed" : "Failed");
    return error_count;
}
int main()
{
    printf("\n\tStart.\n");
    int totalElement;
    unsigned int *h_input, *h_result;
    CUdeviceptr d_input, d_result;  // driver-API device handles (unsigned long long)
    bool *hinfo = NULL;
    CUdeviceptr dinfo;
    int error_count[3] = { 0, 0, 0 };

    // Compile the kernel source to PTX with NVRTC and load it as a module
    // (loadPTX also creates the context, so no cudaSetDevice call is needed).
    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleVoteIntrinsics_nvrtc\\simpleVote_kernel.cuh";
    compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);  // (1, NULL): argc/argv forwarded to the helper
    CUmodule module = loadPTX(ptx, 1, NULL);

    // Tests 1 & 2: an array of 4 warps, split into the 4 vote groups.
    totalElement = WARP_SIZE * GROUP;
    h_input = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(totalElement * sizeof(unsigned int));
    cuMemAlloc(&d_input, totalElement * sizeof(unsigned int));
    cuMemAlloc(&d_result, totalElement * sizeof(unsigned int));
    genVoteTestPattern(h_input, totalElement);
    cuMemcpyHtoD(d_input, h_input, totalElement * sizeof(unsigned int));

    // Test 1: __any_sync — expect the (0,1,1,1) quarter pattern.
    printf("\n\tTest 1: ");
    dim3 gridBlock(1, 1);
    dim3 threadBlock(totalElement, 1);
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, module, "voteAny");
    void *arr1[] = { (void *)&d_input, (void *)&d_result };
    cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
    error_count[0] += checkResultsVoteKernel(h_result, totalElement, 1);

    // Test 2: __all_sync — expect the (0,0,0,1) quarter pattern.
    printf("\n\tTest 2: ");
    cuModuleGetFunction(&kernel_addr, module, "voteAll");
    cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr1[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_result, d_result, totalElement * sizeof(unsigned int));
    error_count[1] += checkResultsVoteKernel(h_result, totalElement, 0);

    // Test 3: 3 warps of threads, each fanning its votes out to 3 bools.
    printf("\n\tTest 3: ");
    totalElement = WARP_SIZE * 3 * 3;
    hinfo = (bool *)calloc(totalElement, sizeof(bool));
    cuMemAlloc(&dinfo, totalElement * sizeof(bool));
    cuMemcpyHtoD(dinfo, hinfo, totalElement * sizeof(bool));
    threadBlock = dim3(totalElement / 3, 1);  // shrink the block to 3 warps
    cuModuleGetFunction(&kernel_addr, module, "vote3");
    int size = WARP_SIZE;
    void *arr2[] = { (void *)&dinfo, (void *)&size };
    cuLaunchKernel(kernel_addr, gridBlock.x, gridBlock.y, gridBlock.z, threadBlock.x, threadBlock.y, threadBlock.z, 0, 0, &arr2[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(hinfo, dinfo, totalElement * sizeof(bool));
    error_count[2] = checkResultsVoteKernel(hinfo, totalElement / 3);

    // Cleanup.
    cuMemFree(d_input);
    cuMemFree(d_result);
    free(h_input);
    free(h_result);
    free(hinfo);
    cuMemFree(dinfo);
    printf("\t\nFinish.\n");
    getchar();
    return (error_count[0] || error_count[1] || error_count[2]) ? EXIT_FAILURE : EXIT_SUCCESS;
}
▶ 输出结果:
Start.
> Using CUDA Device [0]: GeForce GTX
> GPU Device has SM 6.1 compute capability
Test 1: Passed
Test 2: Passed
Test 3: Passed
Finish.
▶ 涨姿势
● 线程表决函数见另一篇博客,注意 CUDA9.0 改进了部分函数,废弃了旧的部分函数。
0_Simple__simpleVoteIntrinsics + 0_Simple__simpleVoteIntrinsics_nvrtc的更多相关文章
随机推荐
- 代理模式及Spring AOP (一)
一.代理模式 在不更改源码的前提下,加入新功能,通常需要用到代理设计模式. 代理设计模式分类: 静态代理 动态代理 jdk动态代理 cglib动态代理 其中spring AOP的底层用的是动态代理.其 ...
- Vue中的“混合”——mixins使用方法
混合是一种灵活的分布式复用 Vue 组件的方式.混合对象可以包含任意组件选项.以组件使用混合对象时,所有混合对象的选项将被混入该组件本身的选项.当组件和混合对象含有同名选项时,这些选项将以恰当的方式混 ...
- Android USB gadget框架学习笔记
一 Gadget框架结构 kernel/drivers/usb/gadget,这个目录是android下usbgadget的主要目录. Gadget功能组织单元:主要文件android.c,usb g ...
- oracle 以及 sql server mysql 空值默认值修改
在SQL Server Oracle MySQL当数据库中查出某值为NULL怎么办? 1.MSSQL: ISNULL() 语法 ISNULL ( check_expression , replacem ...
- 02.将uboot,kernel,rootfs下载到开发板上
转载,侵删 将uboot,kernel,rootfs下载到开发板上 1.为什么要下载 所谓下载,也称烧录,部署. 1.1.什么是u-boot Hi3518EV200 单板的 Bootloader 采用 ...
- vuex 知识点
Action 类似于 mutation,不同在于: 1.Action 提交的是 mutation,而不是直接变更状态. 2.Action 可以包含任意异步操作. mutation是同步的,当需要异步操 ...
- HBase常用指令
disable 'smsFlow'drop 'smsFlow'create 'smsFlow','info','partition'count 'smsFlow'scan 'smsFlow' trun ...
- 【python】重定向输出
重定向的理解:就是把所要输出的结果输出到其他的地方.常用方法:"print >>",(若有其他方法后续补充) 举个例子: __author__ = 'paulwinfl ...
- 【python】class之类的内建函数
- css-inline-block和float的布局二者择其一?
几个月前,带着不甘和忐忑毅然决然的在亚马逊离职了,当时不知道对我来说是好是坏,现在看来,当初的选择还是蛮不错的.感觉在亚马逊的几个月貌似接触最多的就是wiki和tt了,怀着对技术热忱离开,拒绝了腾讯, ...