▶ 协作组,CUDA9.0 的新特性

▶ 源代码,如何获得协作组的编号?

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cooperative_groups.h> #define THREAD_PER_BLOCK 64 using namespace cooperative_groups; // 注意使用命名空间 __device__ int sumReduction(thread_group g, int *x, int val) // 规约设备函数,要求共享内存 int *x 要够放得下 g.size() 个参加规约的元素
{
int lane = g.thread_rank(); // 线程在协作组中的编号,教程中名字就叫 line ID for (int i = g.size() / ; i > ; i /= )
{
x[lane] = val; // 第一次迭代该步相当于初始化,以后迭代该步相当于存储上一次迭代的结果
g.sync(); // 协作组同步
if (lane < i)
val += x[lane + i]; // 利用每个线程局部变量 val 记录当前结果
g.sync();
}
if (g.thread_rank() == ) // 零号线程返回计算结果
return val;
else
return -;
} __global__ void cgkernel()
{
    // Dynamic shared memory, sized by the launch (THREAD_PER_BLOCK * sizeof(int)).
    extern __shared__ int workspace[];

    thread_block group = this_thread_block();   // all threads of this block as one cooperative group
    int groupSize = group.size();               // number of threads in the group
    int input = group.thread_rank();            // use the thread's rank as its reduction input
    int output = sumReduction(group, workspace, input);     // whole-block reduction in shared memory
    int expectedOutput = (groupSize - 1) * groupSize / 2;   // 0 + 1 + ... + 63 = 2016

    if (group.thread_rank() == 0)               // rank 0 reports, then announce the 4 tile reductions
    {
        printf("\n\tSum of thread 0 ~ %d in group is %d (expected %d)\n", group.size() - 1, output, expectedOutput);
        printf("\n\tNow creating %d groups, each of size 16 threads:\n", group.size() / 16);
    }
    group.sync();                               // group-wide barrier before re-partitioning the workspace

    // Split the block into tiles of 16 threads (tile size must be a power of two, <= 32).
    thread_block_tile<16> group16 = tiled_partition<16>(group);
    // Offset of this tile's slice of the shared workspace: block rank minus tile rank.
    int offset = group.thread_rank() - group16.thread_rank();
    printf("%d -> thread_rank = %d, group16.thread_rank = %d, offset = %d\n", threadIdx.x, group.thread_rank(), group16.thread_rank(), offset);
    // NOTE: group.group_index() prints (0, 0, 0) here and group.thread_index() matches group.thread_rank().

    input = group16.thread_rank();              // rank inside the tile as the new input
    output = sumReduction(group16, workspace + offset, input);  // per-tile reduction on the offset workspace
    expectedOutput = 15 * 16 / 2;               // 0 + 1 + ... + 15 = 120

    if (group16.thread_rank() == 0)             // each tile's rank-0 thread reports
        printf("\n\tSum of all ranks 0..15 in group16 is %d (expected %d)\n", output, expectedOutput);
    return;
}

int main()
{
printf("\n\tStart with %d threads.\n", THREAD_PER_BLOCK); cgkernel << <, THREAD_PER_BLOCK, THREAD_PER_BLOCK * sizeof(int) >> > ();
cudaDeviceSynchronize(); printf("\n\tFinish.\n");
getchar();
return ;
}

● 输出结果

        Start with 64 threads.

        Sum of thread 0 ~ 63 in group is 2016 (expected 2016)

        Now creating 4 groups, each of size 16 threads:
0 -> thread_rank = 0, group16.thread_rank = 0, offset = 0
1 -> thread_rank = 1, group16.thread_rank = 1, offset = 0
2 -> thread_rank = 2, group16.thread_rank = 2, offset = 0
3 -> thread_rank = 3, group16.thread_rank = 3, offset = 0
4 -> thread_rank = 4, group16.thread_rank = 4, offset = 0
5 -> thread_rank = 5, group16.thread_rank = 5, offset = 0
6 -> thread_rank = 6, group16.thread_rank = 6, offset = 0
7 -> thread_rank = 7, group16.thread_rank = 7, offset = 0
8 -> thread_rank = 8, group16.thread_rank = 8, offset = 0
9 -> thread_rank = 9, group16.thread_rank = 9, offset = 0
10 -> thread_rank = 10, group16.thread_rank = 10, offset = 0
11 -> thread_rank = 11, group16.thread_rank = 11, offset = 0
12 -> thread_rank = 12, group16.thread_rank = 12, offset = 0
13 -> thread_rank = 13, group16.thread_rank = 13, offset = 0
14 -> thread_rank = 14, group16.thread_rank = 14, offset = 0
15 -> thread_rank = 15, group16.thread_rank = 15, offset = 0
16 -> thread_rank = 16, group16.thread_rank = 0, offset = 16
17 -> thread_rank = 17, group16.thread_rank = 1, offset = 16
18 -> thread_rank = 18, group16.thread_rank = 2, offset = 16
19 -> thread_rank = 19, group16.thread_rank = 3, offset = 16
20 -> thread_rank = 20, group16.thread_rank = 4, offset = 16
21 -> thread_rank = 21, group16.thread_rank = 5, offset = 16
22 -> thread_rank = 22, group16.thread_rank = 6, offset = 16
23 -> thread_rank = 23, group16.thread_rank = 7, offset = 16
24 -> thread_rank = 24, group16.thread_rank = 8, offset = 16
25 -> thread_rank = 25, group16.thread_rank = 9, offset = 16
26 -> thread_rank = 26, group16.thread_rank = 10, offset = 16
27 -> thread_rank = 27, group16.thread_rank = 11, offset = 16
28 -> thread_rank = 28, group16.thread_rank = 12, offset = 16
29 -> thread_rank = 29, group16.thread_rank = 13, offset = 16
30 -> thread_rank = 30, group16.thread_rank = 14, offset = 16
31 -> thread_rank = 31, group16.thread_rank = 15, offset = 16
32 -> thread_rank = 32, group16.thread_rank = 0, offset = 32
33 -> thread_rank = 33, group16.thread_rank = 1, offset = 32
34 -> thread_rank = 34, group16.thread_rank = 2, offset = 32
35 -> thread_rank = 35, group16.thread_rank = 3, offset = 32
36 -> thread_rank = 36, group16.thread_rank = 4, offset = 32
37 -> thread_rank = 37, group16.thread_rank = 5, offset = 32
38 -> thread_rank = 38, group16.thread_rank = 6, offset = 32
39 -> thread_rank = 39, group16.thread_rank = 7, offset = 32
40 -> thread_rank = 40, group16.thread_rank = 8, offset = 32
41 -> thread_rank = 41, group16.thread_rank = 9, offset = 32
42 -> thread_rank = 42, group16.thread_rank = 10, offset = 32
43 -> thread_rank = 43, group16.thread_rank = 11, offset = 32
44 -> thread_rank = 44, group16.thread_rank = 12, offset = 32
45 -> thread_rank = 45, group16.thread_rank = 13, offset = 32
46 -> thread_rank = 46, group16.thread_rank = 14, offset = 32
47 -> thread_rank = 47, group16.thread_rank = 15, offset = 32
48 -> thread_rank = 48, group16.thread_rank = 0, offset = 48
49 -> thread_rank = 49, group16.thread_rank = 1, offset = 48
50 -> thread_rank = 50, group16.thread_rank = 2, offset = 48
51 -> thread_rank = 51, group16.thread_rank = 3, offset = 48
52 -> thread_rank = 52, group16.thread_rank = 4, offset = 48
53 -> thread_rank = 53, group16.thread_rank = 5, offset = 48
54 -> thread_rank = 54, group16.thread_rank = 6, offset = 48
55 -> thread_rank = 55, group16.thread_rank = 7, offset = 48
56 -> thread_rank = 56, group16.thread_rank = 8, offset = 48
57 -> thread_rank = 57, group16.thread_rank = 9, offset = 48
58 -> thread_rank = 58, group16.thread_rank = 10, offset = 48
59 -> thread_rank = 59, group16.thread_rank = 11, offset = 48
60 -> thread_rank = 60, group16.thread_rank = 12, offset = 48
61 -> thread_rank = 61, group16.thread_rank = 13, offset = 48
62 -> thread_rank = 62, group16.thread_rank = 14, offset = 48
63 -> thread_rank = 63, group16.thread_rank = 15, offset = 48

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Finish.

▶ 涨姿势:

● 相关定义

 // Excerpt from the CUDA 9 cooperative-groups headers, reordered for reading.
 // Numeric constants below were reconstructed from the shipped headers —
 // verify against the cooperative_groups.h of your own CUDA toolkit.

// cooperative_groups_helper.h
# if !defined(_CG_QUALIFIER)
# define _CG_QUALIFIER __forceinline__ __device__
# endif

# define die() assert(0);

// cooperative_groups.h (order adjusted)
class thread_group   // generic thread-group type
{
    friend _CG_QUALIFIER thread_group this_thread();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend class thread_block;

protected:
    union __align__(8)
    {
        unsigned int type : 8;
        struct
        {
            unsigned int type : 8;
            unsigned int size : 24;
            unsigned int mask;
        } coalesced;
        struct
        {
            void* ptr[2];
        } buffer;
    } _data;

    _CG_QUALIFIER thread_group operator=(const thread_group& src);

    _CG_QUALIFIER thread_group(__internal::groupType type)
    {
        _data.type = type;
    }
#if __cplusplus >= 201103L
    static_assert(sizeof(_data) == 16, "Failed size check");
#endif

public:
    _CG_QUALIFIER unsigned int size() const;
    _CG_QUALIFIER unsigned int thread_rank() const;
    _CG_QUALIFIER void sync() const;
};

class thread_block : public thread_group
{
    friend _CG_QUALIFIER thread_block this_thread_block();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);

    _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {}

    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const
    {
        // Tile size must be a non-zero power of two no larger than a warp (32).
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz)
        {
            die();
            return (thread_block());
        }
        unsigned int mask;
        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
        unsigned int masklength = min(size() - base_offset, tilesz);
        mask = (unsigned int)(-1) >> (32 - masklength);
        mask <<= (__internal::laneid() & ~(tilesz - 1));
        thread_group tile = thread_group(__internal::CoalescedTile);
        tile._data.coalesced.mask = mask;
        tile._data.coalesced.size = __popc(mask);
        return (tile);
    }

public:
    _CG_QUALIFIER void sync() const { __internal::cta::sync(); }
    _CG_QUALIFIER unsigned int size() const { return (__internal::cta::size()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::cta::thread_rank()); }
    _CG_QUALIFIER dim3 group_index() const { return (__internal::cta::group_index()); }
    _CG_QUALIFIER dim3 thread_index() const { return (__internal::cta::thread_index()); }
};

// Used by the sample code: just invokes the thread_block constructor.
_CG_QUALIFIER thread_block this_thread_block()
{
    return (thread_block());
}

template <unsigned int Size>
class thread_block_tile;
// Only power-of-two tile sizes up to a warp are specialized.
template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { };
template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { };
template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { };
template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { };

template <unsigned int Size>
class __thread_block_tile_base : public thread_group
{
    static const unsigned int numThreads = Size;

    // Lane mask of the warp lanes belonging to this tile.
    _CG_QUALIFIER unsigned int build_mask() const
    {
        unsigned int mask;
        if (numThreads == 32)
            mask = 0xFFFFFFFF;
        else
        {
            mask = (unsigned int)(-1) >> (32 - numThreads);
            mask <<= (__internal::laneid() & (~(numThreads - 1)));
        }
        return (mask);
    }

protected:
    _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile)
    {
        _data.coalesced.mask = build_mask();
        _data.coalesced.size = numThreads;
    }

public:
    _CG_QUALIFIER void sync() const { __syncwarp(build_mask()); }
    _CG_QUALIFIER unsigned int thread_rank() const { return (threadIdx.x & (numThreads - 1)); }
    _CG_QUALIFIER unsigned int size() const { return (numThreads); }

    // PTX supported collectives
    _CG_QUALIFIER int shfl(int var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    // ... (other overloads elided in the original excerpt)
#ifdef _CG_HAS_FP16_COLLECTIVE
    _CG_QUALIFIER __half shfl(__half var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
    // ...
#endif
#ifdef _CG_HAS_MATCH_COLLECTIVE
    _CG_QUALIFIER unsigned int match_any(int val) const
    {
        unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
        return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
    }
    // ...
#endif
};

● 用到的线程协作相关函数

 thread_block threadBlockGroup = this_thread_block();    // pack the current thread block into one cooperative group

 thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup); // split the group into 16-thread tiles

 int in = tiledPartition16.thread_rank();                // this thread's rank inside the tile

 tiledPartition16.sync();                            // synchronize the tile

0_Simple__simpleCooperativeGroups的更多相关文章

随机推荐

  1. spfa【模板】

    #include<iostream> #include<cstdio> #include<cstring> #include<queue> using ...

  2. LG3684 [CERC2016]机棚障碍 Hangar Hurdles

    题意 题目描述 你正在评估一些关于一个巨型飞机仓库的建设计划.飞机仓库的地面可以表示为n行n列的网格图,其中每个格子要么是空的,要么有障碍物.行从上到下依次被编号为1到n,列从左到右依次被编号为1到n ...

  3. ES6 — 箭头函数

    一 为什么要有箭头函数 我们在日常开发中,可能会需要写类似下面的代码 const Person = { 'name': 'little bear', 'age': 18, 'sayHello': fu ...

  4. day20 python sys os time json pickl 正则

    字符组 : [字符组] 在同一个位置可能出现的各种字符组成了一个字符组,在正则表达式中用[]表示 字符分为很多类,比如数字.字母.标点等等. 假如你现在要求一个位置....9这10个数之一. 量词 几 ...

  5. 开源泛域名服务xip.io部署试用

    xip.io 是一个很方便的泛域名服务,类似的有一个xip.name 的开源实现 下载 go get github.com/peterhellberg/xip.name 启动 二进制包在GOPATH/ ...

  6. 转oracle 学习 - 表空间

    Oracle 数据库的表空间和 Oracle 数据库数据文件 关于 Oracle 数据库的表空间. 很多 Oracle 初学者弄不明白表空间的概念和意义,他们只知道给数据库建表的时候需要到表空间这个东 ...

  7. c#数据库訪问返回值类型为SqlDataReader时使用using时注意的问题

    版权声明:本文为博主原创文章,未经博主同意不得转载. https://blog.csdn.net/u010512579/article/details/24011761 在封装通用 SQLSERVER ...

  8. ZooKeeper 知识点

    zookeeper 命令: 命令 说明 ./zkServer.sh start 启动ZooKeeper(终端执行) ./zkServer.sh stop 停止ZooKeeper(终端执行) ./zkC ...

  9. commonJS模块规范 和 es6模块规范 区别

    ES6 模块与 CommonJS 模块的差异 CommonJS 模块输出的是一个值的拷贝,ES6 模块输出的是值的引用. CommonJS 模块是运行时加载,ES6 模块是编译时输出接口. Commo ...

  10. gitlab怎么删除创建的项目

    在gitlab新建了一个项目,怎么将此项目删除呢?打开这个工程,点击右上角的“settings”拉到最下面,有个show them to me,点击~在下拉选项的最后,有个remove框,点击即可以彻 ...