Using both OpenMP and pthreads, together with Unified Memory addressing, compute the basic matrix-vector product result = α * A * x + β * result.
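
Before the full sample, here is a minimal sketch (my own illustration, not part of the original sample) of what Unified Memory buys us: a single pointer returned by cudaMallocManaged is legal on both the host and the device, so there is no explicit cudaMemcpy anywhere.

#include <cstdio>
#include <cuda_runtime.h>

// doubles every element; ordinary device code, but the pointer it receives
// was allocated with cudaMallocManaged
__global__ void scale(double *v, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] *= 2.0;
}

int main()
{
    const int n = 8;
    double *v = NULL;
    cudaMallocManaged(&v, n * sizeof(double));   // one pointer, visible to host and device

    for (int i = 0; i < n; i++) v[i] = i;        // host writes directly, no cudaMemcpy

    scale<<<1, 32>>>(v, n);                      // the device reads/writes the same pointer
    cudaDeviceSynchronize();                     // wait before the host touches it again

    for (int i = 0; i < n; i++) printf("%g ", v[i]);
    printf("\n");

    cudaFree(v);
    return 0;
}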

▶ Source code

 #include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to build the pthreads version

#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif

// Windows needs equivalents of the srand48 and drand48 functions
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

template <typename T> struct Task// a struct may define constructors, a destructor and other member functions just like a class
{
unsigned int size, id;
T *data;
T *result;
T *vector;
Task() : size(), id(), data(NULL), result(NULL), vector(NULL) {};
Task(unsigned int s) : size(s), id(), data(NULL), result(NULL), vector(NULL)
{
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize();
}
~Task()
{
cudaDeviceSynchronize();
cudaFree(data);
cudaFree(result);
cudaFree(vector);
}
void allocate(const unsigned int s, const unsigned int unique_id)// allocate managed memory and initialise the member arrays
{
id = unique_id;
size = s;
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize();
for (int i = 0; i < size*size; i++)
data[i] = drand48();
for (int i = 0; i < size; i++)
{
result[i] = 0.;
vector[i] = drand48();
}
}
};

#ifdef USE_PTHREADS// bundle the per-thread task data for the pthreads version
struct threadData_t
{
int tid;
Task<double> *TaskListPtr;
cudaStream_t *streams;
cublasHandle_t *handles;
int taskSize;
};
typedef struct threadData_t threadData;
#endif

// compute result = alpha * A * x + beta * result on the host
template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result)
{
for (int i = 0; i < m; i++)// the original CUDA sample loops to n here and omits alpha below; both are fixed
{
result[i] *= *beta;
for (int j = 0; j < n; j++)
result[i] += *alpha * A[i*n + j] * x[j];
}
} // execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void * execute(void* inpArgs)
{
threadData *dataPtr = (threadData *) inpArgs;
cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles;
int tid = dataPtr->tid;
for (int i = 0; i < dataPtr->taskSize; i++)
{
Task<double> &t = dataPtr->TaskListPtr[i];
double alpha = 1.0;
double beta = 0.0;
if (t.size < 100)// small tasks run on the host, larger ones on the device
{
printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
cudaStreamSynchronize(stream[0]);
gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
}
else
{
printf("\nTask [%2d], thread [%2d], size [%4d], on device",t.id,tid,t.size);
cublasSetStream(handle[tid+1], stream[tid+1]);
cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
}
}
return NULL;
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
double alpha = 1.0;
double beta = 0.0;
if (t.size < 100)// small tasks run on the host, larger ones on the device
{
printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
cudaStreamSynchronize(stream[0]);
gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
}
else
{
printf("\nTask [%2d], thread [%2d], size[%4d], on device",t.id,tid,t.size);
cublasSetStream(handle[tid+1], stream[tid+1]);
cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
}
}
#endif

// create each task with a random size
template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
for (unsigned int i = 0; i < TaskList.size(); i++)
{
int size;
size = std::max((int)(drand48()*1000.0), 64);
TaskList[i].allocate(size, i);
}
}

int main()
{
printf("\n\tStart.\n");
cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, 0);
if (!device_prop.managedMemory)
{
printf("\n\tUnified Memory not supported\n");
getchar();
return 1;
}
if (device_prop.computeMode == cudaComputeModeProhibited)// the device is in prohibited compute mode and cannot be used
{
printf("\n\tComputeMode is cudaComputeModeProhibited\n");
getchar();
return 1;
}
srand48(time(NULL));
const int nthreads = 4;
// one extra stream / handle: stream[0] is reserved for host-side tasks
cudaStream_t *streams = new cudaStream_t[nthreads+1];
cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
for (int i=0; i<nthreads+1; i++)
{
cudaStreamCreate(&streams[i]);
cublasCreate(&handles[i]);
}
unsigned int N = 40;
std::vector<Task<double> > TaskList(N);
initialise_tasks(TaskList);
cudaSetDevice(0);

#ifdef USE_PTHREADS
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
int temp = TaskList.size() / nthreads;
for (int i=0; i < nthreads; i++)
{
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles;
if (temp == 0) // fewer tasks than threads
{
InputToThreads[i].taskSize = 0;
InputToThreads[i].TaskListPtr = &TaskList[0];
}
else // at least as many tasks as threads: split them evenly and give the remainder to the last thread
{
if (i == nthreads - 1)
{
InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i*temp];// the last chunk starts right after the previous one; offsetting by the remainder as well would skip tasks and overrun the vector
}
else
{
InputToThreads[i].taskSize = temp;
InputToThreads[i].TaskListPtr = &TaskList[i*temp];
}
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i=0; i < nthreads; i++)
pthread_join(threads[i], NULL);
#else
omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
for (int i=0; i < (int)TaskList.size(); i++)
{
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
#endif
cudaDeviceSynchronize();
// clean up
for (int i=0; i<nthreads+1; i++)
{
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
std::vector< Task<double> >().swap(TaskList);
printf("\n\tFinish.\n");
getchar();
return 0;
}
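
One possible way to build (an assumption on my part, not from the original post: the file name and the exact flags may differ on your setup). On Linux with gcc as the host compiler:

nvcc -Xcompiler -fopenmp UnifiedMemoryStreams.cu -o ums -lcublas -lgomp        # OpenMP version
nvcc -DUSE_PTHREADS UnifiedMemoryStreams.cu -o ums -lcublas -lpthread          # pthreads version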

▶ Output: OpenMP

    Start.

Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Finish.

▶ Output: pthreads

    Start.

Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on host
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Finish.

▶ Takeaways:

● A C++ struct is used much like a class: it defines a constructor, a destructor, and other member functions.
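
For instance, a stripped-down version of that pattern (illustrative only; the type name is mine) ties the lifetime of a managed allocation to the object:

#include <cuda_runtime.h>

// Managed buffer whose lifetime is tied to the object, mirroring the Task struct above
template <typename T>
struct ManagedBuffer
{
    T *ptr;
    size_t n;

    explicit ManagedBuffer(size_t count) : ptr(NULL), n(count)
    {
        cudaMallocManaged(&ptr, n * sizeof(T));  // allocated in the constructor
    }
    ~ManagedBuffer()
    {
        cudaDeviceSynchronize();                 // make sure the device is done with it
        cudaFree(ptr);                           // released in the destructor
    }
};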

● The cuBLAS library is used; note how the handle is created and then passed to every library call.
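
As an illustration of the handle lifecycle (a sketch of my own, not taken from the sample; remember that cuBLAS assumes column-major storage and takes the scalars by pointer):

#include <cublas_v2.h>
#include <cuda_runtime.h>

// y = alpha * A * x + beta * y for an n x n matrix A (column-major, as cuBLAS expects)
void gemv_with_cublas(double *A, double *x, double *y, int n, cudaStream_t stream)
{
    cublasHandle_t handle;
    cublasCreate(&handle);                // every cuBLAS call goes through a handle
    cublasSetStream(handle, stream);      // bind the handle to a stream

    double alpha = 1.0, beta = 0.0;       // scalars are passed by pointer
    cublasDgemv(handle, CUBLAS_OP_N, n, n,
                &alpha, A, n,             // lda = n
                x, 1,                     // incx = 1
                &beta, y, 1);             // incy = 1

    cublasDestroy(handle);                // destroy the handle when done
}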

● The memory-attachment flags and function used by the sample (from the CUDA headers):

 // driver_types.h
#define cudaMemAttachGlobal 0x01 // 可访问内存
#define cudaMemAttachHost 0x02 // 不可访问内存
#define cudaMemAttachSingle 0x04 // 单线程可访问内存 // cuda_runtime.h
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = , unsigned int flags = cudaMemAttachSingle)
{
return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
} // cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(), unsigned int flags __dv(cudaMemAttachSingle));
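
A minimal usage sketch of the attach calls (my own example; `stream` is assumed to be a created stream and `managed` a cudaMallocManaged allocation): attach to the host before CPU work, synchronize so the asynchronous attach has taken effect, then attach to a single stream before enqueuing device work on it.

#include <cuda_runtime.h>

// Hand a managed buffer back and forth between host work and one stream's device work
void attach_example(cudaStream_t stream, double *managed, size_t bytes)
{
    // Host phase: detach from device streams so the CPU may touch the memory
    // even while kernels run in other streams (length 0 = the whole allocation).
    cudaStreamAttachMemAsync(stream, managed, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream);       // the attach is asynchronous; wait for it
    managed[0] = 1.0;                    // host access is now safe

    // Device phase: attach the buffer to exactly this stream before device work.
    cudaStreamAttachMemAsync(stream, managed, 0, cudaMemAttachSingle);
    // ... enqueue kernels or cuBLAS calls on `stream` that use `managed` ...
}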
