Using both OpenMP and pthreads, together with Unified Memory addressing, compute the basic matrix-vector product result = α * A * x + β * result.
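
Before the full sample, here is a minimal sketch (my own illustration, not part of the original sample) of what Unified Memory buys us: a single pointer returned by cudaMallocManaged is legal on both the host and the device, so there is no explicit cudaMemcpy anywhere.

#include <cstdio>
#include <cuda_runtime.h>

// doubles every element; ordinary device code, but the pointer it receives
// was allocated with cudaMallocManaged
__global__ void scale(double *v, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] *= 2.0;
}

int main()
{
    const int n = 8;
    double *v = NULL;
    cudaMallocManaged(&v, n * sizeof(double));   // one pointer, visible to host and device

    for (int i = 0; i < n; i++) v[i] = i;        // host writes directly, no cudaMemcpy

    scale<<<1, 32>>>(v, n);                      // the device reads/writes the same pointer
    cudaDeviceSynchronize();                     // wait before the host touches it again

    for (int i = 0; i < n; i++) printf("%g ", v[i]);
    printf("\n");

    cudaFree(v);
    return 0;
}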

▶ Source code

 #include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to build the pthreads version

#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif

// Windows needs equivalents of the srand48 and drand48 functions
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

template <typename T> struct Task// a struct may define constructors, a destructor and other member functions just like a class
{
unsigned int size, id;
T *data;
T *result;
T *vector;
Task() : size(), id(), data(NULL), result(NULL), vector(NULL) {};
Task(unsigned int s) : size(s), id(), data(NULL), result(NULL), vector(NULL)
{
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize();
}
~Task()
{
cudaDeviceSynchronize();
cudaFree(data);
cudaFree(result);
cudaFree(vector);
}
void allocate(const unsigned int s, const unsigned int unique_id)// allocate managed memory and initialise the member arrays
{
id = unique_id;
size = s;
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize();
for (int i = 0; i < size*size; i++)
data[i] = drand48();
for (int i = 0; i < size; i++)
{
result[i] = 0.;
vector[i] = drand48();
}
}
};

#ifdef USE_PTHREADS// bundle the per-thread task data for the pthreads version
struct threadData_t
{
int tid;
Task<double> *TaskListPtr;
cudaStream_t *streams;
cublasHandle_t *handles;
int taskSize;
};
typedef struct threadData_t threadData;
#endif

// compute result = alpha * A * x + beta * result on the host
template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result)
{
for (int i = 0; i < m; i++)// the original CUDA sample loops to n here and omits alpha below; both are fixed
{
result[i] *= *beta;
for (int j = 0; j < n; j++)
result[i] += *alpha * A[i*n + j] * x[j];
}
} // execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void * execute(void* inpArgs)
{
threadData *dataPtr = (threadData *) inpArgs;
cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles;
int tid = dataPtr->tid;
for (int i = 0; i < dataPtr->taskSize; i++)
{
Task<double> &t = dataPtr->TaskListPtr[i];
double alpha = 1.0;
double beta = 0.0;
if (t.size < 100)// small tasks run on the host, larger ones on the device
{
printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
cudaStreamSynchronize(stream[0]);
gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
}
else
{
printf("\nTask [%2d], thread [%2d], size [%4d], on device",t.id,tid,t.size);
cublasSetStream(handle[tid+1], stream[tid+1]);
cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
}
}
return NULL;
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
double alpha = 1.0;
double beta = 0.0;
if (t.size < 100)// small tasks run on the host, larger ones on the device
{
printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
cudaStreamSynchronize(stream[0]);
gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
}
else
{
printf("\nTask [%2d], thread [%2d], size[%4d], on device",t.id,tid,t.size);
cublasSetStream(handle[tid+1], stream[tid+1]);
cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
}
}
#endif

// create each task with a random size
template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
for (unsigned int i = 0; i < TaskList.size(); i++)
{
int size;
size = std::max((int)(drand48()*1000.0), 64);
TaskList[i].allocate(size, i);
}
}

int main()
{
printf("\n\tStart.\n");
cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, 0);
if (!device_prop.managedMemory)
{
printf("\n\tUnified Memory not supported\n");
getchar();
return 1;
}
if (device_prop.computeMode == cudaComputeModeProhibited)// the device is in prohibited compute mode and cannot be used
{
printf("\n\tComputeMode is cudaComputeModeProhibited\n");
getchar();
return 1;
}
srand48(time(NULL));
const int nthreads = 4;
// one extra stream / handle: stream[0] is reserved for host-side tasks
cudaStream_t *streams = new cudaStream_t[nthreads+1];
cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
for (int i=0; i<nthreads+1; i++)
{
cudaStreamCreate(&streams[i]);
cublasCreate(&handles[i]);
}
unsigned int N = 40;
std::vector<Task<double> > TaskList(N);
initialise_tasks(TaskList);
cudaSetDevice(0);

#ifdef USE_PTHREADS
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
int temp = TaskList.size() / nthreads;
for (int i=0; i < nthreads; i++)
{
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles;
if (temp == 0) // fewer tasks than threads
{
InputToThreads[i].taskSize = 0;
InputToThreads[i].TaskListPtr = &TaskList[0];
}
else // at least as many tasks as threads: split them evenly and give the remainder to the last thread
{
if (i == nthreads - 1)
{
InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i*temp];// the last chunk starts right after the previous one; offsetting by the remainder as well would skip tasks and overrun the vector
}
else
{
InputToThreads[i].taskSize = temp;
InputToThreads[i].TaskListPtr = &TaskList[i*temp];
}
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i=0; i < nthreads; i++)
pthread_join(threads[i], NULL);
#else
omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
for (int i=0; i < (int)TaskList.size(); i++)
{
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
#endif
cudaDeviceSynchronize();
// clean up
for (int i=0; i<nthreads+1; i++)
{
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
std::vector< Task<double> >().swap(TaskList);
printf("\n\tFinish.\n");
getchar();
return 0;
}
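
One possible way to build (an assumption on my part, not from the original post: the file name and the exact flags may differ on your setup). On Linux with gcc as the host compiler:

nvcc -Xcompiler -fopenmp UnifiedMemoryStreams.cu -o ums -lcublas -lgomp        # OpenMP version
nvcc -DUSE_PTHREADS UnifiedMemoryStreams.cu -o ums -lcublas -lpthread          # pthreads version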

▶ Output: OpenMP

    Start.

Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Finish.

▶ Output: pthreads

    Start.

Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on host
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Finish.

▶ Takeaways:

● A C++ struct is used much like a class: it defines a constructor, a destructor, and other member functions.
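
For instance, a stripped-down version of that pattern (illustrative only; the type name is mine) ties the lifetime of a managed allocation to the object:

#include <cuda_runtime.h>

// Managed buffer whose lifetime is tied to the object, mirroring the Task struct above
template <typename T>
struct ManagedBuffer
{
    T *ptr;
    size_t n;

    explicit ManagedBuffer(size_t count) : ptr(NULL), n(count)
    {
        cudaMallocManaged(&ptr, n * sizeof(T));  // allocated in the constructor
    }
    ~ManagedBuffer()
    {
        cudaDeviceSynchronize();                 // make sure the device is done with it
        cudaFree(ptr);                           // released in the destructor
    }
};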

● The cuBLAS library is used; note how the handle is created and then passed to every library call.
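
As an illustration of the handle lifecycle (a sketch of my own, not taken from the sample; remember that cuBLAS assumes column-major storage and takes the scalars by pointer):

#include <cublas_v2.h>
#include <cuda_runtime.h>

// y = alpha * A * x + beta * y for an n x n matrix A (column-major, as cuBLAS expects)
void gemv_with_cublas(double *A, double *x, double *y, int n, cudaStream_t stream)
{
    cublasHandle_t handle;
    cublasCreate(&handle);                // every cuBLAS call goes through a handle
    cublasSetStream(handle, stream);      // bind the handle to a stream

    double alpha = 1.0, beta = 0.0;       // scalars are passed by pointer
    cublasDgemv(handle, CUBLAS_OP_N, n, n,
                &alpha, A, n,             // lda = n
                x, 1,                     // incx = 1
                &beta, y, 1);             // incy = 1

    cublasDestroy(handle);                // destroy the handle when done
}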

● The memory-attachment flags and function used by the sample (from the CUDA headers):

 // driver_types.h
#define cudaMemAttachGlobal 0x01 // 可访问内存
#define cudaMemAttachHost 0x02 // 不可访问内存
#define cudaMemAttachSingle 0x04 // 单线程可访问内存 // cuda_runtime.h
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = , unsigned int flags = cudaMemAttachSingle)
{
return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
} // cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(), unsigned int flags __dv(cudaMemAttachSingle));
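
A minimal usage sketch of the attach calls (my own example; `stream` is assumed to be a created stream and `managed` a cudaMallocManaged allocation): attach to the host before CPU work, synchronize so the asynchronous attach has taken effect, then attach to a single stream before enqueuing device work on it.

#include <cuda_runtime.h>

// Hand a managed buffer back and forth between host work and one stream's device work
void attach_example(cudaStream_t stream, double *managed, size_t bytes)
{
    // Host phase: detach from device streams so the CPU may touch the memory
    // even while kernels run in other streams (length 0 = the whole allocation).
    cudaStreamAttachMemAsync(stream, managed, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream);       // the attach is asynchronous; wait for it
    managed[0] = 1.0;                    // host access is now safe

    // Device phase: attach the buffer to exactly this stream before device work.
    cudaStreamAttachMemAsync(stream, managed, 0, cudaMemAttachSingle);
    // ... enqueue kernels or cuBLAS calls on `stream` that use `managed` ...
}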
