使用 OpenMP 和 pthreads 两种环境,利用统一内存编址(Unified Memory),计算基本的矩阵向量乘法(gemv) result = α * A * x + β * result 。

▶ 源代码

 #include <cstdio>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h> //#define USE_PTHREADS // 使用 pthread 时补充定义 USE_PTHREADS
#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif // Windows 系统需要构造与函数 SRAND48 和 DRAND48 等价的随机函数
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif template <typename T> struct Task// struct 也可使用类的构造和析构函数
{
unsigned int size, id;
T *data;
T *result;
T *vector; Task() : size(), id(), data(NULL), result(NULL), vector(NULL) {};
Task(unsigned int s) : size(s), id(), data(NULL), result(NULL)
{
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize();
} ~Task()
{
cudaDeviceSynchronize();
cudaFree(data);
cudaFree(result);
cudaFree(vector);
} void allocate(const unsigned int s, const unsigned int unique_id)// 申请内存,初始化各成员数组
{
id = unique_id;
size = s;
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize(); for (int i = ; i < size*size; i++)
data[i] = drand48();
for (int i = ; i < size; i++)
{
result[i] = .;
vector[i] = drand48();
}
}
}; #ifdef USE_PTHREADS// 封装 pthread 型的任务
// Per-thread argument bundle for the pthread backend.
struct threadData_t
{
    int tid;                    // worker index
    Task<double> *TaskListPtr;  // first task of this worker's slice
    cudaStream_t *streams;      // shared stream array (stream 0 is for host-side tasks)
    cublasHandle_t *handles;    // shared cuBLAS handle array
    int taskSize;               // number of tasks in this worker's slice
};
typedef struct threadData_t threadData;
#endif

// Host gemv reference: result = alpha * A * x + beta * result,
// where A is m rows by n columns, stored row-major.
// (An earlier version of this code looped to n instead of m and dropped
//  alpha; both defects are fixed here.)
template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result)
{
    for (int i = 0; i < m; i++)
    {
        result[i] *= *beta;
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i*n + j] * x[j];
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
// pthread entry point: process this worker's slice of tasks.
// Small tasks run on the host (their managed buffers are attached to the
// host via stream 0); large tasks run on the device through cuBLAS, each
// worker using its own stream/handle at index tid+1.
void * execute(void* inpArgs)
{
    threadData *dataPtr = (threadData *) inpArgs;
    cudaStream_t *stream = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double> &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100)  // small problems stay on the host, large ones go to the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);
            // Attach the managed buffers to the host so CPU access is safe.
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]);
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device",t.id,tid,t.size);
            cublasSetStream(handle[tid+1], stream[tid+1]);
            // Bind the buffers to this worker's stream only.
            cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
            // NOTE(review): cuBLAS is column-major, so CUBLAS_OP_N on this
            // row-major buffer computes A^T * x, not the same product as the
            // host gemv above; the upstream sample behaves identically and
            // the data is random/unchecked, so this is kept as-is.
            cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
#else
// OpenMP worker: run one task on host or device depending on its size.
// Small tasks run on the host (managed buffers attached to the host via
// stream 0); large ones use this thread's dedicated stream/handle (tid+1).
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100)  // small problems stay on the host, large ones go to the device
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host",t.id,tid,t.size);
        // Attach the managed buffers to the host so CPU access is safe.
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]);
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else
    {
        // (log format unified with the pthread backend: "size [%4d]")
        printf("\nTask [%2d], thread [%2d], size [%4d], on device",t.id,tid,t.size);
        cublasSetStream(handle[tid+1], stream[tid+1]);
        // Bind the buffers to this thread's stream only.
        cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
        // NOTE(review): column-major cuBLAS + CUBLAS_OP_N computes A^T * x on
        // this row-major data; matches the upstream sample, data is random.
        cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
#endif

// Give every task a random size in [64, 1000) and fill it with random data.
template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++)
    {
        int size;
        size = std::max((int)(drand48()*1000.0), 64);  // enforce a sensible minimum size
        TaskList[i].allocate(size, i);
    }
}

int main()
{
    printf("\n\tStart.\n");

    cudaDeviceProp device_prop;
    cudaGetDeviceProperties(&device_prop, 0);
    if (!device_prop.managedMemory)  // this sample requires unified memory
    {
        printf("\n\tUnified Memory not supported\n");
        getchar();
        return 1;
    }
    if (device_prop.computeMode == cudaComputeModeProhibited)  // device refuses compute work
    {
        printf("\n\tComputeMode is cudaComputeModeProhibited\n");
        getchar();
        return 1;
    }

    srand48(time(NULL));
    const int nthreads = 4;  // number of CPU worker threads
    // stream/handle 0 is reserved for host-side tasks; 1..nthreads belong to the workers
    cudaStream_t *streams = new cudaStream_t[nthreads+1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamCreate(&streams[i]);
        cublasCreate(&handles[i]);
    }

    unsigned int N = 40;  // total number of tasks
    std::vector<Task<double> > TaskList(N);
    initialise_tasks(TaskList);
    cudaSetDevice(0);

#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];
    const int temp = (int)TaskList.size() / nthreads;       // even share per thread
    const int remainder = (int)TaskList.size() % nthreads;  // leftover tasks
    for (int i = 0; i < nthreads; i++)
    {
        InputToThreads[i].tid = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;
        // Even split; the LAST thread additionally takes the leftover tasks.
        // (The original indexed the last slice at i*temp + remainder, which
        //  skips 'remainder' tasks and reads past the end of TaskList; the
        //  slice must still start at i*temp. This form also covers the case
        //  of fewer tasks than threads: temp == 0 gives the last thread all
        //  N tasks and the others none.)
        InputToThreads[i].taskSize = (i == nthreads - 1) ? temp + remainder : temp;
        InputToThreads[i].TaskListPtr = &TaskList[i * temp];
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
    delete [] InputToThreads;
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)TaskList.size(); i++)
    {
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // Cleanup: destroy per-thread streams/handles, then release the tasks
    // (Task destructors free the managed buffers) before exiting.
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }
    delete [] streams;
    delete [] handles;
    std::vector< Task<double> >().swap(TaskList);

    printf("\n\tFinish.\n");
    getchar();
    return 0;
}

▶ 输出结果:OpenMP

    Start.

Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Finish.

▶ 输出结果:pthreads

    Start.

Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on host
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Finish.

▶ 涨姿势:

● 使用 C++ 结构体完成了类似类的方法。即在结构体中定义构造函数、析构函数及其他方法。

● 使用了 cuBLAS 库,注意句柄的使用和库函数的调用。

● 用到的申请内存的函数

 // driver_types.h
#define cudaMemAttachGlobal 0x01 // 内存可被任意流(整个设备)访问
#define cudaMemAttachHost   0x02 // 内存驻留主机侧,设备端访问受限
#define cudaMemAttachSingle 0x04 // 内存仅绑定到单个流访问

// cuda_runtime.h
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
{
return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
}

// cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));

0_Simple__UnifiedMemoryStreams的更多相关文章

随机推荐

  1. Redis安装和主要功能简介

    Redis安装和主要功能简介   Redis(https://redis.io/), 是一个内存中的数据结构存储系统,它可以用作数据库.缓存和消息中间件. 安装Redis 我很少在开发机中直接装各种数 ...

  2. atitit.加入win 系统服务 bat批处理程序服务的法总结instsrv srvany java linux

    atitit.加入win 系统服务 bat批处理程序服务的法总结instsrv srvany  java linux 系统服务不同于普通视窗系统应用程式.不可能简简单单地通过执行一个EXE就启动视窗系 ...

  3. 移植RTL8188CUS USB-WIFI(移植失败)

    1.主makefile CONFIG_POWER_SAVING = n CONFIG_PLATFORM_I386_PC = n CONFIG_PLATFORM_HI3518E = y ##swann ...

  4. 【转】每天一个linux命令(55):traceroute命令

    原文网址:http://www.cnblogs.com/peida/archive/2013/03/07/2947326.html 通过traceroute我们可以知道信息从你的计算机到互联网另一端的 ...

  5. stylus 知识点

    循环的范围可以用两个小数点..表示,如(1..10)就是从1到10,并且包括1和10 for in 的循环范围写法: for index in 1 2 3 等价于: $li_length = 3 fo ...

  6. linq to sql 项目移植后,数据库实体类需要重新创建?

    项目中,使用LINQ to SQL 访问数据库,代码移植到其他机器上,每次需要重新生成dbml文件,有无方法只要更改app.config呢? 经过试验是可行的: 1.引用system.configur ...

  7. JZ2440 裸机驱动 第7章 内存管理单元MMU

    本章目标:     了解虚拟地址和物理地址的关系:     掌握如何通过设置MMU来控制虚拟地址到物理地址的转化:     了解MMU的内存访问权限机制:     了解TLB.Cache.Write ...

  8. TS流的解析

    个字节不一定都是有效数据,有一些可能为填充数据). 工作形式: 因为在TS流里可以填入很多种东西,所以有必要有一种机制来确定怎么来标识这些数据.制定TS流标准的机构就规定了一些数据结构来定义.比如: ...

  9. android自定义控件的一个思路-入门

    转自:http://blog.sina.com.cn/s/blog_691051e10101a3by.html   很多时候没有我们需要使用的控件,或者控件并不美观.比如这个滑动开关,这是androi ...

  10. CSS Grid布局入门

    相信大家都比较熟悉flex布局了,最近有空研究了波grid布局,感觉虽然兼容性还不是太高,应用不是太普遍,但是功能非常强大.未来应该是grid+flex为主流,grid是二维布局,很灵活,适合整体构架 ...