0_Simple__UnifiedMemoryStreams
Using both an OpenMP and a pthreads build, the program exercises unified memory addressing (managed memory) to compute a basic matrix-vector product, result = α * A * x + β * result (GEMV).
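Before the full listing, here is a minimal sketch of the one idea everything below relies on (this is not part of the sample; the kernel name and sizes are made up for illustration): a single pointer obtained from cudaMallocManaged is written by the host, updated by the device, and read back by the host, with no explicit cudaMemcpy anywhere.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(double *v, double alpha, int n) // trivial kernel: v[i] *= alpha
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] *= alpha;
}

int main()
{
    const int n = 8;
    double *v = NULL;
    cudaMallocManaged(&v, n * sizeof(double)); // one allocation, visible to host and device
    for (int i = 0; i < n; i++) v[i] = i;      // host writes through the pointer directly
    scale<<<1, n>>>(v, 2.0, n);                // device updates the same pointer
    cudaDeviceSynchronize();                   // required before the host touches it again
    for (int i = 0; i < n; i++) printf("%g ", v[i]);
    printf("\n");
    cudaFree(v);
    return 0;
}

The sample below does the same thing at a larger scale: every Task keeps its matrix and vectors in managed memory, so the same pointers can be handed either to the CPU gemv or to cublasDgemv.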
▶ Source code
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

//#define USE_PTHREADS // define USE_PTHREADS to build the pthread version
#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif

// Windows needs equivalents of the srand48 and drand48 functions
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

template <typename T> struct Task // a struct can have constructors, a destructor and other methods, just like a class
{
    unsigned int size, id;
    T *data;
    T *result;
    T *vector;

    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL) {};

    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL), vector(NULL)
    {
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();
    }

    ~Task()
    {
        cudaDeviceSynchronize();
        cudaFree(data);
        cudaFree(result);
        cudaFree(vector);
    }

    void allocate(const unsigned int s, const unsigned int unique_id) // allocate managed memory and initialise the member arrays
    {
        id = unique_id;
        size = s;
        cudaMallocManaged(&data, sizeof(T)*size*size);
        cudaMallocManaged(&result, sizeof(T)*size);
        cudaMallocManaged(&vector, sizeof(T)*size);
        cudaDeviceSynchronize();

        for (unsigned int i = 0; i < size*size; i++)
            data[i] = drand48();
        for (unsigned int i = 0; i < size; i++)
        {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS // per-thread task bundle for the pthread version
struct threadData_t
{
    int tid;
    Task<double> *TaskListPtr;
    cudaStream_t *streams;
    cublasHandle_t *handles;
    int taskSize;
};

typedef struct threadData_t threadData;
#endif

template <typename T> void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result) // compute result = α * A * x + β * result
{
    for (int i = 0; i < m; i++) // the original sample wrote n here and dropped the alpha factor below
    {
        result[i] *= *beta;
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i*n + j] * x[j];
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void * execute(void* inpArgs)
{
    threadData *dataPtr = (threadData *) inpArgs;
    cudaStream_t *stream = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double> &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100) // small tasks run on the host, larger ones on the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]);
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
            cublasSetStream(handle[tid+1], stream[tid+1]);
            cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
            cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100) // small tasks run on the host, larger ones on the device
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]);
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
        cublasSetStream(handle[tid+1], stream[tid+1]);
        cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
        cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha, t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
#endif

template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++)
    {
        int size;
        size = std::max((int)(drand48()*1000.0), 64); // random size between 64 and roughly 1000
        TaskList[i].allocate(size, i);
    }
}

int main()
{
    printf("\n\tStart.\n");

    cudaDeviceProp device_prop;
    cudaGetDeviceProperties(&device_prop, 0);
    if (!device_prop.managedMemory)
    {
        printf("\n\tUnified Memory not supported\n");
        getchar();
        return 1;
    }
    if (device_prop.computeMode == cudaComputeModeProhibited) // the device is prohibited from accepting new work
    {
        printf("\n\tComputeMode is cudaComputeModeProhibited\n");
        getchar();
        return 1;
    }

    srand48(time(NULL));
    const int nthreads = 4;
    cudaStream_t *streams = new cudaStream_t[nthreads+1];     // one extra stream for the host path
    cublasHandle_t *handles = new cublasHandle_t[nthreads+1];
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamCreate(&streams[i]);
        cublasCreate(&handles[i]);
    }

    unsigned int N = 40;
    std::vector<Task<double> > TaskList(N);
    initialise_tasks(TaskList);
    cudaSetDevice(0);

#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];
    int temp = TaskList.size() / nthreads;
    for (int i = 0; i < nthreads; i++)
    {
        InputToThreads[i].tid = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if (temp == 0) // fewer tasks than threads
        {
            InputToThreads[i].taskSize = 0;
            InputToThreads[i].TaskListPtr = &TaskList[0];
        }
        else // at least as many tasks as threads: split them evenly and give the whole remainder to the last thread
        {
            if (i == nthreads - 1)
            {
                InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
                InputToThreads[i].TaskListPtr = &TaskList[i*temp + (TaskList.size() % nthreads)];
            }
            else
            {
                InputToThreads[i].taskSize = temp;
                InputToThreads[i].TaskListPtr = &TaskList[i*temp];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)TaskList.size(); i++)
    {
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // clean up
    for (int i = 0; i < nthreads+1; i++)
    {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }
    std::vector< Task<double> >().swap(TaskList);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}
▶ Output: OpenMP
Start.

(40 task lines follow, one per task, of the form "Task [id], thread [tid], size [size], on device"; in this OpenMP run every task was dispatched to the device.)

Finish.
▶ Output: pthreads
Start.

(40 task lines follow, one per task, of the form "Task [id], thread [tid], size [size], on host/device"; in this pthreads run most tasks ran on the device and four small ones ran on the host.)

Finish.
▶ Takeaways:
● A C++ struct is used much like a class here: it defines its own constructor, destructor and other member functions.
● The cuBLAS library is used; note how the handles are created, bound to streams and passed to the library calls (see the sketch after this list).
● Declarations of the memory-attach flags and functions used above:
// driver_types.h
#define cudaMemAttachGlobal 0x01 // memory is accessible from any stream on any device
#define cudaMemAttachHost   0x02 // memory cannot be accessed by any stream on any device (host access only)
#define cudaMemAttachSingle 0x04 // memory is accessible only from the single stream it is attached to

// cuda_runtime.h
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
{
    return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
}

// cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
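To make the last two points concrete, here is a small self-contained sketch (not taken from the sample; the matrix size, values and single stream are assumptions for illustration) that creates one cuBLAS handle, binds it to a stream, attaches three managed buffers to that stream with cudaMemAttachSingle, and runs one cublasDgemv on them:

#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main()
{
    const int n = 256; // assumed task size
    double *A = NULL, *x = NULL, *y = NULL;
    cudaMallocManaged(&A, n * n * sizeof(double));
    cudaMallocManaged(&x, n * sizeof(double));
    cudaMallocManaged(&y, n * sizeof(double));
    for (int i = 0; i < n * n; i++) A[i] = 1.0;
    for (int i = 0; i < n; i++) { x[i] = 1.0; y[i] = 0.0; }

    cudaStream_t stream;
    cublasHandle_t handle;
    cudaStreamCreate(&stream);
    cublasCreate(&handle);
    cublasSetStream(handle, stream); // all work issued through this handle goes into this stream

    // bind the managed buffers to the stream that will use them
    cudaStreamAttachMemAsync(stream, A, 0, cudaMemAttachSingle);
    cudaStreamAttachMemAsync(stream, x, 0, cudaMemAttachSingle);
    cudaStreamAttachMemAsync(stream, y, 0, cudaMemAttachSingle);

    double alpha = 1.0, beta = 0.0;
    cublasDgemv(handle, CUBLAS_OP_N, n, n, &alpha, A, n, x, 1, &beta, y, 1); // y = alpha*A*x + beta*y
    cudaStreamSynchronize(stream); // make the result visible to the host again
    printf("y[0] = %g (expected %d)\n", y[0], n);

    cublasDestroy(handle);
    cudaStreamDestroy(stream);
    cudaFree(A); cudaFree(x); cudaFree(y);
    return 0;
}

Attaching a buffer to a single stream tells the runtime that only work in that stream will touch it; that is what lets the sample keep several device tasks in flight on different streams while the host simultaneously processes the small tasks whose buffers were attached with cudaMemAttachHost.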