使用 OpenMP 和 pthreads 两种环境,利用统一内存编址(Unified Memory),计算基本的矩阵向量乘法 result = α * A * x + β * result。

▶ 源代码

 #include <cstdio>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h> //#define USE_PTHREADS // 使用 pthread 时补充定义 USE_PTHREADS
#ifdef USE_PTHREADS
#include <pthread.h>
#pragma comment(lib, "pthreadVC2.lib")
#else
#include <omp.h>
#endif // Windows 系统需要构造与函数 SRAND48 和 DRAND48 等价的随机函数
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif template <typename T> struct Task// struct 也可使用类的构造和析构函数
{
unsigned int size, id;
T *data;
T *result;
T *vector; Task() : size(), id(), data(NULL), result(NULL), vector(NULL) {};
Task(unsigned int s) : size(s), id(), data(NULL), result(NULL)
{
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize();
} ~Task()
{
cudaDeviceSynchronize();
cudaFree(data);
cudaFree(result);
cudaFree(vector);
} void allocate(const unsigned int s, const unsigned int unique_id)// 申请内存,初始化各成员数组
{
id = unique_id;
size = s;
cudaMallocManaged(&data, sizeof(T)*size*size);
cudaMallocManaged(&result, sizeof(T)*size);
cudaMallocManaged(&vector, sizeof(T)*size);
cudaDeviceSynchronize(); for (int i = ; i < size*size; i++)
data[i] = drand48();
for (int i = ; i < size; i++)
{
result[i] = .;
vector[i] = drand48();
}
}
}; #ifdef USE_PTHREADS// 封装 pthread 型的任务
// Arguments handed to each pthread worker `execute`: a contiguous slice of the
// global task list plus the shared stream / cuBLAS-handle arrays.
struct threadData_t
{
int tid;                    // worker index; device work uses stream[tid+1] / handle[tid+1]
Task<double> *TaskListPtr;  // first task of this worker's slice
cudaStream_t *streams;      // shared stream array (stream[0] is reserved for host-side tasks)
cublasHandle_t *handles;    // shared cuBLAS handle array, parallel to `streams`
int taskSize;               // number of tasks in this worker's slice
}; typedef struct threadData_t threadData;
#endif

// Host reference GEMV: result = (*alpha) * A * x + (*beta) * result,
// with A an m x n row-major matrix, x of length n, result of length m.
// (The blog notes the original iterated i over n and dropped alpha; both fixed.)
template <typename T>
void gemv(int m, int n, T *alpha, T *A, T *x, T *beta, T *result)
{
    for (int i = 0; i < m; i++)
    {
        result[i] *= *beta;                       // scale existing output first
        for (int j = 0; j < n; j++)
            result[i] += *alpha * A[i*n + j] * x[j];
    }
}
#ifdef USE_PTHREADS
// pthread worker: runs its slice of tasks. Small problems run on the host,
// larger ones on the device through cuBLAS, each worker on its own stream.
void * execute(void* inpArgs)
{
    threadData *dataPtr = (threadData *) inpArgs;
    cudaStream_t *stream = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++)
    {
        Task<double> &t = dataPtr->TaskListPtr[i];
        double alpha = 1.0;
        double beta = 0.0;
        if (t.size < 100)  // small task: run on the host, otherwise on the device
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
            // attach the managed buffers to the host so the CPU can use them
            // while the GPU keeps processing other streams (length 0 = whole allocation)
            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
            cudaStreamSynchronize(stream[0]);  // wait until the attachment takes effect
            gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
        }
        else
        {
            printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
            cublasSetStream(handle[tid+1], stream[tid+1]);  // stream[0] is reserved for host tasks
            // bind the buffers to this worker's stream so tasks schedule independently
            cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
            cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
            // NOTE(review): cublasDgemv is column-major while the host gemv above is
            // row-major, so device tasks effectively compute A^T * x — this matches the
            // original sample and is harmless for random demo data.
            cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha,
                        t.data, t.size, t.vector, 1, &beta, t.result, 1);
        }
    }
    return NULL;
}
#else
// OpenMP variant: execute a single task on either host or device depending on size.
// `tid` is the OpenMP thread number; device work uses stream[tid+1] / handle[tid+1].
template <typename T>
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    double alpha = 1.0;
    double beta = 0.0;
    if (t.size < 100)  // small task: run on the host, otherwise on the device
    {
        printf("\nTask [%2d], thread [%2d], size [%4d], on host", t.id, tid, t.size);
        // attach the managed buffers to the host (length 0 = whole allocation)
        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost);
        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost);
        cudaStreamSynchronize(stream[0]);  // wait until the attachment takes effect
        gemv(t.size, t.size, &alpha, t.data, t.vector, &beta, t.result);
    }
    else
    {
        // format string made consistent with the pthread variant ("size [%4d]")
        printf("\nTask [%2d], thread [%2d], size [%4d], on device", t.id, tid, t.size);
        cublasSetStream(handle[tid+1], stream[tid+1]);  // stream[0] is reserved for host tasks
        // bind the buffers to this thread's stream so tasks schedule independently
        cudaStreamAttachMemAsync(stream[tid+1], t.data, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.vector, 0, cudaMemAttachSingle);
        cudaStreamAttachMemAsync(stream[tid+1], t.result, 0, cudaMemAttachSingle);
        cublasDgemv(handle[tid+1], CUBLAS_OP_N, t.size, t.size, &alpha,
                    t.data, t.size, t.vector, 1, &beta, t.result, 1);
    }
}
#endif template <typename T> void initialise_tasks(std::vector< Task<T> > &TaskList)
{
for (unsigned int i = ; i < TaskList.size(); i++)
{
int size;
size = std::max((int)(drand48()*1000.0), );
TaskList[i].allocate(size, i);
}
} int main()
{
printf("\n\tStart.\n"); cudaDeviceProp device_prop;
cudaGetDeviceProperties(&device_prop, );
if (!device_prop.managedMemory)
{
printf("\n\tUnified Memory not supported\n");
getchar();
return ;
}
if (device_prop.computeMode == cudaComputeModeProhibited)// Device 为线程禁用模式
{
printf("\n\tComputeMode is cudaComputeModeProhibited\n");
getchar();
return ;
} srand48(time(NULL));
const int nthreads = ;
cudaStream_t *streams = new cudaStream_t[nthreads+];
cublasHandle_t *handles = new cublasHandle_t[nthreads+];
for (int i=; i<nthreads+; i++)
{
cudaStreamCreate(&streams[i]);
cublasCreate(&handles[i]);
} unsigned int N = ;
std::vector<Task<double> > TaskList(N);
initialise_tasks(TaskList);
cudaSetDevice(); #ifdef USE_PTHREADS
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
int temp = TaskList.size() / nthreads;
for (int i=; i < nthreads; i++)
{
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles; if (temp == ) // 任务数量比线程数少
{
InputToThreads[i].taskSize = ;
InputToThreads[i].TaskListPtr = &TaskList[];
}
else // 任务数量不少于线程数。任务尽量均分,多出的零头全部塞给最后一个线程
{
if (i == nthreads - )
{
InputToThreads[i].taskSize = temp + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i*temp + (TaskList.size() % nthreads)];
}
else
{
InputToThreads[i].taskSize = temp;
InputToThreads[i].TaskListPtr = &TaskList[i*temp];
}
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i=; i < nthreads; i++)
pthread_join(threads[i], NULL);
#else
omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
for (int i=; i<TaskList.size(); i++)
{
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
#endif
cudaDeviceSynchronize(); // 清理工作
for (int i=; i<nthreads+; i++)
{
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
std::vector< Task<double> >().swap(TaskList);
printf("\n\tFinish.\n");
getchar();
return ;
}

▶ 输出结果:OpenMP

    Start.

Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [ ], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Task [], thread [ ], size[ ], on device
Finish.

▶ 输出结果:pthreads

    Start.

Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on host
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [ ], thread [ ], size [ ], on host
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Task [], thread [ ], size [ ], on device
Finish.

▶ 涨姿势:

● 使用 C++ 结构体完成了类似类的方法。即在结构体中定义构造函数、析构函数及其他方法。

● 使用了 cuBLAS 库,注意句柄的使用和库函数的调用。

● 用到的申请内存的函数

 // driver_types.h
#define cudaMemAttachGlobal 0x01 // 内存可被任意流访问
#define cudaMemAttachHost 0x02 // 内存仅主机可访问(设备流不可访问)
#define cudaMemAttachSingle 0x04 // 内存仅绑定的单个流可访问 // cuda_runtime.h
template<class T> static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, T *devPtr, size_t length = 0, unsigned int flags = cudaMemAttachSingle)
{
return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
} // cuda_runtime_api.h
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));

0_Simple__UnifiedMemoryStreams的更多相关文章

随机推荐

  1. .net core grpc 实现通信(一)

    现在系统都服务化,.net core 实现服务化的方式有很多,我们通过grpc实现客户端.服务端通信. grpc(https://grpc.io/)是google发布的一个开源.高性能.通用RPC(R ...

  2. Redis安装和主要功能简介

    Redis安装和主要功能简介   Redis(https://redis.io/), 是一个内存中的数据结构存储系统,它可以用作数据库.缓存和消息中间件. 安装Redis 我很少在开发机中直接装各种数 ...

  3. JQuery实时监控文本框字符变化

    $(function(){ $('input[name="addr"]').on('input propertychange', function() { if ($('input ...

  4. YUI JS压缩Ant脚本

    <?xml version="1.0" encoding="UTF-8"?><!-- 对指定目录下的所有js进行压缩,放入指定位置 --> ...

  5. NoSQLUnit

    NoSQLUnit Core Overview Unit testing is a method by which the smallest testable part of an applicati ...

  6. Kettle入门教程

    最近做的项目用到了ETL工具Kettle,这个工具相当好用,可以将各种类型数据作为数据流,经过处理后再生成各种类型的数据.正如其名“水壶”,将各个地方的水倒进水壶里,再用水壶倒入不同的容器.不过一来初 ...

  7. 使用OPRT库来实现局域网视频实时传输

    转载,侵删 4.代码设计 目的:使用OPRT库来实现局域网视频实时传输 参考samle_venc.c进行ortp开发 4.1.程序流程如下 step1:定义变量,VPSS,VENC,零散变量 step ...

  8. 【转】每天一个linux命令(48):watch命令

    原文网址:http://www.cnblogs.com/peida/archive/2012/12/31/2840241.html watch是一个非常实用的命令,基本所有的Linux发行版都带有这个 ...

  9. oracle常用数据类型说明

    类型 含义 存储描述 备注 CHAR 固定长度字符串 最大长度2000bytes VARCHAR2 可变长度的字符串, 最大长度4000bytes 可做索引的最大长度749 NCHAR 根据字符集而定 ...

  10. JS时间(日期)比较或相减(暂时停用)

    注:此文均来自网上,可行,只供参考 //JAVASCRIPT中 日期相减很麻烦 ,现在有现成的实现方法,拷贝过去就可以用了,方便 //调用该方法(主方法) function dateDiff(date ...