0_Simple__simpleP2P

使用 P2P 特性在 GPU 之间传输、读写数据。

▶ 源代码。包括 P2P 使用前的各项检查，设备之间的数据互拷，主机和设备之间数据传输和相互访问。

 #include <stdlib.h>

 #include <stdio.h>

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_cuda.h>

 #include <helper_functions.h>

 #define MAX_GPU_COUNT 64

 // Writes twice the value of each src element into the matching dst element.
 // Each thread handles exactly one element, selected by its flat 1D global
 // index. There is no bounds check, so the launch must supply exactly as many
 // threads (gridDim.x * blockDim.x) as there are elements in src and dst.
 __global__ void SimpleKernel(float *src, float *dst)
 {
     const int tid = threadIdx.x + blockDim.x * blockIdx.x;
     const float value = src[tid];
     dst[tid] = 2.0f * value;
 }

 // Reports whether this sample treats the given device as P2P-capable.
 //
 // pProp: device properties previously filled in by cudaGetDeviceProperties.
 // Returns:
 //   - Windows builds (_WIN32): true when the device's tccDriver flag is set.
 //   - other builds: true when the major compute capability is at least 2.
 inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
 {
 #ifdef _WIN32
     return pProp->tccDriver != 0;
 #else
     // The sample requires compute capability 2.0 or newer.
     return pProp->major >= 2;
 #endif
 }

 // Entry point of the P2P sample.
 //
 // Steps:
 //   1. Check the build is 64-bit and that at least two suitable GPUs exist.
 //   2. Pick the first pair of GPUs for which cudaDeviceCanAccessPeer reports
 //      access, and enable peer access in both directions.
 //   3. Time 100 alternating device-to-device copies and print the bandwidth.
 //   4. Run SimpleKernel on each GPU with its input buffer on the other GPU,
 //      then verify the doubled-twice data on the host.
 //   5. Disable peer access and release all resources.
 //
 // Returns EXIT_WAIVED when the environment cannot run the test,
 // EXIT_FAILURE when verification finds mismatches, EXIT_SUCCESS otherwise.
 // CUDA API failures abort through checkCudaErrors (helper_cuda.h).
 int main(int argc, char **argv)
 {
     (void)argc;
     (void)argv;

     printf("\n\tStarting\n");

     // The sample is only supported on 64-bit targets (8-byte pointers).
     if (sizeof(void *) != 8)
     {
         printf("\n\tError for program only supported with 64-bit OS and 64-bit target\n");
         return EXIT_WAIVED;
     }

     // Count devices; a failing query is treated as "no devices" so the
     // sample is waived instead of reading an uninitialized count.
     int gpu_n = 0;
     if (cudaGetDeviceCount(&gpu_n) != cudaSuccess)
         gpu_n = 0;
     printf("\n\tDevice count: %d\n", gpu_n);
     if (gpu_n < 2)
     {
         printf("\n\tError for two or more GPUs with SM2.0 required\n");
         return EXIT_WAIVED;
     }
     // Never index past the fixed-size property / id tables below.
     if (gpu_n > MAX_GPU_COUNT)
         gpu_n = MAX_GPU_COUNT;

     // Collect the ids of all devices with compute capability >= 2.0.
     cudaDeviceProp prop[MAX_GPU_COUNT];
     int gpuid[MAX_GPU_COUNT], gpu_count = 0;
     printf("\n\tShow device\n"); // list every device
     for (int i = 0; i < gpu_n; i++)
     {
         checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
         if ((prop[i].major >= 2)
 #ifdef _WIN32
             && prop[i].tccDriver // Windows additionally requires the TCC driver
 #endif
            )
             gpuid[gpu_count++] = i;
         printf("\n\tGPU%d = \"%15s\" ---- %s\n", i, prop[i].name, (IsGPUCapableP2P(&prop[i]) ? "YES" : "NO"));
     }
     if (gpu_count < 2)
     {
         printf("\n\tError for two or more GPUs with SM2.0 required\n");
 #ifdef _WIN32
         printf("\nOr for TCC driver required\n");
 #endif
         cudaSetDevice(0);
         return EXIT_WAIVED;
     }

     // Search for the first pair of candidate devices with peer access.
     int can_access_peer = 0;
     int p2pCapableGPUs[2] = { -1, -1 };
     printf("\n\tShow combination of devices with P2P\n"); // list every P2P-capable pair
     for (int i = 0; i < gpu_count - 1; i++)
     {
         for (int j = i + 1; j < gpu_count; j++)
         {
             checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
             if (can_access_peer)
             {
                 // The format string has five conversions; the last one reports the access result.
                 printf("\n\tGPU%d (%s) <--> GPU%d (%s) : %s\n", gpuid[i], prop[gpuid[i]].name, gpuid[j], prop[gpuid[j]].name, "YES");
                 if (p2pCapableGPUs[0] == -1)
                 {
                     p2pCapableGPUs[0] = gpuid[i];
                     p2pCapableGPUs[1] = gpuid[j];
                 }
             }
         }
     }
     if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1)
     {
         printf("\n\tError for P2P not available among GPUs\n");
         for (int i = 0; i < gpu_count; i++)
             cudaSetDevice(gpuid[i]);
         return EXIT_WAIVED;
     }

     // From here on gpuid[0] / gpuid[1] are the two devices under test.
     gpuid[0] = p2pCapableGPUs[0];
     gpuid[1] = p2pCapableGPUs[1];
     printf("\n\tEnabling P2P between GPU%d and GPU%d\n", gpuid[0], gpuid[1]);

     // Enable peer access in both directions (the flags argument is 0).
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
     checkCudaErrors(cudaSetDevice(gpuid[1]));
     checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

     // Both devices must support Unified Virtual Addressing (UVA), because
     // the copies below rely on cudaMemcpyDefault. The braces matter: the
     // early return must only happen when the check fails.
     if (!(prop[gpuid[0]].unifiedAddressing && prop[gpuid[1]].unifiedAddressing))
     {
         printf("\n\tError for GPU not support UVA\n");
         return EXIT_WAIVED;
     }

     // Allocate one buffer per device plus a pinned host buffer.
     const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
     const int num_elems = int(buf_size / sizeof(float));
     printf("\n\tAllocating buffers %iMB\n", int(buf_size / 1024 / 1024));
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     float *g0;
     checkCudaErrors(cudaMalloc(&g0, buf_size));
     checkCudaErrors(cudaSetDevice(gpuid[1]));
     float *g1;
     checkCudaErrors(cudaMalloc(&g1, buf_size));
     float *h0;
     checkCudaErrors(cudaMallocHost(&h0, buf_size));

     // Time repeated device-to-device copies with blocking events.
     cudaEvent_t start_event, stop_event;
     const int eventflags = cudaEventBlockingSync;
     const int copy_repeats = 100;
     float time_memcpy;
     checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
     checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
     checkCudaErrors(cudaEventRecord(start_event, 0));
     for (int i = 0; i < copy_repeats; i++)
     {
         // Ping-pong between the two GPUs. With UVA, cudaMemcpyDefault
         // infers the copy direction from the pointer values.
         if (i % 2 == 0)
             checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
         else
             checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
     }
     checkCudaErrors(cudaEventRecord(stop_event, 0));
     checkCudaErrors(cudaEventSynchronize(stop_event));
     checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
     // time_memcpy is in milliseconds; report GB/s over all copies.
     printf("\n\tcudaMemcpy: %.2fGB/s\n", (float(copy_repeats) * buf_size) / (1024.0f * 1024.0f * 1024.0f * (time_memcpy / 1000.0f)));

     // Fill the host buffer with a repeating 0..4095 pattern and upload it to GPU0.
     for (int i = 0; i < num_elems; i++)
         h0[i] = float(i % 4096);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

     // One thread per element; num_elems is an exact multiple of the block
     // size, which SimpleKernel needs because it has no bounds check.
     const dim3 threads(512, 1);
     const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

     // Kernel on GPU1: read GPU0's global memory, write GPU1's global memory.
     printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[1], gpuid[0], gpuid[1]);
     checkCudaErrors(cudaSetDevice(gpuid[1]));
     SimpleKernel<<<blocks, threads>>>(g0, g1);
     checkCudaErrors(cudaGetLastError());      // launch-configuration errors
     checkCudaErrors(cudaDeviceSynchronize()); // execution errors

     // Kernel on GPU0: read GPU1's global memory, write GPU0's global memory.
     printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[0], gpuid[1], gpuid[0]);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     SimpleKernel<<<blocks, threads>>>(g1, g0);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaDeviceSynchronize());

     // Verify: every element was doubled twice.
     checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));
     int error_count = 0;
     for (int i = 0; i < num_elems; i++)
     {
         const float expected = float(i % 4096) * 2.0f * 2.0f;
         if (h0[i] != expected)
         {
             printf("\n\tResult error at %i: gpu[i] = %f, cpu[i] = %f\n", i, h0[i], expected);
             // Stop reporting after a handful of mismatches.
             if (error_count++ > 10)
                 break;
         }
     }

     // Disable peer access: each device drops its access to the other one.
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
     checkCudaErrors(cudaSetDevice(gpuid[1]));
     checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));

     // Release buffers and events.
     checkCudaErrors(cudaFreeHost(h0));
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     checkCudaErrors(cudaFree(g0));
     checkCudaErrors(cudaSetDevice(gpuid[1]));
     checkCudaErrors(cudaFree(g1));
     checkCudaErrors(cudaEventDestroy(start_event));
     checkCudaErrors(cudaEventDestroy(stop_event));
     for (int i = 0; i < gpu_n; i++)
         cudaSetDevice(i);

     printf("\n\t%s!\n", error_count ? "Test failed" : "Test passed");
     getchar(); // keep the console window open until a key is pressed
     return error_count ? EXIT_FAILURE : EXIT_SUCCESS;
 }

▶ 输出结果

只有一台设备，暂无法进行测试

▶ 涨姿势：

● P2P 要求：至少两台计算能力不低于 2.0 的设备，且两台设备都支持统一虚拟寻址（Unified Virtual Addressing，UVA）特性；程序须为 64 位；CUDA 版本不低于 4.0；Windows 下设备需使用 TCC（Tesla Compute Cluster）驱动。

● 使用P2P的关键步骤

 // Check whether device gpuid[i] can directly access memory on device gpuid[j]
 int can_access_peer;
 cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);

 // Enable P2P in both directions: make one device current, then grant it
 // access to the other device (the flags argument is passed as 0)
 cudaSetDevice(gpuid[i]);
 cudaDeviceEnablePeerAccess(gpuid[j], 0);
 cudaSetDevice(gpuid[j]);
 cudaDeviceEnablePeerAccess(gpuid[i], 0);

 // Transfer data between the devices
 cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

 // Disable P2P: each device drops its access to the *other* device
 cudaSetDevice(gpuid[i]);
 cudaDeviceDisablePeerAccess(gpuid[j]);
 cudaSetDevice(gpuid[j]);
 cudaDeviceDisablePeerAccess(gpuid[i]);

 // cuda_runtime_api.h
 // Declarations of the three peer-access runtime calls used by this sample.

 // Writes into *canAccessPeer whether `device` can access `peerDevice`.
 extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);

 // Enables access to `peerDevice`; the sample calls it after cudaSetDevice
 // on the accessing device.
 extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);

 // Disables access to `peerDevice` that was previously enabled.
 extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);

● 其他代码中的定义

 // helper_string.h
 // Exit status the sample returns when the environment cannot run the test
 // (for example fewer than two suitable GPUs, or no P2P/UVA support).

 #define EXIT_WAIVED 2

0_Simple__simpleP2P的更多相关文章

随机推荐

Jquery中.attr与.prop的区别
☆ http://www.jb51.net/article/114876.htm http://www.365mini.com/page/jquery-attr-vs-prop.htm https:/ ...
CH3602 Counting Swaps
题意 3602 Counting Swaps 0x30「数学知识」例题背景 https://ipsc.ksp.sk/2016/real/problems/c.html Just like yeste ...
FastAdmin 开发时对数据库进行版本管理（非 think-migration）
因为开必项目,暂时还不没用 think-migration,先用脚本处理. 在导出 SQL 时将相关字段数据还原,比如 admin logitime updatetime token. 把 admi ...
asm数据文件迁移(asm–>asm)
rman迁移操作 [oracle@localhost oradata]$ rman target / Recovery Manager: Release 10.2.0.3.0 - Producti ...
http 301 和 302的区别
301 永久重定向 301 重定向是当用户或搜索引擎向网站服务器发出浏览请求时,服务器返回的HTTP数据流中头信息(header)中的状态码的一种,表示本网页永久性转移到另一个地址. 301 重定向是 ...
Spring 集成开发工具（STS）安装及配置
安装 spring 集成开发工具,下载地址:https://spring.io/tools 下载后,解压,双击 STS ,运行. 如果提示: 去oracle的网站上下载了1.8版本的jdk,下载地址如 ...
Apache Shiro在web开发安全框架中的应用
前阶段就hadoop的分享了一些内容,希望对新手入门的朋友有点帮助吧!对于hadoop新手入门的,还是比较推荐大快搜索的DKHadoop发行版,三节点标准版还是值得拥有的(三节点的标准版是可以免费下载 ...
ASP.NET 实现伪静态网页方法
方法一:利用Httphandler实现URL重写(伪URL及伪静态) 我们有时候会见到这样的地址:“http://www.huoho.com/show-12-34.html”,你或许认为在站点服务器根 ...
大数据工具篇之flume1.4-安装部署指南
一.引言 flume-ng是一个分布式.高可靠和高效的日志收集系统,flume-ng是flume的新版本的意思,其中“ng”意为new generate(新一代),目前来说,flume-ng 1.4是 ...
python 可视化二维坐标标注等等
基本画图操作: import matplotlib.pyplot as plt import numpy as np x = np.linspace(-3,3,50) y1 = 2*x+1 y2 = ...

0_Simple__simpleP2P

0_Simple__simpleP2P的更多相关文章

随机推荐

热门专题