0_Simple__simpleMultiCopy
利用 CUDA 的 Overlap 特性同时进行运算和数据拷贝来实现加速。
▶ 源代码。使用 4 个流一共执行 10 次 “数据上传 - 内核计算 - 数据下载” 过程,记录使用时间。
// CUDA sample "simpleMultiCopy": overlap kernel execution with
// host<->device copies by cycling work over several CUDA streams.
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>

#define STREAM_COUNT 4   // number of CUDA streams used for overlapping
#define NREPS 10         // upload-compute-download cycles per measurement
#define INNER_REPS 5     // in-kernel repetitions (inflates compute time)

int N = 1 << 22;         // element count; may be scaled down for small GPUs
int memsize;             // N * sizeof(int), set in main()

int *h_data_source;              // pageable host source buffer
int *h_data_sink;                // pageable host sink buffer
int *h_data_in[STREAM_COUNT];    // pinned host upload buffer, one per stream
int *h_data_out[STREAM_COUNT];   // pinned host download buffer, one per stream
int *d_data_in[STREAM_COUNT];    // device input buffer, one per stream
int *d_data_out[STREAM_COUNT];   // device output buffer, one per stream

dim3 grid;
// NOTE: the scraped original read `dim3 block();`, which declares a function
// (most vexing parse) instead of an object; the sample uses 512 threads/block.
dim3 block(512);

cudaEvent_t cycleDone[STREAM_COUNT], start, stop;
cudaStream_t stream[STREAM_COUNT];

// Increment kernel: writes g_out[idx] = g_in[idx] + 1 for idx < size.
// The store is repeated INNER_REPS times so the kernel runs long enough to
// overlap with the async memcpys — presumably the repeated global load/store
// is not elided by the compiler (TODO confirm with SASS if it matters).
__global__ void incKernel(int *g_out, int *g_in, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)  // guard the grid tail
    {
        for (int i = 0; i < INNER_REPS; ++i)
            g_out[idx] = g_in[idx] + 1;
    }
}

// Run NREPS upload-kernel-download cycles, distributed round-robin over
// `streams_used` streams; returns the elapsed wall time in milliseconds.
float processWithStreams(int streams_used)
{
    cudaEventRecord(start, 0);

    for (int i = 0, current_stream = 0; i < NREPS; ++i)
    {
        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
        // Consume the downloaded data and refresh the upload data, so every
        // cycle moves "new" bytes (simulates real host-side I/O).
        memcpy(h_data_sink, h_data_out[current_stream], memsize);
        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

        // Make sure the previous cycle that used next_stream has finished
        // before its upload buffer is overwritten below.
        cudaEventSynchronize(cycleDone[next_stream]);

        // Kernel on the current stream ...
        incKernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_out[current_stream], d_data_in[current_stream], N);

        // ... overlapped with the next stream's upload ...
        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream],
                        memsize, cudaMemcpyHostToDevice, stream[next_stream]);

        // ... and with the current stream's download.
        cudaMemcpyAsync(h_data_out[current_stream], d_data_out[current_stream],
                        memsize, cudaMemcpyDeviceToHost, stream[current_stream]);

        cudaEventRecord(cycleDone[current_stream], stream[current_stream]);
        current_stream = next_stream;
    }

    cudaEventRecord(stop, 0);
    cudaDeviceSynchronize();

    float time;
    cudaEventElapsedTime(&time, start, stop);
    return time;
}

// Check that every downloaded element equals 1 (source data is 0, kernel adds 1).
bool test()
{
    bool passed = true;

    for (int j = 0; j < STREAM_COUNT; ++j)
    {
        for (int i = 0; i < N; ++i)
            passed &= (h_data_out[j][i] == 1);
    }

    return passed;
}

int main(int argc, char *argv[])
{
    printf("\n\tStart.\n");

    // Pick a device: honor "-device=N" on the command line, otherwise
    // use the highest-GFLOPS device (helpers from helper_cuda.h).
    int cuda_device = 0;
    cudaDeviceProp deviceProp;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        if ((cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=")) < 0)
        {
            printf("Invalid command line parameters\n");
            exit(EXIT_FAILURE);
        }
        else
        {
            printf("cuda_device = %d\n", cuda_device);
            if ((cuda_device = gpuDeviceInit(cuda_device)) < 0)
            {
                printf("No CUDA Capable devices found, exiting...\n");
                exit(EXIT_SUCCESS);
            }
        }
    }
    else
    {
        cuda_device = gpuGetMaxGflopsDeviceId();
        cudaSetDevice(cuda_device);
        cudaGetDeviceProperties(&deviceProp, cuda_device);
        printf("\n\tDevice [%d]: %s, computation capability %d.%d, ",
               cuda_device, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    cudaGetDeviceProperties(&deviceProp, cuda_device);
    printf("%d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.multiProcessorCount,
           // cores per SM for this compute capability, from helper_cuda.h
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("\n\tRelevant properties of this CUDA device.\n");
    printf("\t(%c) Execute several GPU kernels simultaneously\n",
           deviceProp.major >= 2 ? 'Y' : 'N');
    printf("\t(%c) Overlap one CPU<->GPU data transfer with GPU kernel execution\n",
           deviceProp.deviceOverlap ? 'Y' : 'N');
    printf("\t(%c) Overlap two CPU<->GPU data transfers with GPU kernel execution\n",
           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? 'Y' : 'N');

    // Reduce the workload if the device has fewer than 32 CUDA cores.
    float scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
                      (float)deviceProp.multiProcessorCount)), 1.0f);
    N = (int)((float)N / scale_factor);
    printf("\n\tscale_factor = %.2f\n\tarray_size = %d\n", 1.0f / scale_factor, N);

    // Launch configuration: 1D data over a 2D grid (grid.x is capped at 65535
    // on pre-CC3.0 devices, so the block count is folded into grid.y).
    memsize = N * sizeof(int);
    int thread_blocks = N / block.x;
    grid.x = thread_blocks % 65535;
    grid.y = (thread_blocks / 65535 + 1);

    h_data_source = (int *)malloc(memsize);
    h_data_sink = (int *)malloc(memsize);

    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        // Pinned host buffers are required for truly asynchronous memcpys.
        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
        cudaMalloc(&d_data_in[i], memsize);
        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault);
        cudaMalloc(&d_data_out[i], memsize);

        cudaStreamCreate(&stream[i]);
        cudaEventCreate(&cycleDone[i]);
        // Record once so the first cudaEventSynchronize() in
        // processWithStreams() returns immediately.
        cudaEventRecord(cycleDone[i], stream[i]);
    }
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Initialize h_data_source and h_data_in.
    for (int i = 0; i < N; ++i)
        h_data_source[i] = 0;
    for (int i = 0; i < STREAM_COUNT; ++i)
        memcpy(h_data_in[i], h_data_source, memsize);

    // Warm-up launch.
    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N);

    // Time one H2D copy, one D2H copy and one kernel launch in isolation.
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float memcpy_h2d_time;
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    cudaEventRecord(start, 0);
    cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float memcpy_d2h_time;
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

    cudaEventRecord(start, 0);
    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);

    // bytes * 1e-6 / ms == GB/s; the kernel moves memsize in + memsize out
    // per inner repetition, hence the factor INNER_REPS * 2.
    printf("\n\tMeasured timings (throughput):\n");
    printf("\tMemcpy host to device:\t%f ms (%f GB/s)\n",
           memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
    printf("\tMemcpy device to host:\t%f ms (%f GB/s)\n",
           memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
    printf("\tKernel: \t%f ms (%f GB/s)\n",
           kernel_time, (INNER_REPS * memsize * 2e-6) / kernel_time);

    printf("\n\tTheoretical limits for speedup gained from overlapped data transfers:\n");
    printf("\tNo overlap (transfer-kernel-transfer):\t%f ms \n",
           memcpy_h2d_time + memcpy_d2h_time + kernel_time);
    printf("\tOverlap one transfer: \t%f ms\n",
           max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
    printf("\tOverlap both data transfers: \t%f ms\n",
           max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

    // Measure serialized (1 stream) vs. overlapped (STREAM_COUNT streams) runs.
    float serial_time = processWithStreams(1);
    float overlap_time = processWithStreams(STREAM_COUNT);

    printf("\n\tAverage measured timings over %d repetitions:\n", NREPS);
    printf("\tAvg. time serialized: \t%f ms (%f GB/s)\n",
           serial_time / NREPS, (NREPS * (memsize * 2e-6)) / serial_time);
    printf("\tAvg. time using %d streams:\t%f ms (%f GB/s)\n",
           STREAM_COUNT, overlap_time / NREPS, (NREPS * (memsize * 2e-6)) / overlap_time);
    printf("\tAvg. speedup gained: \t%f ms\n", (serial_time - overlap_time) / NREPS);

    printf("\n\tResult test: %s.\n", test() ? "Passed" : "Failed");

    // Cleanup.
    free(h_data_source);
    free(h_data_sink);
    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);
        cudaFreeHost(h_data_out[i]);
        cudaFree(d_data_out[i]);
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    getchar();
    return 0;
}
▶ 输出结果
Start. Device []: GeForce GTX , computation capability 6.1, MP(s) x (Cores/MP) = (Cores) Relevant properties of this CUDA device.
(Y) Execute several GPU kernels simultaneously
(Y) Overlap one CPU<->GPU data transfer with GPU kernel execution
(Y) Overlap two CPU<->GPU data transfers with GPU kernel execution scale_factor = 1.00
array_size = 4194304 Measured timings (throughput):
Memcpy host to device: 1.276192 ms (13.146311 GB/s)
Memcpy device to host: 1.279008 ms (13.117366 GB/s)
Kernel: 1.312768 ms (127.800314 GB/s) Theoretical limits for speedup gained from overlapped data transfers:
No overlap (transfer-kernel-transfer): 3.867968 ms
Overlap one transfer: 2.555200 ms
Overlap both data transfers: 1.312768 ms Average measured timings over 10 repetitions:
Avg. time serialized: 3.992167 ms (8.405068 GB/s)
Avg. time using 4 streams: 1.896141 ms (17.696171 GB/s)
Avg. speedup gained: 2.096026 ms Result test: Passed.
▶ 涨姿势
● 没有
0_Simple__simpleMultiCopy的更多相关文章
随机推荐
- SVN命令行使用总结
1.上传项目到SVN服务器上svn import project_dir(本地项目全路径) http://192.168.1.242:8080/svn/IOS/Ben/remote_dir(svn项目 ...
- LeetCode-Microsoft-Add Two Numbers II
You are given two non-empty linked lists representing two non-negative integers. The most significan ...
- V4L2驱动内核文档翻译(一)
随着一些视频或者图像硬件的复杂化,V4L2驱动也越来越趋于复杂.许多硬件有多个IC,在/dev下生成多个video设备或者其他的诸如,DVB,ALSA,FB,I2C ,IR等等非V4L2的设备.所以, ...
- 一张方便的graphql schema 语言手册
参考资料 https://github.com/sogko/graphql-schema-language-cheat-sheet
- stenciljs 学习十一 pwa 支持
stenciljs 对于pwa 的支持是自动注入的,我们只需要简单的配置,stenciljs使用workbox 配置 默认配置 { skipWaiting: true, clientsClaim: t ...
- cacheAsBitmap位图缓存
使用cacheAsBitmap将缓存显示对象的内部位图表示形式. 此缓存可以提高包含复杂矢量内容的显示对象的性能.此方法适合运用于较多的图片或文字移动,不过也不能太随意乱用,有利必有弊,使用cache ...
- CAM 查看里先选哪些层才能方便查看
CAM 检查 Gerber 时选 Layer 时有先后次序,才以看清楚是否有冲突. 比如检查 TOP 层时顺序应该是 MT ST L1 BOT 层检查顺序 MB SB L2/L4
- innotop监控mysql
InnoTop 是一个系统活动报告,类似于Linux性能工具,它与Linux的top命令相仿,并参考mytop工具而设计. 它专门用后监控InnoDB性能和MySQL服务器.主要用于监控事务,死锁,外 ...
- AWS ECU SSH无法连接问题处理
AWS ECU SSH无法连接问题处理,因同事误操作导致/var/empty/sshd目录权限为771,需要修改为711,因AWS只有一台实例,所以需要通过建立临时实例来挂载“卷”来修改/var/ ...
- Java堆外内存之三:堆外内存回收方法
一.JVM内存的分配及垃圾回收 对于JVM的内存规则,应该是老生常谈的东西了,这里我就简单的说下: 新生代:一般来说新创建的对象都分配在这里. 年老代:经过几次垃圾回收,新生代的对象就会放在年老代里面 ...