cuda并行计算的几种模式
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h> #define MAX 120
#define MIN 0 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
float* etime);
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
float* etime, int type);
__global__ void addKernel(int *c, const int *a, const int *b) {
int i = blockIdx.x;
c[i] = a[i] + b[i];
} __global__ void addKernelThread(int *c, const int *a, const int *b) {
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
int main() {
const int arraySize = ;
srand((unsigned) time(NULL));
int a[arraySize] = { , , , , };
int b[arraySize] = { , , , , }; for (int i = ; i < arraySize; i++) {
a[i] = rand() % (MAX + - MIN) + MIN;
b[i] = rand() % (MAX + - MIN) + MIN;
}
int c[arraySize] = { };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = ;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for (int i = ; i < num; i++) {
cudaGetDeviceProperties(&prop, i);
} float time;
cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
printf("Elasped time of stream is : %f \n", time);
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
cudaStatus = addWithCuda(c, a, b, arraySize, &time, );
printf("Elasped time of Block is : %f \n", time);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]); cudaStatus = addWithCuda(c, a, b, arraySize, &time, );
printf("Elasped time of thread is : %f \n", time);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]); cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
printf("Elasped time of stream is : %f \n", time);
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
// cudaThreadExit must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaThreadExit();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaThreadExit failed!");
return ;
}
return ;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
float* etime) {
int *dev_a = ;
int *dev_b = ;
int *dev_c = ;
clock_t start, stop;
float time;
cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStream_t stream[];
for (int i = ; i < ; i++) {
cudaStreamCreate(&stream[i]); //创建流
}
// Launch a kernel on the GPU with one thread for each element.
for (int i = ; i < ; i++) {
addKernel<<<, , , stream[i]>>>(dev_c + i, dev_a + i, dev_b + i); //执行流
}
start = clock();
cudaDeviceSynchronize();
stop = clock();
time = (float) (stop - start) / CLOCKS_PER_SEC;
*etime = time;
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaThreadSynchronize returned error code %d after launching addKernel!\n",
cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error: for (int i = ; i < ; i++) {
cudaStreamDestroy(stream[i]); //销毁流
}
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
float * etime, int type) {
int *dev_a = ;
int *dev_b = ;
int *dev_c = ;
clock_t start, stop;
float time;
cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
} if (type == ) {
start = clock();
addKernel<<<size, >>>(dev_c, dev_a, dev_b);
} else {
start = clock();
addKernelThread<<<, size>>>(dev_c, dev_a, dev_b);
}
stop = clock();
time = (float) (stop - start) / CLOCKS_PER_SEC;
*etime = time;
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaThreadSynchronize returned error code %d after launching addKernel!\n",
cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error: cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
如上文的实现程序,使用了thread并行,block并行,stream并行三种,使用三种方法法进行了五次计算,发现stream第一次计算时会出错,调用的子程序没有变化,没有搞懂?
Elasped time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elasped time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
cuda并行计算的几种模式的更多相关文章
- 对称加密和分组加密中的四种模式(ECB、CBC、CFB、OFB)
一. AES对称加密: AES加密 分组 二. 分组密码的填充 分组密码的填充 e.g.: PKCS#5填充方式 三. 流密码: 四. 分组密码加密中的四种模式: 3.1 ECB模式 优点: 1. ...
- win7 64位下自行编译OpenCV2.4.10+CUDA toolkit 5.5的整个过程以及需要注意的问题(opencv+cuda并行计算元素的使用)
首先说明的是,这个帖子是成功的编译了dll,但是这个dll使用的时候还是很容易出现各种问题的. 发现错误可能是由于系统安装了太多版本的opencv,环境变量的设置混乱,造成dll版本加载 ...
- Spark On Yarn的两种模式yarn-cluster和yarn-client深度剖析
Spark On Yarn的优势 每个Spark executor作为一个YARN容器(container)运行.Spark可以使得多个Tasks在同一个容器(container)里面运行 1. Sp ...
- AES加密的四种模式详解
对称加密和分组加密中的四种模式(ECB.CBC.CFB.OFB) 一. AES对称加密: A ...
- Hadoop hadoop的介绍和几种模式
Hadoop简介 Hadoop软件库是一个开源框架,允许使用简单的编程模型跨计算机集群分布式处理大型数据集.它旨在从单个服务器扩展到数千台计算机,每台计算机都提供本地计算和存储.库本身不是依靠硬件来提 ...
- hadoop(1)---hadoop的介绍和几种模式。
一.什么是hadoop? Hadoop软件库是一个开源框架,允许使用简单的编程模型跨计算机集群分布式处理大型数据集.它旨在从单个服务器扩展到数千台计算机,每台计算机都提供本地计算和存储.库本身不是依靠 ...
- javascript 创建对象的7种模式
使用字面量方式创建一个 student 对象: var student = function (){ name : "redjoy", age : 21, sex: women, ...
- javascript面向对象系列第二篇——创建对象的5种模式
× 目录 [1]字面量 [2]工厂模式 [3]构造函数[4]原型模式[5]组合模式 前面的话 如何创建对象,或者说如何更优雅的创建对象,一直是一个津津乐道的话题.本文将从最简单的创建对象的方式入手,逐 ...
- javascript创建对象的几种模式
在js中有几种模式可以创建对象,通过对象操作所包含的属性与方法. 一般来说,构造函数名称的第一个字母为大写字母,非构造函数名称的第一个字母为小写字母,当然,构造函数与一般函数唯一的区别只是调用的方式不 ...
随机推荐
- 优秀工具推荐:两款很棒的 HTML5 游戏开发工具
HTML5 众多强大特性让我们不需要多么高深技术就能创建好玩的网页游戏,同时证明了开放的 Web 技术能与任何其他在游戏开发中使用的技术竞争.正如标题所说,这篇文章推荐的几款很棒 HTML5 游戏开发 ...
- SharePoint 2013 列表关于大数据的测试<二>
1.给测试列表添加查阅项字段,100个,代码如下: 2.插入测试数据的方法,注意查阅项字段的格式,代码如下: 3.插入10w条数据,时间花费如下(不建议List[LISTNAME].Items.Add ...
- Atitit jOrgChart的使用 组织架构图css html
Atitit jOrgChart的使用 组织架构图css html 1. 项目要做组织架构图,要把它做成自上而下的树形结构,于是决定1 2. Html导入 以来的css js1 2.1. 数据来源 ...
- svn(http)
对于有些公司的运维 他确实很垃圾 所以 你一定要谨记这一点!!!
- Xcode cannot launch because the device is locked.
When you plug in your iPhone, it will ask you to trust the computer. If you already trust and unlock ...
- 【Android自定义控件】支持多层嵌套RadioButton的RadioGroup
前言 非常喜欢用RadioButton+RadioGroup做Tabs,能自动处理选中等效果,但是自带的RadioGroup不支持嵌套RadioButton(从源码可看出仅仅是判断子控件是不是Radi ...
- PHPMailer不能发送邮件
PHPMailer不能连接SMTP服务器,和修改SMTP大小写没有关系 (2011-10-22 12:17:35) 转载▼ 标签: php phpmailer 杂谈 分类: 默认分类 PHPmaile ...
- css浮动
一.浮动介绍 历史: 浮动属性产生之初是为了实现“文字环绕”的效果,让文字环绕图片在网页实现类似word中“图文混排”. 定位方式: 浮动让元素脱离正常流,向父容器的左边或右边移动直到碰到包含容器的边 ...
- WebMatrix之WebMatrix.Data
WebMatrix之WebMatrix.Data WebMatrix数据访问系列目次: WebMatrix之数据访问 WebMatrix之WebMatrix.Data WebMatrix之WebMat ...
- yii2集成富文本编辑器redactor
作者:白狼 出处:http://www.manks.top/article/yii2_redactor本文版权归作者,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保 ...