#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h> #define MAX 120
#define MIN 0 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
float* etime);
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
float* etime, int type);
__global__ void addKernel(int *c, const int *a, const int *b) {
int i = blockIdx.x;
c[i] = a[i] + b[i];
} __global__ void addKernelThread(int *c, const int *a, const int *b) {
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
int main() {
const int arraySize = ;
srand((unsigned) time(NULL));
int a[arraySize] = { , , , , };
int b[arraySize] = { , , , , }; for (int i = ; i < arraySize; i++) {
a[i] = rand() % (MAX + - MIN) + MIN;
b[i] = rand() % (MAX + - MIN) + MIN;
}
int c[arraySize] = { };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = ;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for (int i = ; i < num; i++) {
cudaGetDeviceProperties(&prop, i);
} float time;
cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
printf("Elasped time of stream is : %f \n", time);
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
cudaStatus = addWithCuda(c, a, b, arraySize, &time, );
printf("Elasped time of Block is : %f \n", time);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]); cudaStatus = addWithCuda(c, a, b, arraySize, &time, );
printf("Elasped time of thread is : %f \n", time);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]); cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
printf("Elasped time of stream is : %f \n", time);
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
a[arraySize - - ], a[arraySize - - ], a[arraySize - - ],
a[arraySize - - ], a[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], b[arraySize - - ], b[arraySize - - ],
b[arraySize - - ], c[arraySize - - ], c[arraySize - - ],
c[arraySize - - ], c[arraySize - - ], c[arraySize - - ]);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCudaStream failed!");
return ;
}
// cudaThreadExit must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaThreadExit();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaThreadExit failed!");
return ;
}
return ;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
float* etime) {
int *dev_a = ;
int *dev_b = ;
int *dev_c = ;
clock_t start, stop;
float time;
cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStream_t stream[];
for (int i = ; i < ; i++) {
cudaStreamCreate(&stream[i]); //创建流
}
// Launch a kernel on the GPU with one thread for each element.
for (int i = ; i < ; i++) {
addKernel<<<, , , stream[i]>>>(dev_c + i, dev_a + i, dev_b + i); //执行流
}
start = clock();
cudaDeviceSynchronize();
stop = clock();
time = (float) (stop - start) / CLOCKS_PER_SEC;
*etime = time;
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaThreadSynchronize returned error code %d after launching addKernel!\n",
cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error: for (int i = ; i < ; i++) {
cudaStreamDestroy(stream[i]); //销毁流
}
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
float * etime, int type) {
int *dev_a = ;
int *dev_b = ;
int *dev_c = ;
clock_t start, stop;
float time;
cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
} if (type == ) {
start = clock();
addKernel<<<size, >>>(dev_c, dev_a, dev_b);
} else {
start = clock();
addKernelThread<<<, size>>>(dev_c, dev_a, dev_b);
}
stop = clock();
time = (float) (stop - start) / CLOCKS_PER_SEC;
*etime = time;
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr,
"cudaThreadSynchronize returned error code %d after launching addKernel!\n",
cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error: cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}

如上文的实现程序,使用了thread并行,block并行,stream并行三种,使用三种方法法进行了五次计算,发现stream第一次计算时会出错,调用的子程序没有变化,没有搞懂?

Elasped time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elasped time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}

cuda并行计算的几种模式的更多相关文章

  1. 对称加密和分组加密中的四种模式(ECB、CBC、CFB、OFB)

    一. AES对称加密: AES加密 分组 二. 分组密码的填充 分组密码的填充 e.g.: PKCS#5填充方式 三. 流密码:   四. 分组密码加密中的四种模式: 3.1 ECB模式 优点: 1. ...

  2. win7 64位下自行编译OpenCV2.4.10+CUDA toolkit 5.5的整个过程以及需要注意的问题(opencv+cuda并行计算元素的使用)

           首先说明的是,这个帖子是成功的编译了dll,但是这个dll使用的时候还是很容易出现各种问题的. 发现错误可能是由于系统安装了太多版本的opencv,环境变量的设置混乱,造成dll版本加载 ...

  3. Spark On Yarn的两种模式yarn-cluster和yarn-client深度剖析

    Spark On Yarn的优势 每个Spark executor作为一个YARN容器(container)运行.Spark可以使得多个Tasks在同一个容器(container)里面运行 1. Sp ...

  4. AES加密的四种模式详解

    对称加密和分组加密中的四种模式(ECB.CBC.CFB.OFB) 一. AES对称加密:                                                       A ...

  5. Hadoop hadoop的介绍和几种模式

    Hadoop简介 Hadoop软件库是一个开源框架,允许使用简单的编程模型跨计算机集群分布式处理大型数据集.它旨在从单个服务器扩展到数千台计算机,每台计算机都提供本地计算和存储.库本身不是依靠硬件来提 ...

  6. hadoop(1)---hadoop的介绍和几种模式。

    一.什么是hadoop? Hadoop软件库是一个开源框架,允许使用简单的编程模型跨计算机集群分布式处理大型数据集.它旨在从单个服务器扩展到数千台计算机,每台计算机都提供本地计算和存储.库本身不是依靠 ...

  7. javascript 创建对象的7种模式

    使用字面量方式创建一个 student 对象: var student = function (){ name : "redjoy", age : 21, sex: women, ...

  8. javascript面向对象系列第二篇——创建对象的5种模式

    × 目录 [1]字面量 [2]工厂模式 [3]构造函数[4]原型模式[5]组合模式 前面的话 如何创建对象,或者说如何更优雅的创建对象,一直是一个津津乐道的话题.本文将从最简单的创建对象的方式入手,逐 ...

  9. javascript创建对象的几种模式

    在js中有几种模式可以创建对象,通过对象操作所包含的属性与方法. 一般来说,构造函数名称的第一个字母为大写字母,非构造函数名称的第一个字母为小写字母,当然,构造函数与一般函数唯一的区别只是调用的方式不 ...

随机推荐

  1. 优秀工具推荐:两款很棒的 HTML5 游戏开发工具

    HTML5 众多强大特性让我们不需要多么高深技术就能创建好玩的网页游戏,同时证明了开放的 Web 技术能与任何其他在游戏开发中使用的技术竞争.正如标题所说,这篇文章推荐的几款很棒 HTML5 游戏开发 ...

  2. SharePoint 2013 列表关于大数据的测试<二>

    1.给测试列表添加查阅项字段,100个,代码如下: 2.插入测试数据的方法,注意查阅项字段的格式,代码如下: 3.插入10w条数据,时间花费如下(不建议List[LISTNAME].Items.Add ...

  3. Atitit jOrgChart的使用  组织架构图css html

    Atitit jOrgChart的使用  组织架构图css html 1. 项目要做组织架构图,要把它做成自上而下的树形结构,于是决定1 2. Html导入 以来的css js1 2.1. 数据来源 ...

  4. svn(http)

    对于有些公司的运维  他确实很垃圾   所以  你一定要谨记这一点!!!

  5. Xcode cannot launch because the device is locked.

    When you plug in your iPhone, it will ask you to trust the computer. If you already trust and unlock ...

  6. 【Android自定义控件】支持多层嵌套RadioButton的RadioGroup

    前言 非常喜欢用RadioButton+RadioGroup做Tabs,能自动处理选中等效果,但是自带的RadioGroup不支持嵌套RadioButton(从源码可看出仅仅是判断子控件是不是Radi ...

  7. PHPMailer不能发送邮件

    PHPMailer不能连接SMTP服务器,和修改SMTP大小写没有关系 (2011-10-22 12:17:35) 转载▼ 标签: php phpmailer 杂谈 分类: 默认分类 PHPmaile ...

  8. css浮动

    一.浮动介绍 历史: 浮动属性产生之初是为了实现“文字环绕”的效果,让文字环绕图片在网页实现类似word中“图文混排”. 定位方式: 浮动让元素脱离正常流,向父容器的左边或右边移动直到碰到包含容器的边 ...

  9. WebMatrix之WebMatrix.Data

    WebMatrix之WebMatrix.Data WebMatrix数据访问系列目次: WebMatrix之数据访问 WebMatrix之WebMatrix.Data WebMatrix之WebMat ...

  10. yii2集成富文本编辑器redactor

    作者:白狼 出处:http://www.manks.top/article/yii2_redactor本文版权归作者,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保 ...