CUDA 并行计算的几种模式

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include <stdio.h>

 #include <time.h>

 #include <stdlib.h>

 #define MAX 120

 #define MIN 0

 // Forward declaration: host helper (defined later in this file) that adds
 // `size` ints from a and b into c on the GPU using CUDA streams, and stores
 // the measured time through etime. Returns the CUDA status.
 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,

         float* etime);

 // Forward declaration: host helper (defined later in this file) that adds
 // `size` ints from a and b into c on the GPU with a single kernel launch;
 // `type` chooses between the addKernel and addKernelThread launch paths.
 // Stores the measured time through etime and returns the CUDA status.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,

         float* etime, int type);

 // Element-wise integer add, one element per block: c[k] = a[k] + b[k] where
 // k is the block's x index.
 //
 // Only blockIdx.x is used for indexing, so every thread of a block writes the
 // same element. There is no bounds check: the grid's x extent must not exceed
 // the number of elements reachable through c, a and b.
 __global__ void addKernel(int *c, const int *a, const int *b) {
     const int elem = blockIdx.x;
     const int sum = a[elem] + b[elem];
     c[elem] = sum;
 }

 // Element-wise integer add, one element per thread: c[k] = a[k] + b[k] where
 // k is the thread's x index within its block.
 //
 // Only threadIdx.x is used for indexing, so blocks of the same launch all
 // write the same elements. There is no bounds check: the block's x extent
 // must not exceed the number of elements reachable through c, a and b.
 __global__ void addKernelThread(int *c, const int *a, const int *b) {
     const int elem = threadIdx.x;
     const int sum = a[elem] + b[elem];
     c[elem] = sum;
 }

 // Prints the last five elements of a, b and c on one line in the form
 // "{a...} + {b...} = {c...}". Requires n >= 5.
 // NOTE(review): the index literals were missing from the published listing;
 // last-to-first order (n-1 .. n-5) is assumed.
 static void printLastFive(const int *a, const int *b, const int *c, int n) {
     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
             a[n - 1], a[n - 2], a[n - 3], a[n - 4], a[n - 5],
             b[n - 1], b[n - 2], b[n - 3], b[n - 4], b[n - 5],
             c[n - 1], c[n - 2], c[n - 3], c[n - 4], c[n - 5]);
 }

 // Zeroes the n-element result buffer so that a run which computes nothing
 // cannot appear correct by showing the previous run's sums.
 static void clearResult(int *c, int n) {
     for (int i = 0; i < n; i++) {
         c[i] = 0;
     }
 }

 // Demo driver: fills two arrays with random values in [MIN, MAX], adds them
 // on the GPU with the stream, block, thread and (again) stream variants, and
 // prints the elapsed time and the last five sums after each run.
 // Returns 0 on success and 1 as soon as any CUDA step fails.
 int main() {
     // NOTE(review): the array length was missing from the published listing;
     // 100 is assumed. It must be >= 5 for printLastFive, and small enough for
     // the thread variant, which launches one block of arraySize threads.
     const int arraySize = 100;

     int a[arraySize] = { 0 };
     int b[arraySize] = { 0 };
     int c[arraySize] = { 0 };

     srand((unsigned) time(NULL));
     for (int i = 0; i < arraySize; i++) {
         // NOTE(review): "+ 1" is assumed so that MAX itself can be drawn.
         a[i] = rand() % (MAX + 1 - MIN) + MIN;
         b[i] = rand() % (MAX + 1 - MIN) + MIN;
     }

     // Fail early, with a clear message, when no usable device is present.
     int deviceCount = 0;
     cudaError_t cudaStatus = cudaGetDeviceCount(&deviceCount);
     if (cudaStatus != cudaSuccess || deviceCount == 0) {
         fprintf(stderr, "cudaGetDeviceCount failed or no CUDA device found!");
         return 1;
     }

     // Named "elapsed" rather than "time" so it does not shadow time().
     float elapsed = 0.0f;

     // Stream-parallel run.
     cudaStatus = addWithCudaStream(c, a, b, arraySize, &elapsed);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCudaStream failed!");
         return 1;
     }
     printf("Elasped time of stream is : %f \n", elapsed);
     printLastFive(a, b, c, arraySize);

     // Block-parallel run.
     // NOTE(review): type 0 is assumed to select the block variant.
     clearResult(c, arraySize);
     cudaStatus = addWithCuda(c, a, b, arraySize, &elapsed, 0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }
     printf("Elasped time of Block is : %f \n", elapsed);
     printLastFive(a, b, c, arraySize);

     // Thread-parallel run.
     // NOTE(review): type 1 is assumed to select the thread variant.
     clearResult(c, arraySize);
     cudaStatus = addWithCuda(c, a, b, arraySize, &elapsed, 1);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }
     printf("Elasped time of thread is : %f \n", elapsed);
     printLastFive(a, b, c, arraySize);

     // Stream-parallel run again, for comparison with the first one.
     clearResult(c, arraySize);
     cudaStatus = addWithCudaStream(c, a, b, arraySize, &elapsed);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCudaStream failed!");
         return 1;
     }
     printf("Elasped time of stream is : %f \n", elapsed);
     printLastFive(a, b, c, arraySize);

     // cudaDeviceReset (the replacement for the deprecated cudaThreadExit) must
     // be called before exiting so that profiling and tracing tools such as
     // Nsight and Visual Profiler show complete traces.
     cudaStatus = cudaDeviceReset();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceReset failed!");
         return 1;
     }

     return 0;
 }

 // Adds two host vectors on the GPU, splitting the work across several CUDA
 // streams: each stream runs addKernel on one contiguous chunk of elements.
 //
 // Parameters:
 //   c     - host output buffer, `size` ints
 //   a, b  - host input buffers, `size` ints each
 //   size  - number of elements; 0 is accepted and does nothing. Each chunk is
 //           launched as a grid of one block per element, so a chunk length is
 //           limited by the device's maximum grid x dimension.
 //   etime - receives the time, in seconds, from the first kernel launch until
 //           all streams have finished; set to 0 when nothing ran or on failure.
 //           Measured with clock(), which reports processor time, so treat it
 //           as a rough figure. May be NULL.
 //
 // Returns cudaSuccess, or the status of the first CUDA call that failed.
 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
         float* etime) {
     // NOTE(review): the stream count was missing from the published listing;
     // 5 is assumed.
     const int kNumStreams = 5;

     // Everything with an initializer is declared before the first goto so
     // that no jump to Error crosses an initialization.
     int *dev_a = NULL;
     int *dev_b = NULL;
     int *dev_c = NULL;
     cudaStream_t stream[kNumStreams];
     int streamsCreated = 0;      // only these are destroyed during cleanup
     const size_t bytes = size * sizeof(int);
     const size_t chunk = (size + kNumStreams - 1) / kNumStreams;  // ceil-div
     size_t offset = 0;
     clock_t start, stop;
     cudaError_t cudaStatus;

     if (etime != NULL) {
         *etime = 0.0f;
     }
     if (size == 0) {
         return cudaSuccess;      // a zero-sized grid is an invalid launch
     }

     // Choose which GPU to run on, change this on a multi-GPU system.
     cudaStatus = cudaSetDevice(0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr,
                 "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // Allocate GPU buffers for three vectors (two input, one output).
     cudaStatus = cudaMalloc((void**) &dev_c, bytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }
     cudaStatus = cudaMalloc((void**) &dev_a, bytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }
     cudaStatus = cudaMalloc((void**) &dev_b, bytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy input vectors from host memory to GPU buffers.
     cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }
     cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Create the streams, counting successes so cleanup never touches a
     // handle that was not created.
     for (int i = 0; i < kNumStreams; i++) {
         cudaStatus = cudaStreamCreate(&stream[i]);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaStreamCreate failed!");
             goto Error;
         }
         streamsCreated++;
     }

     // Launch one grid per stream. Together the chunks must cover every
     // element: the whole buffer is copied back below, so any element that no
     // kernel writes would come back as uninitialized device memory.
     start = clock();
     for (int i = 0; i < kNumStreams && offset < size; i++) {
         const size_t remaining = size - offset;
         const size_t len = (remaining < chunk) ? remaining : chunk;
         // addKernel indexes by blockIdx.x, so a chunk of len elements needs
         // len blocks of one thread, with the pointers advanced to the chunk.
         addKernel<<<(unsigned int) len, 1, 0, stream[i]>>>(dev_c + offset,
                 dev_a + offset, dev_b + offset);
         offset += len;
     }

     // Kernel launches return no status; configuration errors surface here.
     cudaStatus = cudaGetLastError();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addKernel launch failed: %s\n",
                 cudaGetErrorString(cudaStatus));
         goto Error;
     }

     // cudaDeviceSynchronize waits for all streams to finish and returns any
     // error encountered while the kernels were running.
     cudaStatus = cudaDeviceSynchronize();
     stop = clock();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr,
                 "cudaDeviceSynchronize returned error code %d after launching addKernel!\n",
                 cudaStatus);
         goto Error;
     }
     if (etime != NULL) {
         *etime = (float) (stop - start) / CLOCKS_PER_SEC;
     }

     // Copy output vector from GPU buffer to host memory.
     cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

 Error:
     // Destroy only the streams that were actually created.
     for (int i = 0; i < streamsCreated; i++) {
         cudaStreamDestroy(stream[i]);
     }
     cudaFree(dev_c);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
 }

 // Adds two host vectors on the GPU with a single kernel launch, either
 // block-parallel or thread-parallel.
 //
 // Parameters:
 //   c     - host output buffer, `size` ints
 //   a, b  - host input buffers, `size` ints each
 //   size  - number of elements; 0 is accepted and does nothing
 //   etime - receives the time, in seconds, from the kernel launch until the
 //           kernel has finished; set to 0 when nothing ran or on failure.
 //           Measured with clock(), which reports processor time, so treat it
 //           as a rough figure. May be NULL.
 //   type  - 0 launches addKernel with `size` blocks of one thread; any other
 //           value launches addKernelThread with one block of `size` threads.
 //           NOTE(review): the compared literal was missing from the published
 //           listing; 0 is assumed.
 //
 // In thread mode `size` must not exceed the device's maximum threads per
 // block; an oversized launch is reported and returned as an error.
 //
 // Returns cudaSuccess, or the status of the first CUDA call that failed.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
         float * etime, int type) {
     int *dev_a = NULL;
     int *dev_b = NULL;
     int *dev_c = NULL;
     const size_t bytes = size * sizeof(int);
     clock_t start, stop;
     cudaError_t cudaStatus;

     if (etime != NULL) {
         *etime = 0.0f;
     }
     if (size == 0) {
         return cudaSuccess;      // a zero-sized grid/block is an invalid launch
     }

     // Choose which GPU to run on, change this on a multi-GPU system.
     cudaStatus = cudaSetDevice(0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr,
                 "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // Allocate GPU buffers for three vectors (two input, one output).
     cudaStatus = cudaMalloc((void**) &dev_c, bytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }
     cudaStatus = cudaMalloc((void**) &dev_a, bytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }
     cudaStatus = cudaMalloc((void**) &dev_b, bytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy input vectors from host memory to GPU buffers.
     cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }
     cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Launches are asynchronous: the timed region runs from the launch to the
     // end of the synchronize below, otherwise only launch overhead is seen.
     start = clock();
     if (type == 0) {
         // Block-parallel: one block per element, one thread per block.
         addKernel<<<(unsigned int) size, 1>>>(dev_c, dev_a, dev_b);
     } else {
         // Thread-parallel: a single block with one thread per element.
         addKernelThread<<<1, (unsigned int) size>>>(dev_c, dev_a, dev_b);
     }

     // Kernel launches return no status; configuration errors (for example
     // too many threads per block) surface here.
     cudaStatus = cudaGetLastError();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "kernel launch failed: %s\n",
                 cudaGetErrorString(cudaStatus));
         goto Error;
     }

     // cudaDeviceSynchronize waits for the kernel to finish and returns any
     // error encountered while it was running.
     cudaStatus = cudaDeviceSynchronize();
     stop = clock();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr,
                 "cudaDeviceSynchronize returned error code %d after launching addKernel!\n",
                 cudaStatus);
         goto Error;
     }
     if (etime != NULL) {
         *etime = (float) (stop - start) / CLOCKS_PER_SEC;
     }

     // Copy output vector from GPU buffer to host memory.
     cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

 Error:
     cudaFree(dev_c);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
 }

如上文的实现程序，使用了 thread 并行、block 并行、stream 并行三种方式，用这三种方法进行了五次计算，发现 stream 第一次计算时结果会出错，而调用的子程序并没有变化，没有搞懂原因。

Elasped time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elasped time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}

cuda并行计算的几种模式的更多相关文章

对称加密和分组加密中的四种模式(ECB、CBC、CFB、OFB)
一. AES对称加密: AES加密分组二. 分组密码的填充分组密码的填充 e.g.: PKCS#5填充方式三. 流密码: 四. 分组密码加密中的四种模式: 3.1 ECB模式优点: 1. ...
win7 64位下自行编译OpenCV2.4.10+CUDA toolkit 5.5的整个过程以及需要注意的问题（opencv+cuda并行计算元素的使用）
首先说明的是,这个帖子是成功的编译了dll,但是这个dll使用的时候还是很容易出现各种问题的. 发现错误可能是由于系统安装了太多版本的opencv,环境变量的设置混乱,造成dll版本加载 ...
Spark On Yarn的两种模式yarn-cluster和yarn-client深度剖析
Spark On Yarn的优势每个Spark executor作为一个YARN容器(container)运行.Spark可以使得多个Tasks在同一个容器(container)里面运行 1. Sp ...
AES加密的四种模式详解
对称加密和分组加密中的四种模式(ECB.CBC.CFB.OFB) 一. AES对称加密: A ...
Hadoop hadoop的介绍和几种模式
Hadoop简介 Hadoop软件库是一个开源框架,允许使用简单的编程模型跨计算机集群分布式处理大型数据集.它旨在从单个服务器扩展到数千台计算机,每台计算机都提供本地计算和存储.库本身不是依靠硬件来提 ...
hadoop(1)---hadoop的介绍和几种模式。
一.什么是hadoop? Hadoop软件库是一个开源框架,允许使用简单的编程模型跨计算机集群分布式处理大型数据集.它旨在从单个服务器扩展到数千台计算机,每台计算机都提供本地计算和存储.库本身不是依靠 ...
javascript 创建对象的7种模式
使用字面量方式创建一个 student 对象: var student = function (){ name : "redjoy", age : 21, sex: women, ...
javascript面向对象系列第二篇——创建对象的5种模式
× 目录 [1]字面量 [2]工厂模式 [3]构造函数[4]原型模式[5]组合模式前面的话如何创建对象,或者说如何更优雅的创建对象,一直是一个津津乐道的话题.本文将从最简单的创建对象的方式入手,逐 ...
javascript创建对象的几种模式
在js中有几种模式可以创建对象,通过对象操作所包含的属性与方法. 一般来说,构造函数名称的第一个字母为大写字母,非构造函数名称的第一个字母为小写字母,当然,构造函数与一般函数唯一的区别只是调用的方式不 ...

随机推荐

15款优雅的 WordPress 电子商务网站主题
WordPress 电子商务网站主题今年非常流行,特别是对那些想要在几分钟内创建一个在线商店,但又没有掌握网络开发的很多知识的人来说.WordPress 是一个功能强大的 CMS,它的灵活性和可用性是 ...
emberjs学习二(ember-data和localstorage_adapter)
emberjs学习二(ember-data和localstorage_adapter) 准备工作首先我们加入ember-data和ember-localstorage-adapter两个依赖项,使用 ...
Error: Error setting TTL index on collection : sessions
Error: Error setting TTL index on collection : sessions 一.步骤一: 这个问题一般是直接升级 mongodb和connect-mongo的版本为 ...
移动Web开发的bug及解决方案
我目前移动Web开发遇到的bug以及解决方案(慢慢补充当中). 1.android4.0以上一部分手机的webview中,当canvas小于屏幕大小时,绘图时会出现重影,就是说一个图只绘制了一遍,却出 ...
每次点击按钮后，判断页面是否已经有该行，没有弹出repeater的一行，并给他赋一个这行附值，没有则跳出
protected void btnAdd_click(object sender, EventArgs e) { try { //记录第几次追加 pressCount++; typeString.A ...
选中repeater表格中的一行使其变色
//table表中点击行,选中的那一行颜色会变成浅蓝色(颜色可以自己设定) //其中sellerTable 为table的id, $("#sellerTable tbody tr" ...
“破解大牛是怎么炼成的”之壳与ESP定律
文章难易度:★★★ 文章阅读点/知识点:逆向破解文章作者:Sp4ce 文章来源:i春秋关键字:网络信息安全技术本文参与i春秋社区原创文章奖励计划,未经许可禁止转载! 一.前言通过前面几篇 ...
N900快捷键
Ctrl + C 复制文本 Ctrl + V 粘贴文本 Ctrl + X 剪切文本 Ctrl + A 全部选择 Ctrl + O 打开 Ctrl + N 新建 Ctrl + S 保存 Ctrl + Z ...
UIView中的坐标转换
在使用 UITableViewCell 的frame属性获取origin得到的坐标是不变的. 也就是说如果UITableView初始化完毕后,每个cell的坐标是固定的,x不变,y 随index递增的 ...
Map集合概述
java集合最后一站之Map,给自己的总结画个句号... Map用于保存具有映射关系的数据. 1.HashMap和Hashtable实现类 HashMap和Hashtable都是Map接口的典型实现类 ...

cuda并行计算的几种模式

cuda并行计算的几种模式的更多相关文章

随机推荐

热门专题