CUDA入门1

1. GPUs can handle thousands of concurrent threads.

2. The pieces of code running on the GPU are called kernels.

3. A kernel is executed by a set of threads.

4. All threads execute the same code (SPMD).

5. Each thread has an index that it uses to calculate the memory addresses it will access.

1. Threads are grouped into blocks.

2. Blocks are grouped into a grid.

3. A kernel is executed as a grid of blocks of threads.

Built-in variables: threadIdx, blockIdx, blockDim, gridDim.

CUDA的线程组织即Grid-Block-Thread结构。一组线程并行处理可以组织为一个block，而一组block并行处理可以组织为一个Grid。下面的程序分别演示线程并行和块并行：线程并行为细粒度的并行，而块并行为粗粒度的并行。例如，线程并行的内核启动方式为（1个block、size个线程）：addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include <stdio.h>

 #include <time.h>

 #include <stdlib.h>

 #define MAX 255

 #define MIN 0

 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float* etime);

 // Element-wise vector add, one element per thread (fine-grained parallelism).
 //
 // Launch layout: the element index is taken from threadIdx.x only, so the
 // kernel is meant to be launched with a single block; with several blocks
 // every block would recompute the same elements.
 // There is no bounds check: a, b and c must each hold at least blockDim.x
 // ints and must be pointers the device can dereference.
 __global__ void addKernelThread(int *c, const int *a, const int *b)
 {
     const int tid = threadIdx.x;
     const int sum = a[tid] + b[tid];
     c[tid] = sum;
 }

 // Element-wise vector add, one element per block (coarse-grained parallelism).
 //
 // Launch layout: the element index is taken from blockIdx.x only, so the
 // kernel is meant to be launched with one thread per block; extra threads in
 // a block would all recompute that block's single element.
 // There is no bounds check: a, b and c must each hold at least gridDim.x
 // ints and must be pointers the device can dereference.
 __global__ void addKernelBlock(int *c, const int *a, const int *b)
 {
     const int bid = blockIdx.x;
     const int sum = a[bid] + b[bid];
     c[bid] = sum;
 }

 // Demo driver: fills two 5-element vectors with pseudo-random values in
 // [MIN, MAX], adds them on the GPU twice (first with one thread per element,
 // then with one block per element) and prints the kernel time and the result
 // of each run.
 //
 // Returns 0 on success and 1 if any CUDA call fails.
 int main()
 {
     const int arraySize = 5;

     int a[arraySize] = { 0 };
     int b[arraySize] = { 0 };
     int c[arraySize] = { 0 };

     // rand() is not seeded here, so the inputs repeat from run to run.
     for (int i = 0; i < arraySize; i++) {
         a[i] = rand() % (MAX + 1 - MIN) + MIN;
         b[i] = rand() % (MAX + 1 - MIN) + MIN;
     }

     cudaError_t cudaStatus;
     int num = 0;
     float time = 0.0f;
     cudaDeviceProp prop;

     // Enumerate the available devices. The properties are fetched but not
     // used any further by this demo.
     cudaStatus = cudaGetDeviceCount(&num);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaGetDeviceCount failed!");
         return 1;
     }
     for (int i = 0; i < num; i++) {
         cudaStatus = cudaGetDeviceProperties(&prop, i);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "cudaGetDeviceProperties failed!");
             return 1;
         }
     }

     // Run 1: thread-level parallelism (type 0 -> addKernelThread).
     // The status is checked before anything is printed so that a failed run
     // never reports a stale time or result.
     cudaStatus = addWithCuda(c, a, b, arraySize, 0, &time);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }
     printf("Elasped time of thread is : %f \n", time);
     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
            a[0], a[1], a[2], a[3], a[4],
            b[0], b[1], b[2], b[3], b[4],
            c[0], c[1], c[2], c[3], c[4]);

     // Clear the output so the second printout can only show values written
     // by the block-parallel run.
     for (int i = 0; i < arraySize; i++) {
         c[i] = 0;
     }

     // Run 2: block-level parallelism (any non-zero type -> addKernelBlock).
     cudaStatus = addWithCuda(c, a, b, arraySize, 1, &time);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }
     printf("Elasped time of block is : %f \n", time);
     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
            a[0], a[1], a[2], a[3], a[4],
            b[0], b[1], b[2], b[3], b[4],
            c[0], c[1], c[2], c[3], c[4]);

     // cudaDeviceReset (the replacement for the deprecated cudaThreadExit)
     // must be called before exiting in order for profiling and tracing tools
     // such as Nsight and Visual Profiler to show complete traces.
     cudaStatus = cudaDeviceReset();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceReset failed!");
         return 1;
     }

     return 0;
 }

 // Helper function for using CUDA to add vectors in parallel.
 //
 // Copies the host vectors a and b (size ints each) to the GPU, computes
 // c[i] = a[i] + b[i] there, and copies the result back into the host
 // vector c.
 //
 //   type  == 0 : one block of `size` threads  (addKernelThread<<<1, size>>>)
 //   type  != 0 : `size` blocks of one thread  (addKernelBlock<<<size, 1>>>)
 //   etime      : receives the kernel execution time in seconds, measured
 //                with CUDA events; left untouched if the call fails.
 //
 // Returns cudaSuccess, or the first CUDA error encountered. All device
 // buffers and events are released on every path.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float * etime)
 {
     // Every local is declared (and initialised) up front because the error
     // handling below uses goto, which must not jump over an initialisation.
     int *dev_a = NULL;
     int *dev_b = NULL;
     int *dev_c = NULL;
     cudaEvent_t startEvent = NULL;
     cudaEvent_t stopEvent = NULL;
     float elapsedMs = 0.0f;
     cudaError_t cudaStatus = cudaSuccess;
     const size_t bytes = size * sizeof(int);
     const unsigned int count = (unsigned int)size;

     // Nothing to add: a launch with a zero dimension would be rejected.
     if (size == 0)
     {
         *etime = 0.0f;
         return cudaSuccess;
     }

     // Launch dimensions are unsigned int; refuse sizes that would truncate.
     if ((size_t)count != size)
     {
         fprintf(stderr, "addWithCuda: size is too large for a kernel launch!");
         return cudaErrorInvalidValue;
     }

     // Choose which GPU to run on, change this on a multi-GPU system.
     cudaStatus = cudaSetDevice(0);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // Events used to time the kernel on the device.
     cudaStatus = cudaEventCreate(&startEvent);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaEventCreate failed!");
         goto Error;
     }

     cudaStatus = cudaEventCreate(&stopEvent);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaEventCreate failed!");
         goto Error;
     }

     // Allocate GPU buffers for three vectors (two input, one output).
     cudaStatus = cudaMalloc((void**)&dev_c, bytes);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_a, bytes);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_b, bytes);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy input vectors from host memory to GPU buffers.
     cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Kernel launches are asynchronous, so a host clock read right after the
     // launch would only measure launch overhead. Bracket the launch with
     // events recorded on the same (default) stream instead.
     cudaStatus = cudaEventRecord(startEvent, 0);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaEventRecord failed!");
         goto Error;
     }

     // Launch a kernel on the GPU with one thread (or one block) per element.
     if (type == 0)
     {
         addKernelThread<<<1, count>>>(dev_c, dev_a, dev_b);
     }
     else
     {
         addKernelBlock<<<count, 1>>>(dev_c, dev_a, dev_b);
     }

     // A launch does not return a status itself; an invalid configuration
     // (e.g. more threads per block than the device supports) shows up here.
     cudaStatus = cudaGetLastError();
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
         goto Error;
     }

     cudaStatus = cudaEventRecord(stopEvent, 0);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaEventRecord failed!");
         goto Error;
     }

     // cudaDeviceSynchronize waits for the kernel to finish, and returns
     // any errors encountered during execution.
     cudaStatus = cudaDeviceSynchronize();
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
         goto Error;
     }

     cudaStatus = cudaEventElapsedTime(&elapsedMs, startEvent, stopEvent);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaEventElapsedTime failed!");
         goto Error;
     }

     // cudaEventElapsedTime reports milliseconds; callers expect seconds.
     *etime = elapsedMs / 1000.0f;

     // Copy output vector from GPU buffer to host memory.
     cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess)
     {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

 Error:
     // Shared cleanup for the success and failure paths.
     if (startEvent != NULL)
     {
         cudaEventDestroy(startEvent);
     }
     if (stopEvent != NULL)
     {
         cudaEventDestroy(stopEvent);
     }

     // cudaFree accepts a null pointer, so buffers that were never allocated
     // are safe to pass.
     cudaFree(dev_c);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
 }

运行的结果是

Elasped time of thread is : 0.000010
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
Elasped time of block is : 0.000005
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}

CUDA入门1的更多相关文章

CUDA入门
CUDA入门鉴于自己的毕设需要使用GPU CUDA这项技术,想找一本入门的教材,选择了Jason Sanders等所著的书<CUDA By Example an Introduction to ...
一篇不错的CUDA入门
鉴于自己的毕设需要使用GPU CUDA这项技术,想找一本入门的教材,选择了Jason Sanders等所著的书<CUDA By Example an Introduction to Genera ...
CUDA入门需要知道的东西
CUDA刚学习不久,做毕业要用,也没时间研究太多的东西,我的博客里有一些我自己看过的东西,不敢保证都特别有用,但是至少对刚入门的朋友或多或少希望对大家有一点帮助吧,若果你是大牛请指针不对的地方,如果你 ...
Cuda入门笔记
最近在学cuda ,找了好久入门的教程,感觉入门这个教程比较好,网上买的书基本都是在掌握基础后才能看懂,所以在这里记录一下.百度文库下载,所以不知道原作者是谁,向其致敬! 文章目录 1. CUDA是什 ...
CUDA 入门（转）
CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架构.做图像视觉领域的同学多多少少都会接触到CUDA,毕竟要做性能速度优化,CUDA是个很重要 ...
CUDA编程->CUDA入门了解（一）
安装好CUDA6.5+VS2012,操作系统为Win8.1版本号,首先下个GPU-Z检測了一下: 看出本显卡属于中低端配置.关键看两个: Shaders=384.也称作SM.或者说core/流处理器数 ...
CUDA中Bank conflict冲突
转自:http://blog.csdn.net/smsmn/article/details/6336060 其实这两天一直不知道什么叫bank conflict冲突,这两天因为要看那个矩阵转置优化的问 ...
【CUDA】CUDA框架介绍
引用出自Bookc的博客,链接在此http://bookc.github.io/2014/05/08/my-summery-the-book-cuda-by-example-an-introduct ...
转：ubuntu 下GPU版的 tensorflow / keras的环境搭建
http://blog.csdn.net/jerr__y/article/details/53695567 前言:本文主要介绍如何在 ubuntu 系统中配置 GPU 版本的 tensorflow 环 ...

随机推荐

【JS复习笔记】04 数组
JS里的数组其实并不是一个数组,它其实是一个对象,a[1]这种调用方式其实就是一个字面量为1的属性. 因为这东西实际上是一个对象,所以你就可以理解下面这种声明了吧! var arrName=['我可以 ...
sql:MySQL 6.7 表,视图,存储过程结构查询
#数据库MySQL 6.7 use sakila; #查询表名 show tables; # SELECT TABLE_NAME,TABLE_ROWS FROM INFORMATION_SCHEMA. ...
为什么.NET感觉上比Java差一点
其实,我本人很喜欢.NET技术.工作经历中,大部分时间也在使用.NET开发. 这几年,由于工作的原因,开始进入Java+Linux世界. 今年,开始学习Python和Scala. 使用.NET时,有种 ...
【JVM】2、关于jdk7的MethodHandle类
关于MethodHandle类,这个类是在jdk1.7之后加入的,这个类的作用类似函数指针的意思这个类中有一个方法这里我的jdk有一个问题,就是我在进行MethodHandle操作的时候,我们会发 ...
windows下使用makecert命令生成自签名证书
1.makecert命令路径 C:\Program Files (x86)\Windows Kits\8.1\bin\x64 2.生成一个自签名证书 makecert -r -pe -n " ...
mysql metadata lock锁
很多情况下,很多问题从理论上或者管理上而言都是可以避免或者说很好解决的,但是一旦涉及到现实由于管理或者协调或者规范执行的不够到位,就会出现各种各样本不该出现的问题,这些问题的通常在生产环境并不会出现, ...
PHP调用SQL Server存储过程
一.安装SQL Server Driver for PHP 在微软官网上发现了这个东西,他提供了一套PHP对MS2005/2008操作的全新函数库,并且支持UTF8,作为PHP的扩展运行.看来 ...
arcgis andriod 长按获得当前信息
// 长按显示鼠标点坐标及比例尺 private class myLongPressListener implements OnLongPressListener { private static f ...
android 保存用户名和密码设置等应用信息优化
1.传统的保存用户名,密码方式 SharedPreferences Editor editor = shareReference.edit(); editor.putString(KEY_NAME,& ...
UIWebView用法详解及代码分享
今天我们来详细UIWebView用法.UIWebView是iOS内置的浏览器控件,可以浏览网页.打开文档等能够加载html/htm.pdf.docx.txt等格式的文件. 用UIWebView我们就 ...

CUDA入门1

CUDA入门1的更多相关文章

随机推荐

热门专题