- GPUs can handle thousands of concurrent threads.

- The pieces of code that run on the GPU are called kernels.

- A kernel is executed by a set of threads.

- All threads execute the same code (SPMD).

- Each thread has an index that is used to calculate the memory addresses it will access.

- Threads are grouped into blocks.

- Blocks are grouped into a grid.

- A kernel is therefore executed as a grid of blocks of threads.

- Built-in variables (threadIdx, blockIdx, blockDim, gridDim) identify a thread's position within this hierarchy; see the sketch after this list.
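
As a minimal sketch of how these built-in variables are typically combined (illustrative code, not part of the original notes; the kernel name and launch configuration are assumptions), each thread can derive a unique global index from its block and thread coordinates:

#include <stdio.h>

// Illustrative kernel: every thread prints its coordinates and the global
// 1-D index derived from them. gridDim.x * blockDim.x is the total number
// of threads launched.
__global__ void showIndexing(void)
{
    int globalIdx    = blockIdx.x * blockDim.x + threadIdx.x;
    int totalThreads = gridDim.x * blockDim.x;
    printf("block %d, thread %d -> global index %d of %d\n",
           blockIdx.x, threadIdx.x, globalIdx, totalThreads);
}

// Possible launch: a grid of 2 blocks with 4 threads each (8 threads total).
// showIndexing<<<2, 4>>>();
// cudaDeviceSynchronize();   // wait so the device-side printf output appears

In real kernels this global index decides which array elements a given thread reads and writes, exactly as the addition kernels below do with threadIdx.x and blockIdx.x alone.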

CUDA organizes threads in a Grid-Block-Thread hierarchy: a group of threads running in parallel forms a block, and a group of blocks running in parallel forms a grid. The program below demonstrates both thread parallelism and block parallelism; thread parallelism is fine-grained, while block parallelism is coarse-grained. The thread-parallel version is launched as addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);

 #include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h> #define MAX 255
#define MIN 0
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float* etime);
__global__ void addKernelThread(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
__global__ void addKernelBlock(int *c, const int *a, const int *b)
{
int i = blockIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = ; int a[arraySize] = { , , , , };
int b[arraySize] = { , , , , }; for (int i = ; i< arraySize ; i++){
a[i] = rand() % (MAX + - MIN) + MIN;
b[i] = rand() % (MAX + - MIN) + MIN;
}
int c[arraySize] = { };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = ; float time;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for(int i = ;i<num;i++)
{
cudaGetDeviceProperties(&prop,i);
} cudaStatus = addWithCuda(c, a, b, arraySize,,&time); printf("Elasped time of thread is : %f \n", time);
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",a[],a[],a[],a[],a[],b[],b[],b[],b[],b[],c[],c[],c[],c[],c[]); cudaStatus = addWithCuda(c, a, b, arraySize,,&time); printf("Elasped time of block is : %f \n", time); if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "addWithCuda failed!");
return ;
}
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",a[],a[],a[],a[],a[],b[],b[],b[],b[],b[],c[],c[],c[],c[],c[]);
// cudaThreadExit must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaThreadExit();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadExit failed!");
return ;
}
return ;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float * etime)
{
int *dev_a = ;
int *dev_b = ;
int *dev_c = ;
clock_t start, stop;
float time;
cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
} // Launch a kernel on the GPU with one thread for each element.
if(type == ){
start = clock();
addKernelThread<<<, size>>>(dev_c, dev_a, dev_b);
}
else{
start = clock();
addKernelBlock<<<size, >>>(dev_c, dev_a, dev_b);
} stop = clock();
time = (float)(stop-start)/CLOCKS_PER_SEC;
*etime = time;
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}

The output is:

Elapsed time of thread is : 0.000010
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
Elapsed time of block is : 0.000005
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
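
Note that kernel launches are asynchronous, so the clock() readings above mostly capture launch overhead rather than the time the kernels spend executing. As a hedged sketch (not part of the original program; the kernel name, the block size of 256, and the variable names are assumptions), thread and block parallelism are usually combined into a single grid of blocks of threads, and CUDA events give a more reliable timing:

// Sketch: one kernel that uses both blocks and threads, so the grid can
// cover arrays of any length n, plus event-based timing of the launch.
__global__ void addKernelGrid(int *c, const int *a, const int *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // global element index
    if (i < n)                                      // guard the partial last block
        c[i] = a[i] + b[i];
}

// Usage, assuming dev_a, dev_b and dev_c were allocated and filled the same
// way as in addWithCuda above:
// const int threadsPerBlock = 256;
// int blocksPerGrid = ((int)size + threadsPerBlock - 1) / threadsPerBlock;
// cudaEvent_t evStart, evStop;
// cudaEventCreate(&evStart);
// cudaEventCreate(&evStop);
// cudaEventRecord(evStart, 0);
// addKernelGrid<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, dev_b, (int)size);
// cudaEventRecord(evStop, 0);
// cudaEventSynchronize(evStop);                 // wait for the kernel to finish
// float ms = 0.0f;
// cudaEventElapsedTime(&ms, evStart, evStop);   // elapsed time in milliseconds
// cudaEventDestroy(evStart);
// cudaEventDestroy(evStop);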
