Getting Started with CUDA (Part 1)
- GPUs can handle thousands of concurrent threads.
- The pieces of code running on the GPU are called kernels.
- A kernel is executed by a set of threads.
- All threads execute the same code (SPMD).
- Each thread has an index that is used to calculate the memory addresses it will access.

- Threads are grouped into blocks.
- Blocks are grouped into a grid.
- A kernel is executed as a grid of blocks of threads.
- Built-in variables: threadIdx, blockIdx, blockDim, gridDim (see the sketch below).
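As a minimal sketch of how these built-in variables are typically combined (the kernel name, scale factor, and launch configuration here are illustrative and not part of the program below), each thread derives a global element index from its block and thread coordinates:

// Illustrative kernel: each thread handles one element of the output.
__global__ void scaleKernel(float *out, const float *in, float factor, int n)
{
    // Global index = block offset + position within the block.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)               // guard: the last block may contain extra threads
        out[i] = in[i] * factor;
}

// Hypothetical launch: enough blocks of 256 threads to cover n elements.
// scaleKernel<<<(n + 255) / 256, 256>>>(dev_out, dev_in, 2.0f, n);

With this pattern, gridDim.x * blockDim.x threads cover the whole array regardless of its size.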

CUDA organizes threads in a Grid-Block-Thread hierarchy: a group of threads executing in parallel forms a block, and a group of blocks executing in parallel forms a grid. The program below implements vector addition with both thread parallelism and block parallelism; thread parallelism is fine-grained, while block parallelism is coarse-grained. The thread-parallel version is launched as addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h> #define MAX 255
#define MIN 0
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float* etime);
// Thread-parallel kernel: one thread per element, all threads in a single block.
__global__ void addKernelThread(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Block-parallel kernel: one block per element, one thread per block.
__global__ void addKernelBlock(int *c, const int *a, const int *b)
{
    int i = blockIdx.x;
    c[i] = a[i] + b[i];
}
int main()
{
    const int arraySize = 5;
    int a[arraySize] = { 0 };
    int b[arraySize] = { 0 };
    for (int i = 0; i < arraySize; i++) {
        a[i] = rand() % (MAX + 1 - MIN) + MIN;
        b[i] = rand() % (MAX + 1 - MIN) + MIN;
    }
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus;
    int num = 0;
    float time;
    cudaDeviceProp prop;
    cudaStatus = cudaGetDeviceCount(&num);
    for (int i = 0; i < num; i++) {
        cudaGetDeviceProperties(&prop, i);
    }

    // Fine-grained version: type 0 selects the thread-parallel kernel.
    cudaStatus = addWithCuda(c, a, b, arraySize, 0, &time);
    printf("Elapsed time of thread is : %f \n", time);
    printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
           a[0], a[1], a[2], a[3], a[4],
           b[0], b[1], b[2], b[3], b[4],
           c[0], c[1], c[2], c[3], c[4]);

    // Coarse-grained version: type 1 selects the block-parallel kernel.
    cudaStatus = addWithCuda(c, a, b, arraySize, 1, &time);
    printf("Elapsed time of block is : %f \n", time);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
           a[0], a[1], a[2], a[3], a[4],
           b[0], b[1], b[2], b[3], b[4],
           c[0], c[1], c[2], c[3], c[4]);

    // cudaThreadExit must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaThreadExit();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaThreadExit failed!");
        return 1;
    }
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
// type == 0 launches the thread-parallel kernel, otherwise the block-parallel one;
// the measured time is returned through etime.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size, int type, float *etime)
{
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    clock_t start, stop;
    float time;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread (or one block) per element.
    // Note: kernel launches are asynchronous, so the clock() interval below
    // measures launch overhead rather than kernel execution time.
    if (type == 0)
    {
        start = clock();
        addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);
    }
    else
    {
        start = clock();
        addKernelBlock<<<size, 1>>>(dev_c, dev_a, dev_b);
    }
    stop = clock();
    time = (float)(stop - start) / CLOCKS_PER_SEC;
    *etime = time;

    // cudaThreadSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaThreadSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
The output is:
Elapsed time of thread is : 0.000010
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
Elapsed time of block is : 0.000005
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
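Both times are only a few microseconds because the clock() interval in addWithCuda brackets an asynchronous kernel launch, so it mostly captures launch overhead. A minimal sketch of a more accurate measurement, written as a replacement for the clock()-based block inside addWithCuda (the event variable names are illustrative), uses CUDA events recorded on the same stream as the launch:

    // Sketch: time the kernel with CUDA events instead of clock().
    cudaEvent_t evStart, evStop;
    float ms;
    cudaEventCreate(&evStart);
    cudaEventCreate(&evStop);

    cudaEventRecord(evStart, 0);                 // record on the default stream
    if (type == 0)
        addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);
    else
        addKernelBlock<<<size, 1>>>(dev_c, dev_a, dev_b);
    cudaEventRecord(evStop, 0);
    cudaEventSynchronize(evStop);                // wait for the kernel and the stop event

    cudaEventElapsedTime(&ms, evStart, evStop);  // milliseconds between the two events
    *etime = ms / 1000.0f;                       // keep the caller's unit (seconds)

    cudaEventDestroy(evStart);
    cudaEventDestroy(evStop);

Because the events are recorded into the GPU's stream, the elapsed time reflects actual kernel execution rather than host-side launch latency.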