CUDA入门1
1GPUs can handle thousands of concurrent threads.
2The pieces of code running on the gpu are called kernels
3A kernel is executed by a set of threads.
4All threads execute the same code (SPMD)
5Each thread has an index that is used to calculate memory addresses that this will access.


1Threads are grouped into blocks
2 Blocks are grouped into a grid
3 A kernel is executed as a grid of blocks of threads
Built-in variables ⎯ threadIdx, blockIdx ⎯ blockDim, gridDim

CUDA的线程组织即Grid-Block-Thread结构。一组线程并行处理可以组织为一个block,而一组block并行处理可以组织为一个Grid。下面的程序分别为线程并行和块并行,线程并行为细粒度的并行,而块并行为粗粒度的并行。addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h> #define MAX 255
#define MIN 0
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float* etime);
__global__ void addKernelThread(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
__global__ void addKernelBlock(int *c, const int *a, const int *b)
{
int i = blockIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = ; int a[arraySize] = { , , , , };
int b[arraySize] = { , , , , }; for (int i = ; i< arraySize ; i++){
a[i] = rand() % (MAX + - MIN) + MIN;
b[i] = rand() % (MAX + - MIN) + MIN;
}
int c[arraySize] = { };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = ; float time;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for(int i = ;i<num;i++)
{
cudaGetDeviceProperties(&prop,i);
} cudaStatus = addWithCuda(c, a, b, arraySize,,&time); printf("Elasped time of thread is : %f \n", time);
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",a[],a[],a[],a[],a[],b[],b[],b[],b[],b[],c[],c[],c[],c[],c[]); cudaStatus = addWithCuda(c, a, b, arraySize,,&time); printf("Elasped time of block is : %f \n", time); if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "addWithCuda failed!");
return ;
}
printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",a[],a[],a[],a[],a[],b[],b[],b[],b[],b[],c[],c[],c[],c[],c[]);
// cudaThreadExit must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaThreadExit();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadExit failed!");
return ;
}
return ;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,int type,float * etime)
{
int *dev_a = ;
int *dev_b = ;
int *dev_c = ;
clock_t start, stop;
float time;
cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
} // Launch a kernel on the GPU with one thread for each element.
if(type == ){
start = clock();
addKernelThread<<<, size>>>(dev_c, dev_a, dev_b);
}
else{
start = clock();
addKernelBlock<<<size, >>>(dev_c, dev_a, dev_b);
} stop = clock();
time = (float)(stop-start)/CLOCKS_PER_SEC;
*etime = time;
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
运行的结果是
Elasped time of thread is : 0.000010
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
Elasped time of block is : 0.000005
{103,105,81,74,41} + {198,115,255,236,205} = {301,220,336,310,246}
CUDA入门1的更多相关文章
- CUDA入门
CUDA入门 鉴于自己的毕设需要使用GPU CUDA这项技术,想找一本入门的教材,选择了Jason Sanders等所著的书<CUDA By Example an Introduction to ...
- 一篇不错的CUDA入门
鉴于自己的毕设需要使用GPU CUDA这项技术,想找一本入门的教材,选择了Jason Sanders等所著的书<CUDA By Example an Introduction to Genera ...
- CUDA入门需要知道的东西
CUDA刚学习不久,做毕业要用,也没时间研究太多的东西,我的博客里有一些我自己看过的东西,不敢保证都特别有用,但是至少对刚入门的朋友或多或少希望对大家有一点帮助吧,若果你是大牛请指针不对的地方,如果你 ...
- Cuda入门笔记
最近在学cuda ,找了好久入门的教程,感觉入门这个教程比较好,网上买的书基本都是在掌握基础后才能看懂,所以在这里记录一下.百度文库下载,所以不知道原作者是谁,向其致敬! 文章目录 1. CUDA是什 ...
- CUDA 入门(转)
CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架构.做图像视觉领域的同学多多少少都会接触到CUDA,毕竟要做性能速度优化,CUDA是个很重要 ...
- CUDA编程->CUDA入门了解(一)
安装好CUDA6.5+VS2012,操作系统为Win8.1版本号,首先下个GPU-Z检測了一下: 看出本显卡属于中低端配置.关键看两个: Shaders=384.也称作SM.或者说core/流处理器数 ...
- CUDA中Bank conflict冲突
转自:http://blog.csdn.net/smsmn/article/details/6336060 其实这两天一直不知道什么叫bank conflict冲突,这两天因为要看那个矩阵转置优化的问 ...
- 【CUDA】CUDA框架介绍
引用 出自Bookc的博客,链接在此http://bookc.github.io/2014/05/08/my-summery-the-book-cuda-by-example-an-introduct ...
- 转:ubuntu 下GPU版的 tensorflow / keras的环境搭建
http://blog.csdn.net/jerr__y/article/details/53695567 前言:本文主要介绍如何在 ubuntu 系统中配置 GPU 版本的 tensorflow 环 ...
随机推荐
- hadoop2.2.0伪分布式搭建2--安装JDK
2.1上传FileZilla 上传 https://filezilla-project.org/ 2.2解压jdk #创建文件夹 mkdir /usr/java #解压 tar -zxvf jdk-7 ...
- 关于网络上的各种mysql性能测试结论
关于网上的各种性能测试帖子,我想说以下几点: 1.为了使性能测试更加的客观.实际,应该说明针对什么场景进行测试,查询.还是修改,是否包含了主键,包含了几个索引,各自的差别是什么.因为不同的mysql分 ...
- SQL数据库基础(三)
认识数据库备份和事务日志备份 数据库备份与日志备份是数据库维护的日常工作,备份的目的是在于当数据库出现故障或者遭到破坏时可以根据备份的数据库及事务日志文件还原到最近的时间点将损失降到最低点. 数据库备 ...
- mongodb 基本指令学习
启动 : 1)创建一个文件夹存放mongodb的数据 启动的时候指定这个文件夹为存储mongodb的存储路径 我的目录是D:\data 2)启动mongodb服务 进入安装mongodb的bin ...
- 初识Asp.net Identity
第一篇,多多指教啦! 之前做asp.net的网站只知道Asp.net的身份验证方式有:Windows验证和Forms验证.今天初步了解了下asp.net的Identity技术,顺带了解了它之前的Mem ...
- javascript宿主对象之window.frames
window.frames属性是当前页面所有框架的集合.要注意的事,这里并没有frame和iframe做出区分.而且,无论页面存不存在框架,window.frames属性总是存在的,并总是指向wind ...
- Microsoft Dynamics CRM 2013 CD-KEY
Microsoft Dynamics CRM Workgroup Server 2013 (5 CAL limit):NX77Y-BTBCV-JP3T3-8W7JH-94QJP Microsoft D ...
- 转:【前端福利】用grunt搭建自动化的web前端开发环境-完整教程
原文地址:http://blog.csdn.net/wangfupeng1988/article/details/46418203 jQuery在使用grunt,bootstrap在使用grunt,百 ...
- 每日vim插件--vim中的文本对象及相关插件
最近在个人博客上 http://foocoder.com 每天都会介绍一个vim插件,想起来园子也好久没更新了,也来更新一篇. 今天按读者留言的要求,介绍下文本对象.同时还会介绍我在用的几个文本相关 ...
- STL--向量(vector)
STL的组成 标准模板库STL关注的重点是泛型数据结构和算法,其关键组成部分是容器(containers).算法(algorithms).迭代器(iterators).函数对象(Function Ob ...