【Cuda编程】加法归约
cuda编程并行归约
AtomicAdd调用出错
在 CUDA 中调用 atomicAdd 函数,但总显示未定义标识符。在网上搜了一下,于是做了如下修改:
右键解决方案属性-》配置属性-》CUDA C/C++-》Device-》Code Generation,加入compute_20,sm_20,并且把下面的“从父级或项目属性默认设置继承”的勾选去掉
gpu cpu下时间计算
// CPU side: measure elapsed processor time with clock()
#include <time.h>
clock_t start,end;
start = clock();
// CPU code to be timed goes here
end = clock();
// clock() returns ticks, not a time unit; scale by CLOCKS_PER_SEC so the
// value is in milliseconds and comparable with the GPU event timing.
printf("CPU Time: %.5f\n", (float)(end-start) * 1000.0f / CLOCKS_PER_SEC);
// GPU side: measure elapsed time with a pair of CUDA events
cudaEvent_t st,ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
// record the start event on stream 0
cudaEventRecord(st,0);
// GPU code to be timed goes here
// record the stop event on stream 0
cudaEventRecord(ed,0);
// block the host until the stop event has actually completed
cudaEventSynchronize(ed);
float gpu_time;
// elapsed time between the two events (the CUDA runtime reports milliseconds)
cudaEventElapsedTime(&gpu_time,st,ed);
printf("GPU Time: %.5f\n",gpu_time);
// release both events
cudaEventDestroy(st);
cudaEventDestroy(ed);
加法的归约
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <book.h>
// Total number of input elements; main allocates Size floats on host and device.
const int Size = 256;
// Number of thread blocks in the launch; main allocates one output float per block.
const int block = 8;
// Threads per block in the launch. block * thread == Size.
const int thread = 32;
/*
 * Per-block sum reduction using interleaved addressing with a modulo test.
 *
 * Each block reduces the blockDim.x consecutive floats starting at
 * in + blockIdx.x * blockDim.x and writes the block's sum to out[blockIdx.x].
 *
 * Launch layout: 1-D grid, 1-D block, expected blockDim.x <= thread.
 * NOTE: the reduction is done in place, so the contents of `in` are
 * overwritten with partial sums.
 *
 * in  - device pointer to gridDim.x * blockDim.x floats (modified)
 * out - device pointer to gridDim.x floats
 */
__global__ void calc(float *in, float *out){
    unsigned int tid = threadIdx.x;
    // slice of the input owned by this block
    float * target = in + blockIdx.x * blockDim.x;
    // Threads at or beyond the configured width do no arithmetic. This is a
    // predicate instead of an early return because every thread of the block
    // must still reach each __syncthreads() below.
    const bool active = tid < thread;
    for(unsigned int stride = 1 ; stride < blockDim.x ; stride *= 2)
    {
        // tid + stride bound keeps the read inside this block's slice when
        // blockDim.x is not a power of two
        if(active && tid % (stride*2) == 0 && tid + stride < blockDim.x)
        {
            target[tid] += target[tid+stride];
        }
        __syncthreads();
    }
    // thread 0 holds the block total
    if(tid == 0)
    {
        out[blockIdx.x] = target[0];
    }
}
/*
 * Per-block sum reduction using interleaved addressing with contiguous
 * active threads (thread t handles element 2*stride*t).
 *
 * Each block reduces the blockDim.x consecutive floats starting at
 * in + blockIdx.x * blockDim.x and writes the block's sum to out[blockIdx.x].
 *
 * Launch layout: 1-D grid, 1-D block, expected blockDim.x <= thread.
 * NOTE: the reduction is done in place, so `in` is overwritten.
 *
 * in  - device pointer to gridDim.x * blockDim.x floats (modified)
 * out - device pointer to gridDim.x floats
 */
__global__ void calc2(float *in, float *out)
{
    unsigned int tid = threadIdx.x;
    float * target = in + blockIdx.x * blockDim.x;
    // Predicate rather than early return: all threads must reach the barriers.
    const bool active = tid < thread;
    // stride = 1,2,4,8,...
    for(unsigned int stride = 1 ; stride < blockDim.x ; stride *= 2)
    {
        unsigned int index = 2*stride*tid;
        // second test keeps the partner element inside this block's slice
        if(active && index < blockDim.x && index + stride < blockDim.x)
            target[index] += target[index+stride];
        __syncthreads();
    }
    // element 0 holds the block total
    if(tid == 0)
    {
        out[blockIdx.x] = target[0];
    }
}
/*
 * Strided (sequential-addressing) per-block sum reduction.
 *
 * Each step folds the upper half of the active range onto the lower half:
 * stride = blockDim.x/2, blockDim.x/4, ..., 1. The block's sum is written to
 * out[blockIdx.x].
 *
 * Launch layout: 1-D grid, 1-D block, expected blockDim.x <= thread.
 * Halving the stride only covers every element when blockDim.x is a power
 * of two.
 * NOTE: the reduction is done in place, so `in` is overwritten.
 *
 * in  - device pointer to gridDim.x * blockDim.x floats (modified)
 * out - device pointer to gridDim.x floats
 */
__global__ void calc3(float *in, float *out)
{
    unsigned int tid = threadIdx.x;
    float * target = in + blockIdx.x * blockDim.x;
    // Predicate rather than early return: all threads must reach the barriers.
    const bool active = tid < thread;
    for(unsigned int stride = blockDim.x/2 ; stride > 0 ; stride /= 2)
    {
        if(active && tid < stride)
            target[tid] += target[tid+stride];
        __syncthreads();
    }
    // element 0 holds the block total
    if(tid == 0)
    {
        out[blockIdx.x] = target[0];
    }
}
/*
 * Strided per-block sum reduction staged through shared memory.
 *
 * Each block copies its slice of `in` into a __shared__ buffer of `thread`
 * floats, reduces it there (stride = blockDim.x/2 ... 1) and writes the
 * block's sum to out[blockIdx.x]. Unlike the in-place variants, `in` is not
 * modified.
 *
 * Launch layout: 1-D grid, 1-D block, blockDim.x <= thread (the shared buffer
 * has exactly `thread` entries). Halving the stride only covers every element
 * when blockDim.x is a power of two.
 *
 * in  - device pointer to gridDim.x * blockDim.x floats
 * out - device pointer to gridDim.x floats
 */
__global__ void calc4(float *in, float *out)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    float * target = in + bid * blockDim.x;
    __shared__ float share_in[thread];
    // The original guard was inverted (`tid < thread` returned), which made
    // every thread exit before doing any work. Only threads that fit in the
    // shared buffer take part; the rest still reach the barriers.
    const bool active = tid < thread;
    if(active)
        share_in[tid] = target[tid];
    __syncthreads();
    for(int stride = blockDim.x/2 ; stride > 0; stride /= 2)
    {
        // tid + stride < thread keeps the read inside the shared buffer
        if(active && tid < stride && tid + stride < thread)
        {
            share_in[tid] += share_in[tid+stride];
        }
        __syncthreads();
    }
    // element 0 holds the block total
    if(tid == 0)
    {
        out[blockIdx.x] = share_in[0];
    }
}
/*
 * Host driver for the block-wise sum reduction.
 *
 * Fills Size floats with 0..Size-1, runs calc4 with `block` blocks of `thread`
 * threads, sums the per-block partial results on the host and prints the
 * elapsed GPU time (copies included) together with the total.
 */
int main()
{
    // host buffers
    float * indata  = (float*)malloc(sizeof(float)*Size);   // Size inputs
    float * outdata = (float*)malloc(sizeof(float)*block);  // one partial sum per block
    float * ans     = (float*)malloc(sizeof(float));        // final total
    if(indata == NULL || outdata == NULL || ans == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(indata);
        free(outdata);
        free(ans);
        return 1;
    }
    // device buffers
    float * dev_indata;   // Size inputs
    float * dev_outdata;  // one partial sum per block
    HANDLE_ERROR(cudaMalloc((void**)&dev_indata,sizeof(float)*Size));
    HANDLE_ERROR(cudaMalloc((void**)&dev_outdata,sizeof(float)*block));
    // init & generate data
    for(int i = 0 ; i < Size ; i++)
    {
        indata[i] = i;
    }
    *ans = 0;
    // time start
    cudaEvent_t st,ed;
    HANDLE_ERROR(cudaEventCreate(&st));
    HANDLE_ERROR(cudaEventCreate(&ed));
    HANDLE_ERROR(cudaEventRecord(st,0));
    // host -> device
    HANDLE_ERROR(cudaMemcpy(dev_indata,indata,sizeof(float)*Size,cudaMemcpyHostToDevice));
    // kernel launch
    calc4<<<block,thread>>>(dev_indata,dev_outdata);
    // a launch does not return an error itself; pick up configuration errors here
    HANDLE_ERROR(cudaGetLastError());
    // and execution errors here
    HANDLE_ERROR(cudaDeviceSynchronize());
    // device -> host
    HANDLE_ERROR(cudaMemcpy(outdata,dev_outdata,sizeof(float)*block,cudaMemcpyDeviceToHost));
    // time end
    HANDLE_ERROR(cudaEventRecord(ed,0));
    HANDLE_ERROR(cudaEventSynchronize(ed));
    float gpu_time;
    HANDLE_ERROR(cudaEventElapsedTime(&gpu_time,st,ed));
    // combine the per-block partial sums on the host
    for(int i = 0 ; i < block ; i++)
    {
        *ans += outdata[i];
    }
    printf("GPU Time: %.5f\nAns: %.5f\n",gpu_time,*ans);
    // destroy timing events
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
    // Release device memory with cudaFree and host memory with free. The
    // original passed the host pointers to cudaFree, which is invalid and
    // left both device buffers allocated.
    cudaFree(dev_indata);
    cudaFree(dev_outdata);
    free(indata);
    free(outdata);
    free(ans);
    getchar();
    return 0;
}
矩阵乘法
#include <stdlib.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cuda.h>
#include <device_launch_parameters.h>
// Matrix dimension: all matrices are N x N and stored as flat arrays of N*N ints.
const int N = 20;
/*
 * Naive N x N integer matrix multiply: out = a * b, all row-major.
 *
 * Thread (threadIdx.x, threadIdx.y) computes out[threadIdx.x][threadIdx.y].
 * blockIdx is not used, so a single block of at least N x N threads covers
 * the whole result.
 *
 * a, b - device pointers to N*N ints (read only)
 * out  - device pointer to N*N ints
 */
__global__ void mul(int *a,int* b,int *out)
{
    unsigned int tidx = threadIdx.x;  // row of the result
    unsigned int tidy = threadIdx.y;  // column of the result
    // Bound row and column separately. The original `offset > N*N` test let
    // offset == N*N through and could not catch tidy >= N, which aliases
    // onto another row.
    if(tidx >= N || tidy >= N) return;
    unsigned int offset = tidx*N + tidy;
    int t = 0;
    for(int i = 0 ; i < N ; i++)
    {
        t += a[tidx*N+i]*b[i*N+tidy];
    }
    out[offset] = t;
}
/*
 * Host driver for the matrix multiply demo.
 *
 * Fills two N x N matrices with 1..N*N, prints the CPU product, then runs
 * `mul` on the GPU, prints the GPU product and the elapsed GPU time
 * (host<->device copies included, host printing excluded).
 */
int main()
{
    const size_t bytes = sizeof(int)*N*N;
    // host
    int * matrix1 = (int*)malloc(bytes);
    int * matrix2 = (int*)malloc(bytes);
    int * output  = (int*)malloc(bytes);
    // device
    int * dev_matrix1;
    int * dev_matrix2;
    int * dev_output;
    cudaMalloc((void**)&dev_matrix1,bytes);
    cudaMalloc((void**)&dev_matrix2,bytes);
    cudaMalloc((void**)&dev_output,bytes);
    // init generate data
    for(int i = 0 ; i < N*N ; i++)
    {
        matrix1[i] = i+1;
        matrix2[i] = i+1;
        output[i] = 0;
    }
    // CPU reference, printed row by row on one line
    for(int i = 0 ; i < N ; i++)
    {
        for(int j = 0 ; j < N ; j++){
            int tp = 0;
            for(int k = 0 ; k < N ; k++)
            {
                tp += matrix1[i*N+k] * matrix2[k*N+j];
            }
            printf("%d ",tp);
        }
    }
    printf("\n----------\n");
    // time start
    cudaEvent_t st,ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    cudaEventRecord(st,0);
    // host -> device
    cudaMemcpy(dev_matrix1,matrix1,bytes,cudaMemcpyHostToDevice);
    cudaMemcpy(dev_matrix2,matrix2,bytes,cudaMemcpyHostToDevice);
    // The kernel indexes by threadIdx only, so one N x N block computes the
    // whole product; a second block would just repeat the same writes.
    mul<<<1,dim3(N,N)>>>(dev_matrix1,dev_matrix2,dev_output);
    cudaError_t launch_err = cudaGetLastError();
    if(launch_err != cudaSuccess)
    {
        fprintf(stderr,"mul launch failed: %s\n",cudaGetErrorString(launch_err));
    }
    // device -> host (blocking, so the kernel has finished afterwards)
    cudaMemcpy(output,dev_output,bytes,cudaMemcpyDeviceToHost);
    // time end: stop before printing so host I/O is not counted as GPU time
    cudaEventRecord(ed,0);
    cudaEventSynchronize(ed);
    float gpu_time;
    cudaEventElapsedTime(&gpu_time,st,ed);
    // output
    for(int i = 0 ; i < N*N ; i++)
    {
        printf("%d ",output[i]);
    }
    printf("\n");
    printf("gpu time: %.5f\n",gpu_time);
    // destroy timing events
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
    // release device and host memory
    cudaFree(dev_matrix1);
    cudaFree(dev_matrix2);
    cudaFree(dev_output);
    free(matrix1);
    free(matrix2);
    free(output);
    return 0;
}
矩阵转置
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Matrix dimension: the transposed matrix is N x N, stored as a flat array of N*N ints.
const int N = 5;
/*
 * Print an N x N row-major matrix: tab-separated values, one row per line,
 * followed by an empty line.
 *
 * arr - host pointer to N*N ints
 */
void output(int * arr)
{
    for(int row = 0 ; row < N ; ++row)
    {
        const int * line = arr + row * N;
        for(int col = 0 ; col < N ; ++col)
        {
            printf("%d\t",line[col]);
        }
        printf("\n");
    }
    printf("\n");
}
/*
 * Transpose an N x N row-major matrix through global memory only.
 *
 * Each thread derives a (column, row) pair from its 2-D block and thread
 * indices and, when that pair lies inside the matrix, copies in[row][col]
 * to out[col][row].
 *
 * in  - device pointer to N*N ints
 * out - device pointer to N*N ints
 */
__global__ void trans(int * in, int * out)
{
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
    // threads mapped outside the matrix have nothing to do
    if(col >= N || row >= N)
        return;
    const unsigned int src = row * N + col;
    const unsigned int dst = col * N + row;
    out[dst] = in[src];
}
/*
 * Transpose an N x N row-major matrix, staging the data in a shared tile.
 *
 * Each thread loads in[y][x] into tile[threadIdx.y][threadIdx.x]; after the
 * barrier it reads the mirrored entry tile[threadIdx.x][threadIdx.y] and
 * stores it at the block-swapped output position.
 *
 * Launch layout: 2-D block of N x N threads (block offsets are computed as
 * blockIdx * N).
 *
 * in  - device pointer to N*N ints
 * out - device pointer to N*N ints
 */
__global__ void trans2(int * in , int * out)
{
    // int tile: the data is int, staging it in float would lose precision
    __shared__ int block[N][N];
    unsigned int xIndex = blockIdx.x * N + threadIdx.x;
    unsigned int yIndex = blockIdx.y * N + threadIdx.y;
    if((xIndex < N) && (yIndex < N))
    {
        unsigned int index_in = yIndex * N + xIndex;
        // store as [y][x] so the mirrored read below performs the transpose;
        // the original wrote and read the same [x][y] slot, which only copied
        block[threadIdx.y][threadIdx.x] = in[index_in];
    }
    // every thread reaches the barrier; the guards above and below do not skip it
    __syncthreads();
    xIndex = blockIdx.y * N + threadIdx.x;
    yIndex = blockIdx.x * N + threadIdx.y;
    if((xIndex < N) && (yIndex < N))
    {
        unsigned int index_out = yIndex * N + xIndex;
        out[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
/*
 * Host driver for the transpose demo: fills an N x N matrix with 1..N*N,
 * transposes it on the GPU with `trans` (one block of N x N threads) and
 * prints the input followed by the result.
 */
int main()
{
    const size_t bytes = sizeof(int)*N*N;
    // host buffers
    int * host_src = (int*)malloc(bytes);
    int * host_dst = (int*)malloc(bytes);
    // device buffers
    int * gpu_src;
    int * gpu_dst;
    cudaMalloc((void**)&gpu_src,bytes);
    cudaMalloc((void**)&gpu_dst,bytes);
    // fill the input with 1, 2, ..., N*N
    int k = 0;
    while(k < N*N)
    {
        host_src[k] = k+1;
        ++k;
    }
    // host -> device
    cudaMemcpy(gpu_src,host_src,bytes,cudaMemcpyHostToDevice);
    // transpose on the device
    trans<<<1,dim3(N,N)>>>(gpu_src,gpu_dst);
    // device -> host
    cudaMemcpy(host_dst,gpu_dst,bytes,cudaMemcpyDeviceToHost);
    // show input, separator, result
    output(host_src);
    printf("\n--------\n");
    output(host_dst);
    // release device and host memory
    cudaFree(gpu_src);
    cudaFree(gpu_dst);
    free(host_src);
    free(host_dst);
    return 0;
}
统计数目
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Number of histogram bins; main generates symbols with `% 26`, i.e. values 0..25.
const int N = 26;
// Base length used to derive the input size S below.
const int L = 128;
// Number of input symbols in `book`.
const int S = L*4;
// Number of thread blocks in the launch.
const int block = 4;
// Threads per block in the launch.
const int thread = 32;
/*
 * Histogram of S symbols into N bins.
 *
 * Each block accumulates a private histogram in shared memory with
 * atomicAdd, walking the input in steps of blockDim.x * gridDim.x, then adds
 * its private bins to the global `record` with atomicAdd.
 *
 * Launch layout: 1-D grid, 1-D block of any size.
 * Precondition: `record` must be zeroed by the caller before the launch,
 * because the kernel only ever adds to it.
 *
 * book   - device pointer to S symbols, each expected in [0, N)
 * record - device pointer to N ints (accumulated into)
 */
__global__ void rec(char* book, int * record)
{
    unsigned int tid = threadIdx.x;
    __shared__ int temp[N];
    // Clear the private bins. The loop covers all N bins for any block size
    // and never indexes past temp[N-1]; the original `temp[tid] = 0` wrote
    // out of bounds for tid >= N.
    for(unsigned int bin = tid ; bin < N ; bin += blockDim.x)
    {
        temp[bin] = 0;
    }
    __syncthreads();
    int index = tid + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while(index < S)
    {
        // go through unsigned char so a negative char cannot index backwards
        unsigned int symbol = (unsigned char)book[index];
        if(symbol < N)
        {
            atomicAdd(&(temp[symbol]),1);
        }
        index += offset;
    }
    __syncthreads();
    // merge the private bins into the global histogram, again bounded by N
    for(unsigned int bin = tid ; bin < N ; bin += blockDim.x)
    {
        atomicAdd(&(record[bin]),temp[bin]);
    }
}
/*
 * Host driver for the histogram demo.
 *
 * Generates S pseudo-random symbols in 0..25, prints the CPU histogram, then
 * runs `rec` on the GPU and prints the GPU histogram for comparison.
 */
int main()
{
    // host
    char * book   = (char*)malloc(sizeof(char)*S);
    int  * record = (int*)malloc(sizeof(int)*N);
    // device
    char * dev_book;
    int  * dev_record;
    cudaMalloc((void**)&dev_book,sizeof(char)*S);
    cudaMalloc((void**)&dev_record,sizeof(int)*N);
    // The kernel only accumulates into dev_record, and cudaMalloc does not
    // clear memory, so the bins must be zeroed before the launch.
    cudaMemset(dev_record,0,sizeof(int)*N);
    // init
    for(int i = 0 ; i < S ; i++)
    {
        srand(i+rand());
        // unsigned arithmetic: i + i*i + rand() can overflow a signed int,
        // which could yield a negative symbol and a negative array index
        unsigned int mixed = (unsigned int)i + (unsigned int)i*(unsigned int)i + (unsigned int)rand();
        book[i] = (char)(mixed % 26u);
    }
    // CPU reference histogram
    int tp[N]={0};
    for(int i = 0 ; i < S ; i++)
    {
        tp[book[i]]++;
    }
    for(int i = 0 ; i < N ; i++)
        printf("%d ",tp[i]);
    printf("\n");
    // host -> device
    cudaMemcpy(dev_book,book,sizeof(char)*S,cudaMemcpyHostToDevice);
    // kernel launch
    rec<<<block,thread>>>(dev_book,dev_record);
    cudaError_t launch_err = cudaGetLastError();
    if(launch_err != cudaSuccess)
    {
        fprintf(stderr,"rec launch failed: %s\n",cudaGetErrorString(launch_err));
    }
    // device -> host (blocking, so the kernel has finished afterwards)
    cudaMemcpy(record,dev_record,sizeof(int)*N,cudaMemcpyDeviceToHost);
    // output
    for(int i = 0 ; i < N ; i++)
    {
        printf("%d ",record[i]);
    }
    printf("\n");
    // release device and host memory
    cudaFree(dev_book);
    cudaFree(dev_record);
    free(book);
    free(record);
    return 0;
}
平方和求和
分块处理
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
/*
* author : pprp
* theme : 平方和
*/
// Number of input elements.
const int N = 128;
// Number of chunks the input is split into; the kernel gives each thread N / block elements.
const int block = 4;
// Not referenced by the code shown in this program.
const int thread = 32;
/*
 * Sum of squares, chunked: thread t sums the squares of the N / block
 * consecutive elements starting at t * (N / block) and stores the partial
 * sum in result[t].
 *
 * arr    - device pointer to the input ints
 * result - device pointer to one int per thread
 */
__global__ void calc0(int * arr, int * result)
{
    const int tid = threadIdx.x;
    const int chunk = N / block;
    const int first = tid * chunk;
    int acc = 0;
    for(int k = 0 ; k < chunk ; ++k)
    {
        const int v = arr[first + k];
        acc += v * v;
    }
    result[tid] = acc;
}
/*
 * Host driver for the chunked sum-of-squares demo.
 *
 * Computes sum(i^2) for i = 1..N on the CPU and on the GPU (one block of
 * `block` threads, each handling one chunk) and prints both results with
 * their timings in milliseconds.
 */
int main()
{
    // host
    int * arr    = (int*)malloc(sizeof(int)*N);
    int * result = (int*)malloc(sizeof(int)*block);
    // device
    int * dev_arr;
    int * dev_result;
    cudaMalloc((void**)&dev_arr,sizeof(int)*N);
    cudaMalloc((void**)&dev_result,sizeof(int)*block);
    // init
    for(int i = 0 ; i < N ; i++)
    {
        arr[i] = i+1;
        if(i < block)
        {
            result[i] = 0;
        }
    }
    // cpu reference
    clock_t start,end;
    start = clock();
    unsigned int res = 0;
    for(int i = 0 ; i < N ; i++)
    {
        res += arr[i]*arr[i];
    }
    end = clock();
    // res is unsigned, so print it with %u; clock ticks are converted to
    // milliseconds so the figure is comparable with the GPU event timing
    printf("cpu ans : %u\ncpu time: %.5f\n",res,1000.0f*float(end-start)/CLOCKS_PER_SEC);
    // time start
    cudaEvent_t st,ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    cudaEventRecord(st,0);
    // host -> device
    cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);
    // The kernel splits the input into N / block chunks, one per thread, so
    // the block width must be `block` rather than a hard-coded 4.
    calc0<<<1,block>>>(dev_arr,dev_result);
    // device -> host
    cudaMemcpy(result,dev_result,sizeof(int)*block,cudaMemcpyDeviceToHost);
    // combine the per-thread partial sums
    int res2=0;
    for(int i = 0 ; i < block ; i++)
    {
        res2 += result[i];
    }
    // time end
    cudaEventRecord(ed,0);
    cudaEventSynchronize(ed);
    float gpu_time;
    cudaEventElapsedTime(&gpu_time,st,ed);
    printf("gpu ans :%d\ngpu time: %.5f\n",res2,gpu_time);
    // destroy timing events
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
    // release device and host memory
    cudaFree(dev_arr);
    cudaFree(dev_result);
    free(arr);
    free(result);
    return 0;
}
线程相邻
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
/*
* author : pprp
* theme : 平方和
*/
// Number of input elements.
const int N = 128;
// Number of threads in the launch; also the step each thread uses to walk the input.
const int block = 4;
// Not referenced by the code shown in this program.
const int thread = 32;
/*
 * Sum of squares, interleaved: thread t sums the squares of elements
 * t, t + block, t + 2*block, ... below N and stores the partial sum in
 * result[t].
 *
 * Launch layout: one 1-D block; only the first `block` threads do work.
 *
 * arr    - device pointer to N ints
 * result - device pointer to `block` ints
 */
__global__ void calc0(int * arr, int * result)
{
    int tid = threadIdx.x;
    // result has `block` entries, so tid == block is already out of range
    // (the original `tid > block` let it through)
    if(tid >= block)return;
    int sum = 0;
    for(int i = tid; i < N ; i+=block)
    {
        sum += arr[i]*arr[i];
    }
    result[tid] = sum;
}
/*
 * Host driver for the interleaved sum-of-squares demo.
 *
 * Computes sum(i^2) for i = 1..N on the CPU and on the GPU (one block of
 * `block` threads) and prints both results with their timings in
 * milliseconds.
 */
int main()
{
    // host
    int * arr    = (int*)malloc(sizeof(int)*N);
    int * result = (int*)malloc(sizeof(int)*block);
    // device
    int * dev_arr;
    int * dev_result;
    cudaMalloc((void**)&dev_arr,sizeof(int)*N);
    cudaMalloc((void**)&dev_result,sizeof(int)*block);
    // init
    for(int i = 0 ; i < N ; i++)
    {
        arr[i] = i+1;
        if(i < block)
        {
            result[i] = 0;
        }
    }
    // cpu reference
    clock_t start,end;
    start = clock();
    unsigned int res = 0;
    for(int i = 0 ; i < N ; i++)
    {
        res += arr[i]*arr[i];
    }
    end = clock();
    // res is unsigned, so print it with %u; clock ticks are converted to
    // milliseconds so the figure is comparable with the GPU event timing
    printf("cpu ans : %u\ncpu time: %.5f\n",res,1000.0f*float(end-start)/CLOCKS_PER_SEC);
    // time start
    cudaEvent_t st,ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    cudaEventRecord(st,0);
    // host -> device
    cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);
    // kernel launch: one block of `block` threads
    calc0<<<1,block>>>(dev_arr,dev_result);
    // device -> host
    cudaMemcpy(result,dev_result,sizeof(int)*block,cudaMemcpyDeviceToHost);
    // combine the per-thread partial sums
    int res2=0;
    for(int i = 0 ; i < block ; i++)
    {
        res2 += result[i];
    }
    // time end
    cudaEventRecord(ed,0);
    cudaEventSynchronize(ed);
    float gpu_time;
    cudaEventElapsedTime(&gpu_time,st,ed);
    printf("gpu ans :%d\ngpu time: %.5f\n",res2,gpu_time);
    // destroy timing events
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
    // release device and host memory
    cudaFree(dev_arr);
    cudaFree(dev_result);
    free(arr);
    free(result);
    return 0;
}
多block计算
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
/*
* author : pprp
* theme : 平方和
*/
// Number of input elements; main also sizes the result buffers with N.
const int N = 32;
// Number of thread blocks in the launch.
const int block = 4;
// Threads per block in the launch. block * thread == N.
const int thread = 8;
/*
 * Sum of squares across several blocks.
 *
 * Each thread starts at its global index blockIdx.x * blockDim.x + threadIdx.x
 * and advances by blockDim.x * gridDim.x while the index is below N, summing
 * the squares it visits. The partial sum is stored at the thread's global
 * index in `result` and also printed from the device.
 *
 * arr    - device pointer to N ints
 * result - device pointer with one int per launched thread
 */
__global__ void calc0(int * arr, int * result)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = blockDim.x * gridDim.x;
    int acc = 0;
    int pos = gid;
    while(pos < N)
    {
        acc += arr[pos]*arr[pos];
        pos += step;
    }
    __syncthreads();
    result[gid] = acc;
    printf("++%d \n",acc);
}
/*
 * Host driver for the multi-block sum-of-squares demo.
 *
 * Computes sum(i^2) for i = 1..N on the CPU and on the GPU (`block` blocks of
 * `thread` threads, one partial sum per thread) and prints both results with
 * their timings in milliseconds.
 */
int main()
{
    // host
    int * arr    = (int*)malloc(sizeof(int)*N);
    int * result = (int*)malloc(sizeof(int)*N);
    // device
    int * dev_arr;
    int * dev_result;
    cudaMalloc((void**)&dev_arr,sizeof(int)*N);
    cudaMalloc((void**)&dev_result,sizeof(int)*N);
    // init: clear every result slot, since all N are summed after the copy back
    // (the original cleared only the first `thread` entries)
    for(int i = 0 ; i < N ; i++)
    {
        arr[i] = i+1;
        result[i] = 0;
    }
    // cpu reference
    clock_t start,end;
    start = clock();
    unsigned int res = 0;
    for(int i = 0 ; i < N ; i++)
    {
        res += arr[i]*arr[i];
    }
    end = clock();
    // res is unsigned, so print it with %u; clock ticks are converted to
    // milliseconds so the figure is comparable with the GPU event timing
    printf("cpu ans : %u\ncpu time: %.5f\n",res,1000.0f*float(end-start)/CLOCKS_PER_SEC);
    // time start
    cudaEvent_t st,ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    cudaEventRecord(st,0);
    // host -> device
    cudaMemcpy(dev_arr,arr,sizeof(int)*N,cudaMemcpyHostToDevice);
    // kernel launch: `block` blocks of `thread` threads
    calc0<<<block,thread>>>(dev_arr,dev_result);
    // device -> host
    cudaMemcpy(result,dev_result,sizeof(int)*N,cudaMemcpyDeviceToHost);
    // combine the per-thread partial sums
    int res2=0;
    for(int i = 0 ; i < N ; i++)
    {
        res2 += result[i];
    }
    // time end
    cudaEventRecord(ed,0);
    cudaEventSynchronize(ed);
    float gpu_time;
    cudaEventElapsedTime(&gpu_time,st,ed);
    printf("gpu ans :%d\ngpu time: %.5f\n",res2,gpu_time);
    // destroy timing events
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
    // release device and host memory
    cudaFree(dev_arr);
    cudaFree(dev_result);
    free(arr);
    free(result);
    return 0;
}
【Cuda编程】加法归约的更多相关文章
- CUDA中的归约
CUDA编程实战书中的乘方和解决办法: 对一个数组执行某种计算,然后产生一个更小的结果数组. 由一个线程在共享内存上进行迭代并计算出总和值.而如果用并行,所花时间就与数组长度的对数成正比. 代码的思想 ...
- CUDA编程(六)进一步并行
CUDA编程(六) 进一步并行 在之前我们使用Thread完毕了简单的并行加速,尽管我们的程序运行速度有了50甚至上百倍的提升,可是依据内存带宽来评估的话我们的程序还远远不够.在上一篇博客中给大家介绍 ...
- CUDA编程之快速入门
CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架构.做图像视觉领域的同学多多少少都会接触到CUDA,毕竟要做性能速度优化,CUDA是个很重要 ...
- CUDA编程(十)使用Kahan's Summation Formula提高精度
CUDA编程(十) 使用Kahan's Summation Formula提高精度 上一次我们准备去并行一个矩阵乘法.然后我们在GPU上完毕了这个程序,当然是非常单纯的把任务分配给各个线程.也没有经过 ...
- CUDA编程学习笔记1
CUDA编程模型是一个异构模型,需要CPU和GPU协同工作. host和device host和device是两个重要的概念 host指代CPU及其内存 device指代GPU及其内存 __globa ...
- CUDA编程之快速入门【转】
https://www.cnblogs.com/skyfsm/p/9673960.html CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架 ...
- 不同版本CUDA编程的问题
1 无法装上CUDA的toolkit 卸载所有的NVIDIA相关的app,包括NVIDIA的显卡驱动,然后重装. 2之前的文件打不开,one or more projects in the solut ...
- cuda编程基础
转自: http://blog.csdn.net/augusdi/article/details/12529247 CUDA编程模型 CUDA编程模型将CPU作为主机,GPU作为协处理器(co-pro ...
- CUDA学习笔记(一)——CUDA编程模型
转自:http://blog.sina.com.cn/s/blog_48b9e1f90100fm56.html CUDA的代码分成两部分,一部分在host(CPU)上运行,是普通的C代码:另一部分在d ...
随机推荐
- MyEclipse10.0 注册破解步骤
MyEclipse 10.0破解 激活(java编写,适用于装有java环境的各种操作系统,Windows,Linux,MacOS) =====[方法一]=====[第一步]:输入任意用户名[第二步] ...
- 14 jmeter性能测试实战--数据库MySQL
需求 测试用户表(对用户表select操作) 测试步骤 1.MySQL驱动下载并安装. 2.测试计划面板点击“浏览”按钮,将JDBC驱动(mysql-connector-java.jar)添加进来. ...
- 010-spring cloud gateway-过滤器-自定义局部、全局过滤器、区别
一.自定义局部过滤器 自定义过滤器需要实现GatewayFilter和Ordered.其中GatewayFilter中的这个方法就是用来实现你的自定义的逻辑的 Mono<Void> fil ...
- 007-ant design 对象属性赋值,双向绑定
1.state对象属性赋值 设对象为 state={ datavalue:{ id:'', name:'', }, } 修改对象属性为: SetName=(e)=>{ let val=e.tar ...
- MySQL中MyISAM与InnoDB区别及选择,mysql添加外键
InnoDB:支持事务处理等不加锁读取支持外键支持行锁不支持FULLTEXT类型的索引不保存表的具体行数,扫描表来计算有多少行DELETE 表时,是一行一行的删除InnoDB 把数据和索引存放在表空间 ...
- 机器学习理论基础学习3.3--- Linear classification 线性分类之logistic regression(基于经验风险最小化)
一.逻辑回归是什么? 1.逻辑回归 逻辑回归假设数据服从伯努利分布,通过极大化似然函数的方法,运用梯度下降来求解参数,来达到将数据二分类的目的. logistic回归也称为逻辑回归,与线性回归这样输出 ...
- Python做接口自动化测试框架
框架结构如下: Test_Api_Project||---base.py|---base_api| |---register_api.py | |---send_sms_code_api.py|--- ...
- iOS 界面启动时,功能新特征显示
APP新启动的时候,都会有几张新的图片滑动,才能到主的界面.现在,我们新建一个控制器,专门来处理新特性,直接上代码. 第一步:新建一个NewfeatureController // // HWNewf ...
- jquery事件 on(),live(),delegate(),blind()
jQuery推出on()的目的有2个,一是为了统一接口,二是为了提高性能, 所以从现在开始用on()替换bind(), live(), delegate吧. 尤其是不要再用live()了,因为它已经处 ...
- HTML ajax 上传文件限制文件的类型和文件大小
html <input type="file" name="excel" id="excel_input" accept=&qu ...