cuda编程-矩阵乘法(2)
采用shared memory加速

代码
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <algorithm>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "functions.h" #define TILE_SIZE 16 __global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
__shared__ float tile_A[TILE_SIZE][TILE_SIZE];
__shared__ float tile_B[TILE_SIZE][TILE_SIZE];
unsigned int tx = threadIdx.x;
unsigned int ty = threadIdx.y;
unsigned int gx = blockIdx.x * TILE_SIZE + tx;
unsigned int gy = blockIdx.y * TILE_SIZE + ty;
if (gx >= width || gy >= height)
return; // Load shared memory
int tile_num = (width + TILE_SIZE - ) / TILE_SIZE;
float sum = ;
for (int i = ; i < tile_num; ++i){
int bound = min(width, TILE_SIZE);
for (int j = tx; j < bound; j += blockDim.x){
tile_A[ty][j] = A[gy * width + i * bound + j];
}
for (int j = ty; j < bound; j += blockDim.y){
tile_B[j][tx] = B[(i * bound + j) * width + gx];
}
//Synchronize to make sure the sub-matrices are loaded before starting the computation
__syncthreads(); for (int j = ; j < bound; ++j){
sum += tile_A[ty][j] * tile_B[j][tx];
}
//Synchronize to make sure that the preceding computation is done before loading two new
//sub-matrices of M and N in the next iteration
__syncthreads();
}
C[gy*width + gx] = sum;
} void constantInit(float *data, int size, float val){
for (int i = ; i < size; ++i){
data[i] = val;
}
} void matrixMul(){
int dev_id = ;
cudaSetDevice(dev_id); // Allocate host memory for matrices A and B
int width = ;
int height = ;
unsigned int size = width * height;
unsigned int mem_size = sizeof(float)* size;
float *h_A = (float *)malloc(mem_size);
float *h_B = (float *)malloc(mem_size);
float *h_C = (float *)malloc(mem_size); // Initialize host memory
const float valB = 0.01f;
constantInit(h_A, size, 1.0f);
constantInit(h_B, size, valB); // Allocate device memory
float *d_A, *d_B, *d_C;
cudaMalloc((void **)&d_A, mem_size);
cudaMalloc((void **)&d_B, mem_size);
cudaMalloc((void **)&d_C, mem_size); // Memcpy
cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice); // Config dim
dim3 block(TILE_SIZE, TILE_SIZE);
dim3 grid((width + block.x - ) / block.x, (height + block.y - ) / block.y);
matrixMulKernel <<<grid, block >>>(d_C, d_A, d_B, width, height); // Memcpy device to host
cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost); // Check
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula // |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = .e-;
// machine zero
for (int i = ; i < (int)(width * height); i++) {
double abs_err = fabs(h_C[i] - (width * valB));
double dot_length = width;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (abs_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], (float)(width*height), eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
}
合并访存:tile_A按行存储,tile_B按列存储,sum=row_tile_A * row_tile_B
__global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
__shared__ float tile_A[TILE_SIZE][TILE_SIZE];
__shared__ float tile_B[TILE_SIZE][TILE_SIZE];
unsigned int tx = threadIdx.x;
unsigned int ty = threadIdx.y;
unsigned int gx = blockIdx.x * TILE_SIZE + tx;
unsigned int gy = blockIdx.y * TILE_SIZE + ty;
if (gx >= width || gy >= height)
return;
// Load shared memory
int tile_num = (width + TILE_SIZE - ) / TILE_SIZE;
float sum = ;
for (int i = ; i < tile_num; ++i){
tile_A[tx][ty] = A[gy * width + i * TILE_SIZE + tx];
tile_B[ty][tx] = B[(i * TILE_SIZE + ty) * width + gx];
//Synchronize to make sure the sub-matrices are loaded before starting the computation
__syncthreads();
for (int j = ; j < TILE_SIZE; ++j){
sum += tile_A[j][ty] * tile_B[j][tx];
}
//Synchronize to make sure that the preceding computation is done before loading two new
//sub-matrices of M and N in the next iteration
__syncthreads();
}
C[gy*width + gx] = sum;
}
cuda编程-矩阵乘法(2)的更多相关文章
- cuda编程-矩阵乘法(1)
本方法采用简单的单线程计算每组行和列乘加运算 代码如下: #include <stdio.h> #include <stdlib.h> #include <iostrea ...
- cuda(2) 矩阵乘法优化过程
Created on 2013-8-5URL : http://blog.sina.com.cn/s/blog_a502f1a30101mjch.html@author: zhxfl转载请说明出处 # ...
- CUDA编程之快速入门
CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架构.做图像视觉领域的同学多多少少都会接触到CUDA,毕竟要做性能速度优化,CUDA是个很重要 ...
- CUDA编程之快速入门【转】
https://www.cnblogs.com/skyfsm/p/9673960.html CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架 ...
- 详解CUDA编程
CUDA 是 NVIDIA 的 GPGPU 模型,它使用 C 语言为基础,可以直接以大多数人熟悉的 C 语言,写出在显示芯片上执行的程序,而不需要去学习特定的显示芯片的指令或是特殊的结构.” 编者注: ...
- CUDA 矩阵乘法终极优化指南
作者:马骏 | 旷视 MegEngine 架构师 前言 单精度矩阵乘法(SGEMM)几乎是每一位学习 CUDA 的同学绕不开的案例,这个经典的计算密集型案例可以很好地展示 GPU 编程中常用的优化技巧 ...
- OpenCL 矩阵乘法
▶ 矩阵乘法,按照书里的内容进行了几方面的优化,包括局部内存,矢量数据类型,寄存器,流水线等. ● 最直接的乘法.调用时 main.c 中使用 size_t globalSize[] = { rowA ...
- 【Cuda编程】加法归约
目录 cuda编程并行归约 AtomicAdd调用出错 gpu cpu下时间计算 加法的归约 矩阵乘法 矩阵转置 统计数目 平方和求和 分块处理 线程相邻 多block计算 cuda编程并行归约 At ...
- CUDA编程(十)使用Kahan's Summation Formula提高精度
CUDA编程(十) 使用Kahan's Summation Formula提高精度 上一次我们准备去并行一个矩阵乘法.然后我们在GPU上完毕了这个程序,当然是非常单纯的把任务分配给各个线程.也没有经过 ...
随机推荐
- poj-1330(暴力写的lca)
传送门 一看就是lca的板子题 然而 (写这个的时候我忘了怎么写lca) 于是我就试着写暴力了 本以为会tle结果e了一次后居然a掉了 开心到起飞.嘿嘿嘿 但还是格式输出错误了一次而且在ce之前也de ...
- C#总结(五)调用C++动态库(类型对照)
函数调用导致堆栈不对称.原因可能是托管的 PInvoke 签名与非托管的目标签名不匹配. 在dllimport中加入CallingConvention参数就行了, [DllImport(PCAP_DL ...
- MangoDB高级应用
MongoDB高级应用 Author:SimpleWu 聚合 聚合(aggregate)主要用于处理数据(诸如统计平均值,求和等),并返回计算后的数据结果.有点类似sql语句中的 count(*). ...
- Python Revisited Day 03 (组合数据类型)
目录 第三章 组合数据类型 3.1 序列类型 3.1.1 元组 3.1.2 命名的元组 (collections.nametuple()) 3.1.3 列表 (查询有关函数点这) 3.1.4 列表内涵 ...
- Bus Video System CodeForces - 978E (思维)
The busses in Berland are equipped with a video surveillance system. The system records information ...
- Karen and Coffee CodeForces - 816B (差分数组+预处理前缀和)
To stay woke and attentive during classes, Karen needs some coffee! Karen, a coffee aficionado, want ...
- #Leetcode# 633. Sum of Square Numbers
https://leetcode.com/problems/sum-of-square-numbers/ Given a non-negative integer c, your task is to ...
- swagger 指定字段不显示到文档里
Swagger UI 隐藏指定接口类或方法 - 宁静致远 - CSDN博客https://blog.csdn.net/lqh4188/article/details/53538201 swagger ...
- Tomcat Cluster
Tomcat群集配置| Tomcat集群| MuleSofthttps://www.mulesoft.com/tcat/tomcat-cluster Tomcat Clustering - A Ste ...
- JS 验证输入框输入 只允许输入正实数(正整数,正小数),其他情况下不能输入 oninput事件
input标签的oninput事件 要求输入框只输入正实数,包括整数和小数. 具体要求:整数部分不超过7位,可以没有小数,若有位数不超过2位. <input type="text&qu ...