cuda编程-矩阵乘法（2）

采用shared memory加速

代码

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <algorithm>

#include <cuda_runtime.h>

#include <device_launch_parameters.h>

#include "functions.h"

#define TILE_SIZE 16

/*
 * Tiled matrix multiplication C = A * B using shared memory.
 *
 * Layout as indexed by this kernel (all row-major):
 *   A : height x width   (element (r, k) at A[r * width + k])
 *   B : width  x width   (element (k, c) at B[k * width + c])
 *   C : height x width   (element (r, c) at C[r * width + c])
 *
 * Launch requirements:
 *   - blockDim must be (TILE_SIZE, TILE_SIZE); each thread produces one
 *     element of C and loads one element of each tile per iteration.
 *   - gridDim must cover ceil(width / TILE_SIZE) x ceil(height / TILE_SIZE).
 *   - Uses 2 * TILE_SIZE * TILE_SIZE floats of static shared memory.
 *
 * Threads that fall outside the matrix do NOT return early: every thread of
 * the block has to reach both __syncthreads() barriers in every iteration,
 * and edge threads still take part in loading the tiles. Out-of-range tile
 * entries are filled with zero so they contribute nothing to the sum.
 */
__global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
    __shared__ float tile_A[TILE_SIZE][TILE_SIZE];
    __shared__ float tile_B[TILE_SIZE][TILE_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    // Global column / row of the C element owned by this thread.
    const int gx = blockIdx.x * TILE_SIZE + tx;
    const int gy = blockIdx.y * TILE_SIZE + ty;

    // Number of TILE_SIZE-wide steps along the inner (k) dimension, rounded up.
    const int tile_num = (width + TILE_SIZE - 1) / TILE_SIZE;

    float sum = 0.0f;
    for (int i = 0; i < tile_num; ++i){
        // Inner-dimension indices this thread is responsible for loading.
        const int a_col = i * TILE_SIZE + tx;
        const int b_row = i * TILE_SIZE + ty;

        // Guarded loads: zero-pad whatever lies outside the matrices.
        tile_A[ty][tx] = (gy < height && a_col < width) ? A[gy * width + a_col] : 0.0f;
        tile_B[ty][tx] = (b_row < width && gx < width) ? B[b_row * width + gx] : 0.0f;

        // Make sure both tiles are fully loaded before any thread reads them.
        __syncthreads();

        for (int j = 0; j < TILE_SIZE; ++j){
            sum += tile_A[ty][j] * tile_B[j][tx];
        }

        // Make sure every thread is done with the tiles before the next
        // iteration overwrites them.
        __syncthreads();
    }

    // Only threads that map to a real element of C write a result.
    if (gx < width && gy < height){
        C[gy * width + gx] = sum;
    }
}

/*
 * Fills the first `size` elements of `data` with `val`.
 *
 * data : host buffer with room for at least `size` floats
 * size : number of elements to write; nothing is written when size <= 0
 * val  : value stored in every element
 */
void constantInit(float *data, int size, float val){
    // Nothing to do for a missing buffer.
    if (!data){
        return;
    }
    for (int i = 0; i < size; ++i){
        data[i] = val;
    }
}

// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed operation.
static void matrixMulCheckCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills A with 1.0 and B with 0.01, multiplies them on the GPU
 * with matrixMulKernel, copies the result back and verifies that every
 * element of C equals width * 0.01 within a relative tolerance.
 * Prints "Result = PASS" or "Result = FAIL".
 */
void matrixMul(){
    int dev_id = 0;
    matrixMulCheckCuda(cudaSetDevice(dev_id), "cudaSetDevice");

    // Allocate host memory for matrices A and B.
    // NOTE(review): the original dimension literals are missing from the
    // source; 1024 x 1024 is a placeholder - confirm the intended size.
    // The matrices must be square here: all three buffers share one size and
    // the kernel indexes B with `width` rows.
    int width = 1024;
    int height = 1024;
    unsigned int size = width * height;
    unsigned int mem_size = sizeof(float) * size;

    float *h_A = (float *)malloc(mem_size);
    float *h_B = (float *)malloc(mem_size);
    float *h_C = (float *)malloc(mem_size);
    if (!h_A || !h_B || !h_C){
        fprintf(stderr, "Failed to allocate host matrices\n");
        free(h_A);
        free(h_B);
        free(h_C);
        exit(EXIT_FAILURE);
    }

    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size, 1.0f);
    constantInit(h_B, size, valB);

    // Allocate device memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    matrixMulCheckCuda(cudaMalloc((void **)&d_A, mem_size), "cudaMalloc d_A");
    matrixMulCheckCuda(cudaMalloc((void **)&d_B, mem_size), "cudaMalloc d_B");
    matrixMulCheckCuda(cudaMalloc((void **)&d_C, mem_size), "cudaMalloc d_C");

    // Copy inputs to the device
    matrixMulCheckCuda(cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy h_A -> d_A");
    matrixMulCheckCuda(cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy h_B -> d_B");

    // Launch configuration: one TILE_SIZE x TILE_SIZE block per output tile,
    // grid rounded up so partial tiles at the edges are covered.
    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    matrixMulKernel <<<grid, block >>>(d_C, d_A, d_B, width, height);
    // A kernel launch returns no status itself; launch-configuration errors
    // have to be fetched explicitly.
    matrixMulCheckCuda(cudaGetLastError(), "matrixMulKernel launch");

    // Copy the result back. This blocking copy also surfaces any error
    // raised while the kernel was executing.
    matrixMulCheckCuda(cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy d_C -> h_C");

    // Check
    printf("Checking computed result for correctness: ");
    bool correct = true;

    // Test relative error by the formula
    //   |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|> < eps
    double eps = 1.e-6; // machine zero
    // Every element of C is a dot product of `width` terms, each 1.0 * valB.
    const double ref = width * valB;
    for (int i = 0; i < (int)(width * height); i++) {
        double abs_err = fabs(h_C[i] - ref);
        double dot_length = width;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;
        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], ref, eps);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Release device and host buffers.
    matrixMulCheckCuda(cudaFree(d_A), "cudaFree d_A");
    matrixMulCheckCuda(cudaFree(d_B), "cudaFree d_B");
    matrixMulCheckCuda(cudaFree(d_C), "cudaFree d_C");
    free(h_A);
    free(h_B);
    free(h_C);
}

合并访存：tile_A 转置后存入共享内存（tile_A[k][行]），tile_B 按原布局存入共享内存（tile_B[k][列]），内层循环中两个 tile 都沿第一维 j 遍历：sum += tile_A[j][ty] * tile_B[j][tx]

/*
 * Tiled matrix multiplication C = A * B using shared memory, with the A tile
 * stored transposed so that the inner loop walks both tiles along their
 * first index.
 *
 * Layout as indexed by this kernel (all row-major):
 *   A : height x width   (element (r, k) at A[r * width + k])
 *   B : width  x width   (element (k, c) at B[k * width + c])
 *   C : height x width   (element (r, c) at C[r * width + c])
 *
 * Launch requirements:
 *   - blockDim must be (TILE_SIZE, TILE_SIZE).
 *   - gridDim must cover ceil(width / TILE_SIZE) x ceil(height / TILE_SIZE).
 *   - Uses static shared memory for two tiles (tile_A carries one extra
 *     padding column).
 *
 * No thread returns before the loop: every thread of the block must reach
 * both __syncthreads() barriers in every iteration, and edge threads still
 * load their share of the tiles. Out-of-range entries are zero-filled.
 */
__global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
    // tile_A is written as tile_A[tx][ty], i.e. with a stride of one row
    // between neighbouring threads; the extra column of padding reduces the
    // shared-memory bank conflicts of that transposed store.
    __shared__ float tile_A[TILE_SIZE][TILE_SIZE + 1];
    __shared__ float tile_B[TILE_SIZE][TILE_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    // Global column / row of the C element owned by this thread.
    const int gx = blockIdx.x * TILE_SIZE + tx;
    const int gy = blockIdx.y * TILE_SIZE + ty;

    // Number of TILE_SIZE-wide steps along the inner (k) dimension, rounded up.
    const int tile_num = (width + TILE_SIZE - 1) / TILE_SIZE;

    float sum = 0.0f;
    for (int i = 0; i < tile_num; ++i){
        // Inner-dimension indices this thread is responsible for loading.
        const int a_col = i * TILE_SIZE + tx;
        const int b_row = i * TILE_SIZE + ty;

        // tile_A[k][row] holds A(row, k): stored transposed.
        tile_A[tx][ty] = (gy < height && a_col < width) ? A[gy * width + a_col] : 0.0f;
        // tile_B[k][col] holds B(k, col): stored as-is.
        tile_B[ty][tx] = (b_row < width && gx < width) ? B[b_row * width + gx] : 0.0f;

        // Make sure both tiles are fully loaded before any thread reads them.
        __syncthreads();

        for (int j = 0; j < TILE_SIZE; ++j){
            sum += tile_A[j][ty] * tile_B[j][tx];
        }

        // Make sure every thread is done with the tiles before the next
        // iteration overwrites them.
        __syncthreads();
    }

    // Only threads that map to a real element of C write a result.
    if (gx < width && gy < height){
        C[gy * width + gx] = sum;
    }
}

cuda编程-矩阵乘法（2）的更多相关文章

cuda编程-矩阵乘法（1）
本方法采用简单的单线程计算每组行和列乘加运算代码如下: #include <stdio.h> #include <stdlib.h> #include <iostrea ...
cuda(2) 矩阵乘法优化过程
Created on 2013-8-5URL : http://blog.sina.com.cn/s/blog_a502f1a30101mjch.html@author: zhxfl转载请说明出处 # ...
CUDA编程之快速入门
CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架构.做图像视觉领域的同学多多少少都会接触到CUDA,毕竟要做性能速度优化,CUDA是个很重要 ...
CUDA编程之快速入门【转】
https://www.cnblogs.com/skyfsm/p/9673960.html CUDA(Compute Unified Device Architecture)的中文全称为计算统一设备架 ...
详解CUDA编程
CUDA 是 NVIDIA 的 GPGPU 模型,它使用 C 语言为基础,可以直接以大多数人熟悉的 C 语言,写出在显示芯片上执行的程序,而不需要去学习特定的显示芯片的指令或是特殊的结构.” 编者注: ...
CUDA 矩阵乘法终极优化指南
作者:马骏 | 旷视 MegEngine 架构师前言单精度矩阵乘法(SGEMM)几乎是每一位学习 CUDA 的同学绕不开的案例,这个经典的计算密集型案例可以很好地展示 GPU 编程中常用的优化技巧 ...
OpenCL 矩阵乘法
▶ 矩阵乘法,按照书里的内容进行了几方面的优化,包括局部内存,矢量数据类型,寄存器,流水线等. ● 最直接的乘法.调用时 main.c 中使用 size_t globalSize[] = { rowA ...
【Cuda编程】加法归约
目录 cuda编程并行归约 AtomicAdd调用出错 gpu cpu下时间计算加法的归约矩阵乘法矩阵转置统计数目平方和求和分块处理线程相邻多block计算 cuda编程并行归约 At ...
CUDA编程（十）使用Kahan's Summation Formula提高精度
CUDA编程(十) 使用Kahan's Summation Formula提高精度上一次我们准备去并行一个矩阵乘法.然后我们在GPU上完毕了这个程序,当然是非常单纯的把任务分配给各个线程.也没有经过 ...

随机推荐

5239-回忆京都-洛谷3月赛gg祭
传送门题目背景第十五届东方人气投票音乐部门 106名第四次国内不知道东方的人对东方原曲的投票调查 51名回忆京都副歌我tm吹爆,东方文花帖我tm吹爆! 题目描述射命丸文在取材中发现了一个好 ...
win7 64位操作系统电脑桌面出现this computer is being attacked的窗口
本人为win7 64位操作系统,戴尔笔记本电脑. 昨天在教室写程序来着,突然桌面上出现this computer is being attacked的窗口,如下所示.每隔半分钟左右出现在电脑桌面上转两 ...
Intel发6款全新9代i9/i7/i5 CPU：巅峰8核
在旧金山举办的GDC19活动中,Intel正式发布9代酷睿新品,面向移动平台的H系列标压处理器,定于今年第二季度上市. 换言之,最快4月份我们就能见到搭载后缀H的9代酷睿CPU笔记本(游戏本)等发售了 ...
<网络编程>套接字介绍
1.端口:IANA(Internet Assigned Numbers Authority)维护着一个端口号分配状况的清单. 众所周知的端口(0-1023):由IANA分配和控制,可能的话,相同的端口 ...
编程&学习总结格式
编程&学习总结格式一.本周完成的作业: 题目1.A乘以B 题目内容描述:看我没骗你吧 -- 这是一道你可以在10秒内完成的题:给定两个绝对值不超过100的整数A和B,输出A乘以B的值. 1) ...
hackbar增强版 & 在Firefox上安装未通过验证的扩展
hackbar是Firefox的经典插件之一.介绍如下(懒得翻译了) This toolbar will help you in testing sql injections, XSS holes a ...
记一次项目上线后Log4j2不输出日志的坑
公司项目采用了Log4j2来输出日志,在开发环境和测试环境下均可以输出日志,但在生成环境就没有日志输出.开始毫无头绪,后来通过不断的排查,终于解决了这个问题.在此记录下该问题的解决过程,便于后 ...
UINavigationController - BNR
继续上篇UITableView的编辑操作. 当你初始化一个UINavigationController对象时,它将拥有一个根视图控制器,即UIViewController.根视图控制器一直存在于sta ...
任务调度工具Quartz入门笔记
一,导包 1)官网下载:http://www.quartz-scheduler.org/downloads/ 2)Maven <dependency> <groupId>org ...
R实战第十二篇：随机数
由R生成的随机数实际上伪随机数,也就是说,随机数是由某种算法而不是真正的随机过程产生的,随机数生成器需要一个初始值来生成数字,该初始值叫做种子.通过把种子设置为特定的值,可以保证每次运行同一段代码时都 ...

cuda编程-矩阵乘法（2）

cuda编程-矩阵乘法（2）的更多相关文章

随机推荐

热门专题