矩阵乘法,使用一维线程块和共享内存。并且在静态代码和运行时编译两种条件下使用。

▶ 源代码:静态使用

 #include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h> template <int BLOCK_SIZE> __global__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y; int aBegin = wA * BLOCK_SIZE * by; // A的行程起点
int aEnd = aBegin + wA - ; // A的行程终点
int aStep = BLOCK_SIZE; // A的跨度(一个 block 为宽 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素)
int bBegin = BLOCK_SIZE * bx; // B的行程起点
int bStep = BLOCK_SIZE * wB; // B的跨度(一个 block 为高 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素)
float Csub = ; for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
__syncthreads(); #pragma unroll// 循环展开为 BLOCK_SIZE 个赋值语句,提高效率
for (int k = ; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
__syncthreads();
} int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
} void constantInit(float *data, int size, float val)
{
for (int i = ; i < size; ++i)
data[i] = val;
} int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, 0.01f);
dim3 dimsC(dimsB.x, dimsA.y, );
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *) malloc(mem_size_C);
float *d_A, *d_B, *d_C;
cudaMalloc((void **) &d_A, mem_size_A);
cudaMalloc((void **) &d_B, mem_size_B);
cudaMalloc((void **) &d_C, mem_size_C);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); // 热身
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
if (block_size == )
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
else
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
printf("done\n");
cudaDeviceSynchronize(); printf("Computing result using CUDA Kernel...\n");
cudaEvent_t start;
cudaEventCreate(&start);
cudaEvent_t stop;
cudaEventCreate(&stop);
cudaEventRecord(start, NULL); int nIter = ;
for (int j = ; j < nIter; j++)
{
if (block_size == )
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
else
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
}
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop); float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); // 检查结果,要求相对误差:|<x, y>_cpu - <x,y>_gpu| / <|x|, |y|> < eps
printf("Checking computed result for correctness: ");
bool correct = true;
double eps = .e- ; // machine zero
for (int i = ; i < (int)(dimsC.x * dimsC.y); i++)
{
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err/abs_val/dot_length ;
if (rel_err > eps)
{
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
if (correct)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n"); if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
{
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS);
} int devID = ;// 指定设备,默认用0号设备
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
cudaSetDevice(devID);
}
cudaDeviceProp deviceProp;
cudaGetDevice(&devID);
cudaGetDeviceProperties(&deviceProp, devID); if (deviceProp.computeMode == cudaComputeModeProhibited)
{
fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
exit(EXIT_SUCCESS);
} int block_size = (deviceProp.major < ) ? : ; dim3 dimsA(**block_size, **block_size, );
dim3 dimsB(**block_size, **block_size, ); // 使用命令行指定的A、B的维度参数
if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
if (dimsA.x != dimsB.y)
{
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); getchar();
exit(matrix_result);
}

▶ 源代码:运行时编译

 /*matrixMul_kernel.cu*/
template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int aBegin = wA * BLOCK_SIZE * by;
int aEnd = aBegin + wA - ;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * wB;
float Csub = ;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
__syncthreads();
#pragma unroll
for (int k = ; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
__syncthreads();
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
} extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<>(C,A,B,wA,wB);
} extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<>(C,A,B,wA,wB);
}
 /*matrixMul.cpp*/
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h> void constantInit(float *data, int size, float val)
{
for (int i = ; i < size; ++i)
data[i] = val;
} int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
// Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
CUdeviceptr d_A, d_B, d_C; char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[]);
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize);
CUmodule module = loadPTX(ptx, argc, argv); dim3 dimsC(dimsB.x, dimsA.y, );
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *) malloc(mem_size_C);
cuMemAlloc(&d_A, mem_size_A);
cuMemAlloc(&d_B, mem_size_B);
cuMemAlloc(&d_C, mem_size_C);
cuMemcpyHtoD(d_A, h_A, mem_size_A);
cuMemcpyHtoD(d_B, h_B, mem_size_B); dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); printf("Computing result using CUDA Kernel...\n"); CUfunction kernel_addr;
if (block_size == )
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16");
else
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"); void *arr[] = { (void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x }; // Execute the kernel
int nIter = ; for (int j = ; j < nIter; j++)
{
cuLaunchKernel(kernel_addr,
grid.x, grid.y, grid.z,
threads.x, threads.y, threads.z,
, , &arr[], );
cuCtxSynchronize();
}
cuMemcpyDtoH(h_C, d_C, mem_size_C); printf("Checking computed result for correctness: ");
bool correct = true;
double eps = .e- ;
for (int i = ; i < (int)(dimsC.x * dimsC.y); i++)
{
double abs_err = fabs(h_C[i] - (dimsA.x * valB);
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err/abs_val/dot_length ;
if (rel_err > eps)
{
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
free(h_A);
free(h_B);
free(h_C);
cuMemFree(d_A);
cuMemFree(d_B);
cuMemFree(d_C);
if (correct)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n"); if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
{
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS);
} int block_size = ;
dim3 dimsA(**block_size, **block_size, );
dim3 dimsB(**block_size, **block_size, ); if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
if (dimsA.x != dimsB.y)
{
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
} exit(EXIT_FAILURE);
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); getchar();
exit(matrix_result);
}

▶ 输出结果:

[Matrix Multiply Using CUDA] - Starting...
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 MatrixA(320,320), MatrixB(640,320)
Computing result using CUDA Kernel...
done
Performance= 22.95 GFlop/s, Time= 5.712 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ 涨姿势:

● 程序写得很烂,各种声明、初始化杂糅。

● 一个根据cuda错误种类返回错误描述的函数

extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);

● 预编译命令展开循环

 #pragma unroll
for (i = 0; i < m; i++)
    c[i] = a[i] + b[i];

等价于

 c[0] = a[0] + b[0];
c[1] = a[1] + b[1];
c[2] = a[2] + b[2];
...
c[m-1] = a[m-1] + b[m-1];

#pragma unroll 命令后面可接数字,表明展开前多少次迭代,例如 #pragma unroll 4

● 核函数泛型编程。可以在调用核函数时传入一个常量参数,变相使用动态数组来规定共享内存等数组的大小。

 template <int BLOCK_SIZE> __global__ void functionName(void)
{
    __shared__ int shareArray[BLOCK_SIZE];
    ...
} functionName<BLOCK_SIZE> <<< blocksize, threadsize >>> ();

● 热身,在多次重复实验前提前算一次。对缓存有帮助,有效减小实验结果(计算耗时)的方差。

0_Simple__matrixMul + 0_Simple__matrixMul_nvrtc的更多相关文章

随机推荐

  1. webstorm配置scss环境

    1.下载 Ruby  (安装过程中记得勾选添加到环境变量,安装结束最后可能会弹出一个cmd弹框,可以忽略) 2. cmd安装sass gem install sass 3. cmd检查是否安装 sas ...

  2. 如何从两个List中筛选出相同的值

    问题 现有社保卡和身份证若干,想要匹配筛选出一一对应的社保卡和身份证. 转换为List socialList,和List idList,从二者中找出匹配的社保卡. 模型 创建社保卡类 /** * @a ...

  3. Qt 打开文件的默认路径 QFileDialog::getOpenFileName()

    为了说明QFileDialog::getOpenFileName()函数的用法,还是先把函数签名放在这里:   QString QFileDialog::getOpenFileName (       ...

  4. Chinese Rings hdu 2842 矩阵快速幂

    Chinese Rings Time Limit: 2000/1000 MS (Java/Others)    Memory Limit: 32768/32768 K (Java/Others)Tot ...

  5. python中with学习

    python中with是非常强大的一个管理器,我个人的理解就是,我们可以通过在我们的类里面自定义enter(self)和exit(self,err_type,err_value,err_tb)这两个内 ...

  6. 苹果iPhone X上搭载的那颗A11仿生芯片,到底牛在哪?

    苹果iPhone X上搭载的那颗A11仿生芯片,到底牛在哪? 上周,苹果公司在刚刚落成投入使用的“飞船”新总部(Apple Park)举行2017年秋季新品发布会,整场发布会基本被iPhone X抢尽 ...

  7. Windows+Apache2.4.10+PHP7.0+MySQL5.6.21安装

    一.安装包下载 apache2.4.10 http://www.apachelounge.com/download/win64/ PHP7.0.7 http://windows.php.net/dow ...

  8. SQLServer中SQL语句与可执行二进制语句

    SQLServer可以执行正常SQL语句也可以执行被转换的二进制语句,一般会用此方法进行数据库注入操作,骗过基本的字符过滤 --将二进制格式转为普通SQL语句 ) = 0x53454C45435420 ...

  9. (转)TabIndex 属性

    html中的tabIndex属性可以设置键盘中的TAB键在控件中的移动顺序,即焦点的顺序.   把控件的tabIndex属性设成1到32767的一个值,就可以把这个控件加入到TAB键的序列中.   这 ...

  10. web容器启动后自动执行程序的几种方式比较

    1.       背景 1.1.       背景介绍 在web项目中我们有时会遇到这种需求,在web项目启动后需要开启线程去完成一些重要的工作,例如:往数据库中初始化一些数据,开启线程,初始化消息队 ...