矩阵乘法,使用一维线程块和共享内存。并且在静态代码和运行时编译两种条件下使用。

▶ 源代码:静态使用

 #include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h> template <int BLOCK_SIZE> __global__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y; int aBegin = wA * BLOCK_SIZE * by; // A的行程起点
int aEnd = aBegin + wA - ; // A的行程终点
int aStep = BLOCK_SIZE; // A的跨度(一个 block 为宽 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素)
int bBegin = BLOCK_SIZE * bx; // B的行程起点
int bStep = BLOCK_SIZE * wB; // B的跨度(一个 block 为高 BLOCK_SIZE 的一维条带,各线程分别对应其中的一个元素)
float Csub = ; for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
__syncthreads(); #pragma unroll// 循环展开为 BLOCK_SIZE 个赋值语句,提高效率
for (int k = ; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
__syncthreads();
} int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
} void constantInit(float *data, int size, float val)
{
for (int i = ; i < size; ++i)
data[i] = val;
} int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, 0.01f);
dim3 dimsC(dimsB.x, dimsA.y, );
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *) malloc(mem_size_C);
float *d_A, *d_B, *d_C;
cudaMalloc((void **) &d_A, mem_size_A);
cudaMalloc((void **) &d_B, mem_size_B);
cudaMalloc((void **) &d_C, mem_size_C);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); // 热身
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
if (block_size == )
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
else
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
printf("done\n");
cudaDeviceSynchronize(); printf("Computing result using CUDA Kernel...\n");
cudaEvent_t start;
cudaEventCreate(&start);
cudaEvent_t stop;
cudaEventCreate(&stop);
cudaEventRecord(start, NULL); int nIter = ;
for (int j = ; j < nIter; j++)
{
if (block_size == )
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
else
matrixMulCUDA<><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
}
cudaEventRecord(stop, NULL);
cudaEventSynchronize(stop); float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); // 检查结果,要求相对误差:|<x, y>_cpu - <x,y>_gpu| / <|x|, |y|> < eps
printf("Checking computed result for correctness: ");
bool correct = true;
double eps = .e- ; // machine zero
for (int i = ; i < (int)(dimsC.x * dimsC.y); i++)
{
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err/abs_val/dot_length ;
if (rel_err > eps)
{
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
if (correct)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n"); if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
{
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS);
} int devID = ;// 指定设备,默认用0号设备
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
cudaSetDevice(devID);
}
cudaDeviceProp deviceProp;
cudaGetDevice(&devID);
cudaGetDeviceProperties(&deviceProp, devID); if (deviceProp.computeMode == cudaComputeModeProhibited)
{
fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
exit(EXIT_SUCCESS);
} int block_size = (deviceProp.major < ) ? : ; dim3 dimsA(**block_size, **block_size, );
dim3 dimsB(**block_size, **block_size, ); // 使用命令行指定的A、B的维度参数
if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
if (dimsA.x != dimsB.y)
{
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); getchar();
exit(matrix_result);
}

▶ 源代码:运行时编译

 /*matrixMul_kernel.cu*/
template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int aBegin = wA * BLOCK_SIZE * by;
int aEnd = aBegin + wA - ;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * wB;
float Csub = ;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
__syncthreads();
#pragma unroll
for (int k = ; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
__syncthreads();
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
} extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<>(C,A,B,wA,wB);
} extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<>(C,A,B,wA,wB);
}
 /*matrixMul.cpp*/
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "nvrtc_helper.h"
#include <helper_functions.h> void constantInit(float *data, int size, float val)
{
for (int i = ; i < size; ++i)
data[i] = val;
} int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
// Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
CUdeviceptr d_A, d_B, d_C; char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[]);
compileFileToPTX(kernel_file, , NULL, &ptx, &ptxSize);
CUmodule module = loadPTX(ptx, argc, argv); dim3 dimsC(dimsB.x, dimsA.y, );
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *) malloc(mem_size_C);
cuMemAlloc(&d_A, mem_size_A);
cuMemAlloc(&d_B, mem_size_B);
cuMemAlloc(&d_C, mem_size_C);
cuMemcpyHtoD(d_A, h_A, mem_size_A);
cuMemcpyHtoD(d_B, h_B, mem_size_B); dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); printf("Computing result using CUDA Kernel...\n"); CUfunction kernel_addr;
if (block_size == )
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16");
else
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"); void *arr[] = { (void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x }; // Execute the kernel
int nIter = ; for (int j = ; j < nIter; j++)
{
cuLaunchKernel(kernel_addr,
grid.x, grid.y, grid.z,
threads.x, threads.y, threads.z,
, , &arr[], );
cuCtxSynchronize();
}
cuMemcpyDtoH(h_C, d_C, mem_size_C); printf("Checking computed result for correctness: ");
bool correct = true;
double eps = .e- ;
for (int i = ; i < (int)(dimsC.x * dimsC.y); i++)
{
double abs_err = fabs(h_C[i] - (dimsA.x * valB);
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err/abs_val/dot_length ;
if (rel_err > eps)
{
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
free(h_A);
free(h_B);
free(h_C);
cuMemFree(d_A);
cuMemFree(d_B);
cuMemFree(d_C);
if (correct)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
} int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n"); if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?"))
{
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS);
} int block_size = ;
dim3 dimsA(**block_size, **block_size, );
dim3 dimsB(**block_size, **block_size, ); if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
if (dimsA.x != dimsB.y)
{
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
} exit(EXIT_FAILURE);
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); getchar();
exit(matrix_result);
}

▶ 输出结果:

[Matrix Multiply Using CUDA] - Starting...
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 MatrixA(320,320), MatrixB(640,320)
Computing result using CUDA Kernel...
done
Performance= 22.95 GFlop/s, Time= 5.712 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ 涨姿势:

● 程序写得很烂,各种声明、初始化杂糅。

● 一个根据cuda错误种类返回错误描述的函数

extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);

● 预编译命令展开循环

 #pragma unroll
for (i = 0; i < m; i++)
    c[i] = a[i] + b[i];

等价于

 c[0] = a[0] + b[0];
c[1] = a[1] + b[1];
c[2] = a[2] + b[2];
...
c[m-1] = a[m-1] + b[m-1];

#pragma unroll 命令后面可接数字,表明展开前多少次迭代,例如 #pragma unroll 4

● 核函数泛型编程。可以在调用核函数时传入一个常量参数,变相使用动态数组来规定共享内存等数组的大小。

 template <int BLOCK_SIZE> __global__ void functionName(void)
{
    __shared__ int shareArray[BLOCK_SIZE];
    ...
} functionName<BLOCK_SIZE> <<< blocksize, threadsize >>> ();

● 热身,在多次重复实验前提前算一次。对缓存有帮助,有效减小实验结果(计算耗时)的方差。

0_Simple__matrixMul + 0_Simple__matrixMul_nvrtc的更多相关文章

随机推荐

  1. webstorm配置scss环境

    1.下载 Ruby  (安装过程中记得勾选添加到环境变量,安装结束最后可能会弹出一个cmd弹框,可以忽略) 2. cmd安装sass gem install sass 3. cmd检查是否安装 sas ...

  2. 如何从两个List中筛选出相同的值

    问题 现有社保卡和身份证若干,想要匹配筛选出一一对应的社保卡和身份证. 转换为List socialList,和List idList,从二者中找出匹配的社保卡. 模型 创建社保卡类 /** * @a ...

  3. Qt 打开文件的默认路径 QFileDialog::getOpenFileName()

    为了说明QFileDialog::getOpenFileName()函数的用法,还是先把函数签名放在这里:   QString QFileDialog::getOpenFileName (       ...

  4. Chinese Rings hdu 2842 矩阵快速幂

    Chinese Rings Time Limit: 2000/1000 MS (Java/Others)    Memory Limit: 32768/32768 K (Java/Others)Tot ...

  5. python中with学习

    python中with是非常强大的一个管理器,我个人的理解就是,我们可以通过在我们的类里面自定义enter(self)和exit(self,err_type,err_value,err_tb)这两个内 ...

  6. 苹果iPhone X上搭载的那颗A11仿生芯片,到底牛在哪?

    苹果iPhone X上搭载的那颗A11仿生芯片,到底牛在哪? 上周,苹果公司在刚刚落成投入使用的“飞船”新总部(Apple Park)举行2017年秋季新品发布会,整场发布会基本被iPhone X抢尽 ...

  7. Windows+Apache2.4.10+PHP7.0+MySQL5.6.21安装

    一.安装包下载 apache2.4.10 http://www.apachelounge.com/download/win64/ PHP7.0.7 http://windows.php.net/dow ...

  8. SQLServer中SQL语句与可执行二进制语句

    SQLServer可以执行正常SQL语句也可以执行被转换的二进制语句,一般会用此方法进行数据库注入操作,骗过基本的字符过滤 --将二进制格式转为普通SQL语句 ) = 0x53454C45435420 ...

  9. (转)TabIndex 属性

    html中的tabIndex属性可以设置键盘中的TAB键在控件中的移动顺序,即焦点的顺序.   把控件的tabIndex属性设成1到32767的一个值,就可以把这个控件加入到TAB键的序列中.   这 ...

  10. web容器启动后自动执行程序的几种方式比较

    1.       背景 1.1.       背景介绍 在web项目中我们有时会遇到这种需求,在web项目启动后需要开启线程去完成一些重要的工作,例如:往数据库中初始化一些数据,开启线程,初始化消息队 ...