OpenACC 优化矩阵乘法
▶ 按书上的步骤使用不同的导语优化矩阵乘法
● 已经优化的代码
#include <iostream>
#include <cstdlib>
#include <chrono> #define SIZE 1024 using namespace std;
using namespace std::chrono; double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE]; // 四个数组放入 main 里会报错 Segmentation fault (core dumped) int main()
{
//int i, j, k; // ijk 和 tmp 在循环中使用时才声明会导致运行时间变长
double tmp; #pragma acc enter data create(a, b, c)
#pragma acc kernels present(a, b, c)
{
for (int i = ; i < SIZE; i++) // 初始化 ab
{
for (int j = ; j < SIZE; j++)
a[i][j] = (double)(i + j);
}
for (int i = ; i < SIZE; i++) // 初始化 ab
{
for (int j = ; j < SIZE; j++)
b[i][j] = (double)(i - j);
}
for (int i = ; i < SIZE; i++) // 每种方法前都要清空 c
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
}
} high_resolution_clock::time_point t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 1,每层循环都 auto
{
#pragma acc loop auto
for (int i = ; i < SIZE; i++)
{
#pragma acc loop auto
for (int j = ; j < SIZE; j++)
{
#pragma acc loop auto
for (int k = ; k < SIZE; k++)
c[i][j] += a[i][k] * b[k][j];
}
}
} high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<double> time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 2,外两层 independent,最里层串行
{
#pragma acc loop independent
for (int i = ; i < SIZE; i++)
{
#pragma acc loop independent
for (int j = ; j < SIZE; j++)
{
#pragma acc loop independent
for (int k = ; k < SIZE; k++)
c[i][j] += a[i][k] * b[k][j];
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 3,外两层 independent,最里层规约
{
#pragma acc loop independent
for (int i = ; i < SIZE; i++)
{
#pragma acc loop independent
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k = ; k < SIZE; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 4,手动指定 gang 和 vector
{
#pragma acc loop gang(32)
for (int i = ; i < SIZE; i++)
{
#pragma acc loop vector(16)
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k = ; k < SIZE; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 5,分块重排
{
#pragma acc loop tile(32, 32)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+ \
: tmp)
for (int k = ; k < SIZE; ++k)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - tile: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 6,合并多层迭代
{
#pragma acc loop collapse(2) independent
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k = ; k < SIZE; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count()); #pragma acc exit data copyout(a, b, c) #pragma omp parallel for shared(d)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
d[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma omp parallel for default(none) shared(a, b, d) // 使用 OpenMP
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
{
for (int k = ; k < SIZE; k++)
d[i][j] += a[i][k] * b[k][j];
}
}
t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenMP: %.6lf s.\n\n", time.count()); for (int i = ; i < SIZE; i++) // 检查结果
{
for (int j = ; j < SIZE; j++)
{
if (c[i][j] != d[i][j])
printf("\nError at [%d, %d],c = %f d = %f \n", i, j, c[i][j], d[i][j]);
}
}
return ;
}
● 输出结果(数据管理优化前)
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe Time OpenACC - Auto: 4.589736 s. Time OpenACC - Independent Seq: 4.823721 s. Time OpenACC - Independent Reduction: 3.669336 s. Time OpenACC - Gang Vector: 3.611391 s. Time OpenACC - tile: 3.609573 s. Time OpenACC - Collapse: 3.605792 s. Time OpenMP: 4.345018 s.
● 输出结果(数据管理优化后)
cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++ -acc -mp -Minfo -o main.exe
main:
, include "chrono"
, include "chrono"
, Parallel region activated
, Parallel region terminated
, Parallel region activated
, Parallel region terminated
, Generating enter data create(b[:][:],c[:][:],a[:][:])
Generating present(a[:][:],b[:][:],c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Complex loop carried dependence of c prevents parallelization
Loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
Inner sequential loop scheduled on accelerator
Generating Tesla code
, #pragma acc loop gang /* blockIdx.y */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Complex loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang /* blockIdx.z */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop gang /* blockIdx.y */
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, FMA (fused multiply-add) instruction(s) generated
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x tiled */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x collapsed */
, #pragma acc loop seq
, Loop is parallelizable
, Generating exit data copyout(c[:][:],b[:][:],a[:][:])
Parallel loop activated with static block schedule
, Memory zero idiom, loop replaced by call to __c_mzero8
, Barrier
, Parallel loop activated with static block schedule
FMA (fused multiply-add) instruction(s) generated
, Barrier
cuan@CUAN:~/acc$ ./main.exe
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024 block=
Time OpenACC - Auto: 0.018726 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024x4 block=
Time OpenACC - Independent Seq: 0.040719 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
Time OpenACC - Independent Reduction: 0.012491 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=64x32 block=16x8
Time OpenACC - Gang Vector: 0.012314 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - tile: 0.013609 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - Collapse: 0.012676 s. Time OpenMP: 0.504436 s. Accelerator Kernel Timing data
/home/cuan/acc/main.cpp
main NVIDIA devicenum=
time(us): ,
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024x4] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [64x32] block: [16x8]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: data region reached time
: data copyout transfers:
device time(us): total=, max= min= avg=
OpenACC 优化矩阵乘法的更多相关文章
- Strassen优化矩阵乘法(复杂度O(n^lg7))
按照算法导论写的 还没有测试复杂度到底怎么样 不过这个真的很卡内存,挖个坑,以后写空间优化 还有Matthew Anderson, Siddharth Barman写了一个关于矩阵乘法的论文 < ...
- poj3613:Cow Relays(倍增优化+矩阵乘法floyd+快速幂)
Cow Relays Time Limit: 1000MS Memory Limit: 65536K Total Submissions: 7825 Accepted: 3068 Descri ...
- 利用Cayley-Hamilton theorem 优化矩阵线性递推
平时有关线性递推的题,很多都可以利用矩阵乘法来解决. 时间复杂度一般是O(K3logn)因此对矩阵的规模限制比较大. 下面介绍一种利用利用Cayley-Hamilton theorem加速矩阵乘法的方 ...
- [BZOJ 1009] [HNOI2008] GT考试 【AC自动机 + 矩阵乘法优化DP】
题目链接:BZOJ - 1009 题目分析 题目要求求出不包含给定字符串的长度为 n 的字符串的数量. 既然这样,应该就是 KMP + DP ,用 f[i][j] 表示长度为 i ,匹配到模式串第 j ...
- bzoj 3240: [Noi2013]矩阵游戏 矩阵乘法+十进制快速幂+常数优化
3240: [Noi2013]矩阵游戏 Time Limit: 10 Sec Memory Limit: 256 MBSubmit: 613 Solved: 256[Submit][Status] ...
- HDU 4914 Linear recursive sequence(矩阵乘法递推的优化)
题解见X姐的论文 矩阵乘法递推的优化.仅仅是mark一下. .
- [转]OpenBLAS项目与矩阵乘法优化
课程内容 OpenBLAS项目介绍 矩阵乘法优化算法 一步步调优实现 以下为公开课完整视频,共64分钟: 以下为公开课内容的文字及 PPT 整理. 雷锋网的朋友们大家好,我是张先轶,今天主要介绍一下我 ...
- 矩阵乘法优化DP
本文讲一下一些基本的矩阵优化DP的方法技巧. 定义三个矩阵A,B,C,其中行和列分别为$m\times n,n \times p,m\times p$,(其中行是从上往下数的,列是从左往右数的) $C ...
- 【BZOJ 3326】[Scoi2013]数数 数位dp+矩阵乘法优化
挺好的数位dp……先说一下我个人的做法:经过观察,发现这题按照以往的思路从后往前递增,不怎么好推,然后我就大胆猜想,从前往后推,发现很好推啊,维护四个变量,从开始位置到现在有了i个数 f[i]:所有数 ...
随机推荐
- rm: 无法删除"xxxx.dir": 是一个目录
rm命令 -f:在删除过程中不给任何指示,直接删除. -r:将参数中列出的全部目录和子目录都递归地删除. -i:与-f选项相反,交互式删除,在删除每个文件时都给出提示. 删除文件可以直接使用rm命令, ...
- java8 流操作 好文网址
api 各方法详解(很不错!) http://blog.51cto.com/turnsole/2093185 api 各方法 简介: https://www.cnblogs.com/guguli/p/ ...
- 修改ThinkPHP缓存为Memcache的方法
一般来说,ThinkPHP的默认缓存方式是以File文件方式实现的,运行时会在/Runtime/Temp 下生成很多的缓存文件. 有的情况下服务器装了memcached之后,需要将ThinkPHP的缓 ...
- Tomcat:Several ports are already in use问题
Several ports (8005, 8080, 8009) required by Tomcat v6.0 Server at localhost are already in use. The ...
- pow 的使用和常见问题
版权声明:本文为博主原创文章,未经博主同意不得转载. https://blog.csdn.net/menxu_work/article/details/24540045 1.安装: $ curl ge ...
- Git的一些基本操作和命令
1.创建版本库 在D盘中打开Git Bash Here 1.1.创建一个空目录 Mkdir gitDemo –创建一个目录 Cd gitDemo --进入gitDemo目录 Pwd –查看当前目录的 ...
- FastAdmin 开发时对数据库进行版本管理 (非 think-migration)
因为开必项目,暂时还不没用 think-migration,先用 脚本处理. 在导出 SQL 时将相关字段数据还原,比如 admin logitime updatetime token. 把 admi ...
- Linux elasticsearch 安装 遇到的问题
备注:我的 Linux 测试机 是2G 内存的 ,估计内存小于 我的内存肯定会出这个问题 .(安装的最新版6.3.2) 1. 下载文件 解压 2 .试着 运行 bin 下面的 elasticse ...
- php curl文件上传兼容php5.0~5.6各版本
PHP 5.0~5.6 各版本兼容的cURL文件上传 最近做的一个需求,使用PHP cURL上传文件.踩坑若干,整理如下. 不同版本PHP之间cURL的区别 PHP的cURL支持通过给CURL_POS ...
- jenkins进阶-集成钉钉机器人(6)
最早做Jenkins发布完成以后通过邮件发送信息通知相关的联系人,发现邮件会受限于大家接收的设置,导致不能及时的看到相关的发布内容,公司使用钉钉做为公司内部的通讯工具,所以想通过Jenkins发布完成 ...