▶ 按书上的步骤使用不同的导语优化矩阵乘法

● 已经优化的代码

 #include <iostream>
#include <cstdlib>
#include <cstdio>
#include <chrono>

#define SIZE 1024

using namespace std;
using namespace std::chrono;

// Operands (a, b), the OpenACC result (c) and the OpenMP reference result (d).
// Original author's note: the four arrays are kept at namespace scope because
// declaring them inside main() crashed with "Segmentation fault (core dumped)".
double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE];

// Benchmarks several OpenACC loop-scheduling variants of a SIZE x SIZE matrix
// multiplication (c = a * b) against an OpenMP version (d = a * b), printing
// the wall-clock time of each variant and finally comparing c with d.
int main()
{
    // Original author's note: declaring the loop scalars (i, j, k, tmp) at the
    // point of use inside the loops was observed to make the run slower; tmp
    // is therefore declared once here.
    double tmp;

    // Allocate a, b, c on the device once; every kernels region below relies on
    // them already being present, so no per-region transfers take place.
#pragma acc enter data create(a, b, c)

#pragma acc kernels present(a, b, c)
    {
        for (int i = 0; i < SIZE; i++) // initialise a
        {
            for (int j = 0; j < SIZE; j++)
                a[i][j] = (double)(i + j);
        }
        for (int i = 0; i < SIZE; i++) // initialise b
        {
            for (int j = 0; j < SIZE; j++)
                b[i][j] = (double)(i - j);
        }
        for (int i = 0; i < SIZE; i++) // c must be cleared before every method
        {
            for (int j = 0; j < SIZE; j++)
                c[i][j] = 0.0;
        }
    }

    // Method 1: "auto" on every loop level (the compiler decides).
    high_resolution_clock::time_point t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop auto
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop auto
            for (int j = 0; j < SIZE; j++)
            {
#pragma acc loop auto
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Auto: %.6lf s.\n\n", elapsed.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // Method 2: outer two loops independent, innermost loop sequential.
    // The k loop accumulates into c[i][j], so its iterations are NOT
    // independent; it must be marked seq, otherwise the updates race.
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
#pragma acc loop seq
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", elapsed.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // Method 3: outer two loops independent, innermost loop as a reduction.
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", elapsed.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // Method 4: gang and vector sizes specified by hand.
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop gang(32)
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop vector(16)
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", elapsed.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // Method 5: tiling (blocked reordering) of the i/j loop nest.
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop tile(32, 32)
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; ++k)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - tile: %.6lf s.\n\n", elapsed.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // Method 6: collapse the two outer loops into a single iteration space.
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop collapse(2) independent
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Collapse: %.6lf s.\n\n", elapsed.count());

    // Bring the device copies back to the host; the OpenMP run below reads a
    // and b on the host and the final check reads c.
#pragma acc exit data copyout(a, b, c)

#pragma omp parallel for shared(d)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            d[i][j] = 0.0;
    }

    // Reference implementation using OpenMP on the host.
    t1 = high_resolution_clock::now();
#pragma omp parallel for default(none) shared(a, b, d)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
        {
            for (int k = 0; k < SIZE; k++)
                d[i][j] += a[i][k] * b[k][j];
        }
    }
    t2 = high_resolution_clock::now();
    elapsed = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenMP: %.6lf s.\n\n", elapsed.count());

    // Verify the result: c (left by the last OpenACC method) against d.
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
        {
            if (c[i][j] != d[i][j])
                printf("\nError at [%d, %d],c = %f d = %f \n", i, j, c[i][j], d[i][j]);
        }
    }
    return 0;
}

● 输出结果(数据管理优化前)

cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Time OpenACC - Auto: 4.589736 s.

Time OpenACC - Independent Seq: 4.823721 s.

Time OpenACC - Independent Reduction: 3.669336 s.

Time OpenACC - Gang Vector: 3.611391 s.

Time OpenACC - tile: 3.609573 s.

Time OpenACC - Collapse: 3.605792 s.

Time OpenMP: 4.345018 s.

● 输出结果(数据管理优化后)

cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++ -acc -mp -Minfo -o main.exe
main:
, include "chrono"
, include "chrono"
, Parallel region activated
, Parallel region terminated
, Parallel region activated
, Parallel region terminated
, Generating enter data create(b[:][:],c[:][:],a[:][:])
Generating present(a[:][:],b[:][:],c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Complex loop carried dependence of c prevents parallelization
Loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
Inner sequential loop scheduled on accelerator
Generating Tesla code
, #pragma acc loop gang /* blockIdx.y */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Complex loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang /* blockIdx.z */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop gang /* blockIdx.y */
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, FMA (fused multiply-add) instruction(s) generated
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x tiled */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x collapsed */
, #pragma acc loop seq
, Loop is parallelizable
, Generating exit data copyout(c[:][:],b[:][:],a[:][:])
Parallel loop activated with static block schedule
, Memory zero idiom, loop replaced by call to __c_mzero8
, Barrier
, Parallel loop activated with static block schedule
FMA (fused multiply-add) instruction(s) generated
, Barrier
cuan@CUAN:~/acc$ ./main.exe
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024 block=
Time OpenACC - Auto: 0.018726 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024x4 block=
Time OpenACC - Independent Seq: 0.040719 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
Time OpenACC - Independent Reduction: 0.012491 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=64x32 block=16x8
Time OpenACC - Gang Vector: 0.012314 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - tile: 0.013609 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - Collapse: 0.012676 s.

Time OpenMP: 0.504436 s.

Accelerator Kernel Timing data
/home/cuan/acc/main.cpp
main NVIDIA devicenum=
time(us): ,
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024x4] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [64x32] block: [16x8]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: data region reached time
: data copyout transfers:
device time(us): total=, max= min= avg=

OpenACC 优化矩阵乘法的更多相关文章

  1. Strassen优化矩阵乘法(复杂度O(n^lg7))

    按照算法导论写的 还没有测试复杂度到底怎么样 不过这个真的很卡内存,挖个坑,以后写空间优化 还有Matthew Anderson, Siddharth Barman写了一个关于矩阵乘法的论文 < ...

  2. poj3613:Cow Relays(倍增优化+矩阵乘法floyd+快速幂)

    Cow Relays Time Limit: 1000MS   Memory Limit: 65536K Total Submissions: 7825   Accepted: 3068 Descri ...

  3. 利用Cayley-Hamilton theorem 优化矩阵线性递推

    平时有关线性递推的题,很多都可以利用矩阵乘法来解决. 时间复杂度一般是O(K3logn)因此对矩阵的规模限制比较大. 下面介绍一种利用利用Cayley-Hamilton theorem加速矩阵乘法的方 ...

  4. [BZOJ 1009] [HNOI2008] GT考试 【AC自动机 + 矩阵乘法优化DP】

    题目链接:BZOJ - 1009 题目分析 题目要求求出不包含给定字符串的长度为 n 的字符串的数量. 既然这样,应该就是 KMP + DP ,用 f[i][j] 表示长度为 i ,匹配到模式串第 j ...

  5. bzoj 3240: [Noi2013]矩阵游戏 矩阵乘法+十进制快速幂+常数优化

    3240: [Noi2013]矩阵游戏 Time Limit: 10 Sec  Memory Limit: 256 MBSubmit: 613  Solved: 256[Submit][Status] ...

  6. HDU 4914 Linear recursive sequence(矩阵乘法递推的优化)

    题解见X姐的论文 矩阵乘法递推的优化.仅仅是mark一下. .

  7. [转]OpenBLAS项目与矩阵乘法优化

    课程内容 OpenBLAS项目介绍 矩阵乘法优化算法 一步步调优实现 以下为公开课完整视频,共64分钟: 以下为公开课内容的文字及 PPT 整理. 雷锋网的朋友们大家好,我是张先轶,今天主要介绍一下我 ...

  8. 矩阵乘法优化DP

    本文讲一下一些基本的矩阵优化DP的方法技巧. 定义三个矩阵A,B,C,其中行和列分别为$m\times n,n \times p,m\times p$,(其中行是从上往下数的,列是从左往右数的) $C ...

  9. 【BZOJ 3326】[Scoi2013]数数 数位dp+矩阵乘法优化

    挺好的数位dp……先说一下我个人的做法:经过观察,发现这题按照以往的思路从后往前递增,不怎么好推,然后我就大胆猜想,从前往后推,发现很好推啊,维护四个变量,从开始位置到现在有了i个数 f[i]:所有数 ...

随机推荐

  1. CTF之ROT加解密

    常见的ROT加密包括ROT5,ROT13,ROT18,ROT47 ROT5:只是对数字进行编码.用当前数字往后数的第五个数字替换当前数字: 例:123sb——>678sb ROT13:只是对字母 ...

  2. idea快捷键 好的网址收藏

    http://blog.csdn.net/u010800804/article/details/48491395http://blog.csdn.net/wei83523408/article/det ...

  3. ssh 免 密码登录另一台机器 和 secureCRT的乱码问题

    PS: 就是你把密钥生成好以后,放入B机器中,再登录的时候就已经有了所以就不用验证了 ========================================================= ...

  4. ehci及其伴随ohci主机控制器驱动分析

    1. 正常插入 插上U盘产生中断调用usb_hcd_irq: usb_hcd_irq ehci_irq usb_hcd_resume_root_hub queue_work(pm_wq, &h ...

  5. GOOGLE高级搜索的秘籍

    一.摘要 本文内容来源自互联网,全面的介绍Google搜索的各种功能和技巧. 二.GOOGLE简介 Google(http://www.google.com/)是一个搜索引擎,由两个斯坦福大学博士生L ...

  6. stty(set tty)

    tty [ -a ] [ -g ] [ Options ]   stty(set tty)命令用于显示和修改当前注册的终端的属性. UNIX系统为键盘的输入和终端的输出提供了重要的控制手段,可以通过s ...

  7. Linux下载

    免费下载地址在 http://linux.linuxidc.com/ 用户名与密码都是www.linuxidc.com 下载方法见 http://www.linuxidc.com/Linux/2013 ...

  8. 关于 Vue 方法前面的美元符号

    关于 Vue 方法前面的美元符号 学到这一段,不明白什么意思,然后手贱把 $ 删除了,出现未定义方法. vm.$watch('counter', function(nval, oval) { aler ...

  9. 【转】每天一个linux命令(19):find 命令概览

    原文网址:http://www.cnblogs.com/peida/archive/2012/11/13/2767374.html Linux下find命令在目录结构中搜索文件,并执行指定的操作.Li ...

  10. commonJS模块规范 和 es6模块规范 区别

    ES6 模块与 CommonJS 模块的差异 CommonJS 模块输出的是一个值的拷贝,ES6 模块输出的是值的引用. CommonJS 模块是运行时加载,ES6 模块是编译时输出接口. Commo ...