▶ 按书上的步骤使用不同的导语优化矩阵乘法

● 已经优化的代码

 #include <iostream>
#include <cstdlib>
#include <chrono> #define SIZE 1024 using namespace std;
using namespace std::chrono; double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE]; // 四个数组放入 main 里会报错 Segmentation fault (core dumped) int main()
{
//int i, j, k; // ijk 和 tmp 在循环中使用时才声明会导致运行时间变长
double tmp; #pragma acc enter data create(a, b, c)
#pragma acc kernels present(a, b, c)
{
for (int i = ; i < SIZE; i++) // 初始化 ab
{
for (int j = ; j < SIZE; j++)
a[i][j] = (double)(i + j);
}
for (int i = ; i < SIZE; i++) // 初始化 ab
{
for (int j = ; j < SIZE; j++)
b[i][j] = (double)(i - j);
}
for (int i = ; i < SIZE; i++) // 每种方法前都要清空 c
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
}
} high_resolution_clock::time_point t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 1,每层循环都 auto
{
#pragma acc loop auto
for (int i = ; i < SIZE; i++)
{
#pragma acc loop auto
for (int j = ; j < SIZE; j++)
{
#pragma acc loop auto
for (int k = ; k < SIZE; k++)
c[i][j] += a[i][k] * b[k][j];
}
}
} high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<double> time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 2,外两层 independent,最里层串行
{
#pragma acc loop independent
for (int i = ; i < SIZE; i++)
{
#pragma acc loop independent
for (int j = ; j < SIZE; j++)
{
#pragma acc loop independent
for (int k = ; k < SIZE; k++)
c[i][j] += a[i][k] * b[k][j];
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 3,外两层 independent,最里层规约
{
#pragma acc loop independent
for (int i = ; i < SIZE; i++)
{
#pragma acc loop independent
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k = ; k < SIZE; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 4,手动指定 gang 和 vector
{
#pragma acc loop gang(32)
for (int i = ; i < SIZE; i++)
{
#pragma acc loop vector(16)
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k = ; k < SIZE; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 5,分块重排
{
#pragma acc loop tile(32, 32)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+ \
: tmp)
for (int k = ; k < SIZE; ++k)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - tile: %.6lf s.\n\n", time.count()); #pragma acc kernels present(c)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
c[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma acc kernels present(a, b, c) // 方法 6,合并多层迭代
{
#pragma acc loop collapse(2) independent
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k = ; k < SIZE; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
}
}
} t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count()); #pragma acc exit data copyout(a, b, c) #pragma omp parallel for shared(d)
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
d[i][j] = 0.0;
} t1 = high_resolution_clock::now(); #pragma omp parallel for default(none) shared(a, b, d) // 使用 OpenMP
for (int i = ; i < SIZE; i++)
{
for (int j = ; j < SIZE; j++)
{
for (int k = ; k < SIZE; k++)
d[i][j] += a[i][k] * b[k][j];
}
}
t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenMP: %.6lf s.\n\n", time.count()); for (int i = ; i < SIZE; i++) // 检查结果
{
for (int j = ; j < SIZE; j++)
{
if (c[i][j] != d[i][j])
printf("\nError at [%d, %d],c = %f d = %f \n", i, j, c[i][j], d[i][j]);
}
}
return ;
}

● 输出结果(数据管理优化前)

cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Time OpenACC - Auto: 4.589736 s.

Time OpenACC - Independent Seq: 4.823721 s.

Time OpenACC - Independent Reduction: 3.669336 s.

Time OpenACC - Gang Vector: 3.611391 s.

Time OpenACC - tile: 3.609573 s.

Time OpenACC - Collapse: 3.605792 s.

Time OpenMP: 4.345018 s.

● 输出结果(数据管理优化后)

cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++ -acc -mp -Minfo -o main.exe
main:
, include "chrono"
, include "chrono"
, Parallel region activated
, Parallel region terminated
, Parallel region activated
, Parallel region terminated
, Generating enter data create(b[:][:],c[:][:],a[:][:])
Generating present(a[:][:],b[:][:],c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Complex loop carried dependence of c prevents parallelization
Loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
Inner sequential loop scheduled on accelerator
Generating Tesla code
, #pragma acc loop gang /* blockIdx.y */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Complex loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang /* blockIdx.z */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop gang /* blockIdx.y */
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, FMA (fused multiply-add) instruction(s) generated
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x tiled */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x collapsed */
, #pragma acc loop seq
, Loop is parallelizable
, Generating exit data copyout(c[:][:],b[:][:],a[:][:])
Parallel loop activated with static block schedule
, Memory zero idiom, loop replaced by call to __c_mzero8
, Barrier
, Parallel loop activated with static block schedule
FMA (fused multiply-add) instruction(s) generated
, Barrier
cuan@CUAN:~/acc$ ./main.exe
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024 block=
Time OpenACC - Auto: 0.018726 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024x4 block=
Time OpenACC - Independent Seq: 0.040719 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
Time OpenACC - Independent Reduction: 0.012491 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=64x32 block=16x8
Time OpenACC - Gang Vector: 0.012314 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - tile: 0.013609 s. launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - Collapse: 0.012676 s. Time OpenMP: 0.504436 s. Accelerator Kernel Timing data
/home/cuan/acc/main.cpp
main NVIDIA devicenum=
time(us): ,
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024x4] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [64x32] block: [16x8]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: data region reached time
: data copyout transfers:
device time(us): total=, max= min= avg=

OpenACC 优化矩阵乘法的更多相关文章

  1. Strassen优化矩阵乘法(复杂度O(n^lg7))

    按照算法导论写的 还没有测试复杂度到底怎么样 不过这个真的很卡内存,挖个坑,以后写空间优化 还有Matthew Anderson, Siddharth Barman写了一个关于矩阵乘法的论文 < ...

  2. poj3613:Cow Relays(倍增优化+矩阵乘法floyd+快速幂)

    Cow Relays Time Limit: 1000MS   Memory Limit: 65536K Total Submissions: 7825   Accepted: 3068 Descri ...

  3. 利用Cayley-Hamilton theorem 优化矩阵线性递推

    平时有关线性递推的题,很多都可以利用矩阵乘法来解决. 时间复杂度一般是O(K3logn)因此对矩阵的规模限制比较大. 下面介绍一种利用利用Cayley-Hamilton theorem加速矩阵乘法的方 ...

  4. [BZOJ 1009] [HNOI2008] GT考试 【AC自动机 + 矩阵乘法优化DP】

    题目链接:BZOJ - 1009 题目分析 题目要求求出不包含给定字符串的长度为 n 的字符串的数量. 既然这样,应该就是 KMP + DP ,用 f[i][j] 表示长度为 i ,匹配到模式串第 j ...

  5. bzoj 3240: [Noi2013]矩阵游戏 矩阵乘法+十进制快速幂+常数优化

    3240: [Noi2013]矩阵游戏 Time Limit: 10 Sec  Memory Limit: 256 MBSubmit: 613  Solved: 256[Submit][Status] ...

  6. HDU 4914 Linear recursive sequence(矩阵乘法递推的优化)

    题解见X姐的论文 矩阵乘法递推的优化.仅仅是mark一下. .

  7. [转]OpenBLAS项目与矩阵乘法优化

    课程内容 OpenBLAS项目介绍 矩阵乘法优化算法 一步步调优实现 以下为公开课完整视频,共64分钟: 以下为公开课内容的文字及 PPT 整理. 雷锋网的朋友们大家好,我是张先轶,今天主要介绍一下我 ...

  8. 矩阵乘法优化DP

    本文讲一下一些基本的矩阵优化DP的方法技巧. 定义三个矩阵A,B,C,其中行和列分别为$m\times n,n \times p,m\times p$,(其中行是从上往下数的,列是从左往右数的) $C ...

  9. 【BZOJ 3326】[Scoi2013]数数 数位dp+矩阵乘法优化

    挺好的数位dp……先说一下我个人的做法:经过观察,发现这题按照以往的思路从后往前递增,不怎么好推,然后我就大胆猜想,从前往后推,发现很好推啊,维护四个变量,从开始位置到现在有了i个数 f[i]:所有数 ...

随机推荐

  1. Arpa’s obvious problem and Mehrdad’s terrible solution 思维

    There are some beautiful girls in Arpa’s land as mentioned before. Once Arpa came up with an obvious ...

  2. 【MVC】使用MvcPager进行分页

    1.添加引用: mvcPager 版本高的提供的功能也更多. 注:下载了第一个,但是里面的一些字段是只读的.(eg:PagedList<T> .TotalItemCount)这是不符合的. ...

  3. javascript的单例模式

    单例模式是javascript最基本,最有用的模式之一,它提供了一种将代码组织为一个逻辑单元的手段,这个逻辑单元中的代码通过单一的变量进行访问.我的理解是在这个作用域中,只有通过单一的变量来访问,不存 ...

  4. kettle--window开发环境和linux运行环境的迁移

    首先要做的是将kettle在linux下搭建好. 一.搭建linux的kettle环境 1.1解压 (my_python_env)[root@hadoop26 ~]# .zip -d /usr/loc ...

  5. mysql四种修改密码的方式

    方法1: 用SET PASSWORD命令 首先登录MySQL. 格式:mysql> set password for 用户名@localhost = password('新密码'); 例子:my ...

  6. java 中一些需要注意的知识点

    java数组的length属性是容量,而不是数组真实元素的个数: 多线程中的interrupt()方法并不会终止处于"运行状态"的线程,它只是将线程的中断标记设为true. juc ...

  7. jmeter ---模拟发送TCP/UDP/HTTP/FTP等请求包

    JMeter安装UDP插件后支持发送UDP协议的请求包,官方介绍安装插件后可以用来测试DNS, NTP, TFTP, Boot servers and many-many other systems. ...

  8. mySQL 教程 第2章 安装和介绍mySQL

    设置mySQL字符集 支持中文的字符集是utf8,该设置可以更改mySQL配置文件进行全局设置,也可以针对数据库设置,也可以针对表设置,也可以针对列设置.字符集更改后新插入的数据生效,对以前不生效. ...

  9. hello word 应用程序的编写

    1.各类文件的书写 src中的文件: hello文件夹中的Makefile文件 # # Copyright (C) - OpenWrt.org # # This is free software, l ...

  10. php文章付费阅读系统球料付费阅读系统

    服务项目 新手技术咨询 企业技术咨询 定制开发 服务说明 QQ有问必答 QQ.微信.电话 微信开发.php开发,网站开发,系统定制,小程序开发 价格说明 200元/月 1000/月 商议       ...