▶ 按书上的步骤使用不同的导语优化矩阵乘法

● 已经优化的代码

 #include <iostream>
#include <cstdlib>
#include <chrono>
#include <cstdio>

#define SIZE 1024

using namespace std;
using namespace std::chrono;

// The four matrices are kept at namespace scope: declared inside main() they
// caused "Segmentation fault (core dumped)" (author's observation).
//   a, b : inputs, c : OpenACC result, d : OpenMP reference result.
double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE];

// Benchmarks c = a * b with six different OpenACC loop schedules, then
// computes the same product into d with OpenMP on the host and compares the
// last OpenACC result (c) against it element by element.
//
// a, b and c are created on the device once (enter data) and stay resident
// until the final exit data, so none of the timed regions includes
// host<->device transfers.
int main()
{
    // Author's note: declaring the loop indices and tmp at the point of use
    // inside the loops was observed to increase the run time.
    // NOTE(review): tmp is a host-scope scalar written inside parallel loops;
    // this relies on the compiler privatizing it per (i, j) iteration.
    // Confirm with -Minfo, or add private(tmp) to the enclosing loops.
    double tmp;

#pragma acc enter data create(a, b, c)

    // Initialize a and b and zero c directly on the device.
#pragma acc kernels present(a, b, c)
    {
        for (int i = 0; i < SIZE; i++) // initialize a
        {
            for (int j = 0; j < SIZE; j++)
                a[i][j] = (double)(i + j);
        }
        for (int i = 0; i < SIZE; i++) // initialize b
        {
            for (int j = 0; j < SIZE; j++)
                b[i][j] = (double)(i - j);
        }
        for (int i = 0; i < SIZE; i++) // c must be zero before every method
        {
            for (int j = 0; j < SIZE; j++)
                c[i][j] = 0.0;
        }
    }

    // ---- Method 1: "auto" on every loop level -------------------------------
    high_resolution_clock::time_point t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop auto
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop auto
            for (int j = 0; j < SIZE; j++)
            {
#pragma acc loop auto
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // ---- Method 2: outer two loops independent, innermost loop sequential ---
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
                // The k loop accumulates into c[i][j], so its iterations are
                // NOT independent; it must run sequentially to avoid a race.
#pragma acc loop seq
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // ---- Method 3: outer two loops independent, innermost loop a reduction --
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // ---- Method 4: gang and vector sizes specified by hand ------------------
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop gang(32)
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop vector(16)
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // ---- Method 5: tiling (the i and j loops are tiled 32 x 32) -------------
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop tile(32, 32)
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; ++k)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - tile: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    // ---- Method 6: collapse the i and j loops into one iteration space ------
    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c)
    {
#pragma acc loop collapse(2) independent
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count());

    // a and b were initialized on the device, so they must be copied back as
    // well: the OpenMP reference below reads them on the host.
#pragma acc exit data copyout(a, b, c)

    // ---- Host reference computed with OpenMP --------------------------------
#pragma omp parallel for shared(d)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            d[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();
#pragma omp parallel for default(none) shared(a, b, d)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
        {
            for (int k = 0; k < SIZE; k++)
                d[i][j] += a[i][k] * b[k][j];
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenMP: %.6lf s.\n\n", time.count());

    // Verify the last OpenACC result against the OpenMP reference. The inputs
    // are small integers, so every product and partial sum is exactly
    // representable in a double and an exact comparison is valid here.
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
        {
            if (c[i][j] != d[i][j])
                printf("\nError at [%d, %d],c = %f d = %f \n", i, j, c[i][j], d[i][j]);
        }
    }
    return 0;
}

● 输出结果(数据管理优化前)

cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Time OpenACC - Auto: 4.589736 s.

Time OpenACC - Independent Seq: 4.823721 s.

Time OpenACC - Independent Reduction: 3.669336 s.

Time OpenACC - Gang Vector: 3.611391 s.

Time OpenACC - tile: 3.609573 s.

Time OpenACC - Collapse: 3.605792 s.

Time OpenMP: 4.345018 s.

● 输出结果(数据管理优化后)

cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++11 -acc -mp -Minfo -o main.exe
main:
, include "chrono"
, include "chrono"
, Parallel region activated
, Parallel region terminated
, Parallel region activated
, Parallel region terminated
, Generating enter data create(b[:][:],c[:][:],a[:][:])
Generating present(a[:][:],b[:][:],c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Complex loop carried dependence of c prevents parallelization
Loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
Inner sequential loop scheduled on accelerator
Generating Tesla code
, #pragma acc loop gang /* blockIdx.y */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Complex loop carried dependence of c prevents parallelization
Loop carried backward dependence of c prevents vectorization
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang /* blockIdx.z */
, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
, #pragma acc loop gang /* blockIdx.y */
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, FMA (fused multiply-add) instruction(s) generated
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x tiled */
, #pragma acc loop seq
, Loop is parallelizable
, Generating present(c[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, Memory zero idiom, loop replaced by call to __c_mzero8
, Generating present(a[:][:],c[:][:],b[:][:])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
, /* blockIdx.x threadIdx.x collapsed */
, #pragma acc loop seq
, Loop is parallelizable
, Generating exit data copyout(c[:][:],b[:][:],a[:][:])
Parallel loop activated with static block schedule
, Memory zero idiom, loop replaced by call to __c_mzero8
, Barrier
, Parallel loop activated with static block schedule
FMA (fused multiply-add) instruction(s) generated
, Barrier
cuan@CUAN:~/acc$ ./main.exe
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024 block=
Time OpenACC - Auto: 0.018726 s.
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024x4 block=
Time OpenACC - Independent Seq: 0.040719 s.
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
Time OpenACC - Independent Reduction: 0.012491 s.
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=64x32 block=16x8
Time OpenACC - Gang Vector: 0.012314 s.
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - tile: 0.013609 s.
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Time OpenACC - Collapse: 0.012676 s.
Time OpenMP: 0.504436 s.
Accelerator Kernel Timing data
/home/cuan/acc/main.cpp
main NVIDIA devicenum=
time(us): ,
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [8x1024x4] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [64x32] block: [16x8]
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: compute region reached time
: kernel launched time
grid: [32x256] block: [32x4]
device time(us): total= max= min= avg=
elapsed time(us): total= max= min= avg=
: data region reached times
: compute region reached time
: kernel launched time
grid: [] block: []
device time(us): total=, max=, min=, avg=,
elapsed time(us): total=, max=, min=, avg=,
: data region reached times
: data region reached time
: data copyout transfers:
device time(us): total=, max= min= avg=

OpenACC 优化矩阵乘法的更多相关文章

  1. Strassen优化矩阵乘法(复杂度O(n^lg7))

    按照算法导论写的 还没有测试复杂度到底怎么样 不过这个真的很卡内存,挖个坑,以后写空间优化 还有Matthew Anderson, Siddharth Barman写了一个关于矩阵乘法的论文 < ...

  2. poj3613:Cow Relays(倍增优化+矩阵乘法floyd+快速幂)

    Cow Relays Time Limit: 1000MS   Memory Limit: 65536K Total Submissions: 7825   Accepted: 3068 Descri ...

  3. 利用Cayley-Hamilton theorem 优化矩阵线性递推

    平时有关线性递推的题,很多都可以利用矩阵乘法来解决. 时间复杂度一般是O(K3logn)因此对矩阵的规模限制比较大. 下面介绍一种利用利用Cayley-Hamilton theorem加速矩阵乘法的方 ...

  4. [BZOJ 1009] [HNOI2008] GT考试 【AC自动机 + 矩阵乘法优化DP】

    题目链接:BZOJ - 1009 题目分析 题目要求求出不包含给定字符串的长度为 n 的字符串的数量. 既然这样,应该就是 KMP + DP ,用 f[i][j] 表示长度为 i ,匹配到模式串第 j ...

  5. bzoj 3240: [Noi2013]矩阵游戏 矩阵乘法+十进制快速幂+常数优化

    3240: [Noi2013]矩阵游戏 Time Limit: 10 Sec  Memory Limit: 256 MBSubmit: 613  Solved: 256[Submit][Status] ...

  6. HDU 4914 Linear recursive sequence(矩阵乘法递推的优化)

    题解见X姐的论文 矩阵乘法递推的优化.仅仅是mark一下. .

  7. [转]OpenBLAS项目与矩阵乘法优化

    课程内容 OpenBLAS项目介绍 矩阵乘法优化算法 一步步调优实现 以下为公开课完整视频,共64分钟: 以下为公开课内容的文字及 PPT 整理. 雷锋网的朋友们大家好,我是张先轶,今天主要介绍一下我 ...

  8. 矩阵乘法优化DP

    本文讲一下一些基本的矩阵优化DP的方法技巧. 定义三个矩阵A,B,C,其中行和列分别为$m\times n,n \times p,m\times p$,(其中行是从上往下数的,列是从左往右数的) $C ...

  9. 【BZOJ 3326】[Scoi2013]数数 数位dp+矩阵乘法优化

    挺好的数位dp……先说一下我个人的做法:经过观察,发现这题按照以往的思路从后往前递增,不怎么好推,然后我就大胆猜想,从前往后推,发现很好推啊,维护四个变量,从开始位置到现在有了i个数 f[i]:所有数 ...

随机推荐

  1. pycharm PYTHONPATH

    Hi brandenju! I believe os.chdir doesn't affect PYTHONPATH so changing your working directory at run ...

  2. bzoj 3528 [ZJOI2014] 星系调查 题解

    [原题] 星系调查 [问题描写叙述] 银河历59451年.在银河系有许很多多已被人类殖民的星系.如果想要在行 星系间往来,大家一般使用连接两个行星系的跳跃星门.  一个跳跃星门能够把 物质在它所连接的 ...

  3. 启用Win8/10(中文版/核心版/家庭版)中被阉割的远程桌面服务端

    Windows 8/8.1/10 标准版(中文版/核心版/家庭版)中取消了远程桌面服务端,想通过远程连接到自己的电脑就很麻烦了,第三方远程桌面速度又不理想(如TeamViewer).通过以下方法可让系 ...

  4. js搞定网页的简繁转换

    对网页进行简繁字体转换的方法一般有两种:一是使用<简繁通>这样的专业软件,另外一种是制作两套版本的网页.显然,这两种方法都较为麻烦,而且专业软件一般不能用于免费的空间.笔者在这里给大家提供 ...

  5. C# 爬取网页上的数据

    最近工作中需求定时爬取不同城市每天的温度.其实就是通过编程的方法去抓取不同网站网页进行分析筛选的过程..NET提供了很多类去访问并获得远程网页的数据,比如WebClient类和HttpWebReque ...

  6. vue 感觉很好的渲染模式

    <ul v-if="todos.length"> <li v-for="todo in todos"> {{ todo }} </ ...

  7. 【Spring-AOP-学习笔记-5】@AfterReturning增强处理简单示例

    项目结构 业务代码 @Component("hello") public class HelloImpl implements Hello {     // 定义一个简单方法,模拟 ...

  8. VMware全屏时, 隐藏上方工具栏横条

    VMware全屏时, 隐藏上方横条 菜单栏打开 编辑 选择 首选项 找到 显示 取消勾选 在全屏时取消固定时显示工具栏边缘

  9. 求交错序列前N项和(15 分)

    7-2 求交错序列前N项和(15 分) 本题要求编写程序,计算交错序列 1-2/3+3/5-4/7+5/9-6/11+... 的前N项之和. 输入格式: 输入在一行中给出一个正整数N. 输出格式: 在 ...

  10. Find substring with K distinct characters

    Given a string and number K, find the substrings of size K with K distinct characters. If no, output ...