OpenACC 优化矩阵乘法

▶ 按书上的步骤使用不同的导语优化矩阵乘法

● 已经优化的代码

 #include <chrono>
 #include <cstdio>
 #include <cstdlib>
 #include <iostream>

 // Edge length of the square matrices; every matrix below is SIZE x SIZE doubles.
 #define SIZE 1024

 using namespace std;

 using namespace std::chrono;

 // a, b: input matrices; c: result of the OpenACC variants; d: result of the OpenMP reference run.
 // Kept at namespace scope (static storage): the original author reports that declaring these
 // four arrays inside main crashes with "Segmentation fault (core dumped)". Each array is
 // 1024 * 1024 * 8 bytes = 8 MiB, so presumably the 32 MiB total exceeds the stack limit.
 double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE]; // Putting the four arrays inside main fails with: Segmentation fault (core dumped)

 int main()

 {

     //int i, j, k;                          // ijk 和 tmp 在循环中使用时才声明会导致运行时间变长

     double tmp;

 #pragma acc enter data create(a, b, c)

 #pragma acc kernels present(a, b, c)

     {

         for (int i = ; i < SIZE; i++)      // 初始化 ab

         {

             for (int j = ; j < SIZE; j++)

                 a[i][j] = (double)(i + j);

         }

         for (int i = ; i < SIZE; i++)      // 初始化 ab

         {

             for (int j = ; j < SIZE; j++)

                 b[i][j] = (double)(i - j);

         }

         for (int i = ; i < SIZE; i++)      // 每种方法前都要清空 c

         {

             for (int j = ; j < SIZE; j++)

                 c[i][j] = 0.0;

         }

     }

     high_resolution_clock::time_point t1 = high_resolution_clock::now();

 #pragma acc kernels present(a, b, c)       // 方法 1，每层循环都 auto

     {

 #pragma acc loop auto

         for (int i = ; i < SIZE; i++)

         {

 #pragma acc loop auto

             for (int j = ; j < SIZE; j++)

             {

 #pragma acc loop auto

                 for (int k = ; k < SIZE; k++)

                     c[i][j] += a[i][k] * b[k][j];

             }

         }

     }

     high_resolution_clock::time_point t2 = high_resolution_clock::now();

     duration<double> time = duration_cast<duration<double>>(t2 - t1);

     printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count());

 #pragma acc kernels present(c)

     for (int i = ; i < SIZE; i++)

     {

         for (int j = ; j < SIZE; j++)

             c[i][j] = 0.0;

     }

     t1 = high_resolution_clock::now();

 #pragma acc kernels present(a, b, c)        // 方法 2，外两层 independent，最里层串行

     {

 #pragma acc loop independent

         for (int i = ; i < SIZE; i++)

         {

 #pragma acc loop independent

             for (int j = ; j < SIZE; j++)

             {

 #pragma acc loop independent

                 for (int k = ; k < SIZE; k++)

                     c[i][j] += a[i][k] * b[k][j];

             }

         }

     }

     t2 = high_resolution_clock::now();

     time = duration_cast<duration<double>>(t2 - t1);

     printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count());

 #pragma acc kernels present(c)

     for (int i = ; i < SIZE; i++)

     {

         for (int j = ; j < SIZE; j++)

             c[i][j] = 0.0;

     }

     t1 = high_resolution_clock::now();

 #pragma acc kernels present(a, b, c)        // 方法 3，外两层 independent，最里层规约

     {

 #pragma acc loop independent

         for (int i = ; i < SIZE; i++)

         {

 #pragma acc loop independent

             for (int j = ; j < SIZE; j++)

             {

                 tmp = 0.0f;

 #pragma acc loop reduction(+: tmp)

                 for (int k = ; k < SIZE; k++)

                     tmp += a[i][k] * b[k][j];

                 c[i][j] = tmp;

             }

         }

     }

     t2 = high_resolution_clock::now();

     time = duration_cast<duration<double>>(t2 - t1);

     printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count());

 #pragma acc kernels present(c)

     for (int i = ; i < SIZE; i++)

     {

         for (int j = ; j < SIZE; j++)

             c[i][j] = 0.0;

     }

     t1 = high_resolution_clock::now();

 #pragma acc kernels present(a, b, c)        // 方法 4，手动指定 gang 和 vector

     {

 #pragma acc loop gang(32)

         for (int i = ; i < SIZE; i++)

         {

 #pragma acc loop vector(16)

             for (int j = ; j < SIZE; j++)

             {

                 tmp = 0.0f;

 #pragma acc loop reduction(+: tmp)

                 for (int k = ; k < SIZE; k++)

                     tmp += a[i][k] * b[k][j];

                 c[i][j] = tmp;

             }

         }

     }

     t2 = high_resolution_clock::now();

     time = duration_cast<duration<double>>(t2 - t1);

     printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count());

 #pragma acc kernels present(c)

     for (int i = ; i < SIZE; i++)

     {

         for (int j = ; j < SIZE; j++)

             c[i][j] = 0.0;

     }

     t1 = high_resolution_clock::now();

 #pragma acc kernels present(a, b, c)        // 方法 5，分块重排

     {

 #pragma acc loop tile(32, 32)

         for (int i = ; i < SIZE; i++)

         {

             for (int j = ; j < SIZE; j++)

             {

                 tmp = 0.0f;

 #pragma acc loop reduction(+ \

                            : tmp)

                 for (int k = ; k < SIZE; ++k)

                     tmp += a[i][k] * b[k][j];

                 c[i][j] = tmp;

             }

         }

     }

     t2 = high_resolution_clock::now();

     time = duration_cast<duration<double>>(t2 - t1);

     printf("Time OpenACC - tile: %.6lf s.\n\n", time.count());

 #pragma acc kernels present(c)

     for (int i = ; i < SIZE; i++)

     {

         for (int j = ; j < SIZE; j++)

             c[i][j] = 0.0;

     }

     t1 = high_resolution_clock::now();

 #pragma acc kernels present(a, b, c)        // 方法 6，合并多层迭代

     {

 #pragma acc loop collapse(2) independent

         for (int i = ; i < SIZE; i++)

         {

             for (int j = ; j < SIZE; j++)

             {

                 tmp = 0.0f;

 #pragma acc loop reduction(+: tmp)

                 for (int k = ; k < SIZE; k++)

                     tmp += a[i][k] * b[k][j];

                 c[i][j] = tmp;

             }

         }

     }

     t2 = high_resolution_clock::now();

     time = duration_cast<duration<double>>(t2 - t1);

     printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count());

 #pragma acc exit data copyout(a, b, c)

 #pragma omp parallel for shared(d)

     for (int i = ; i < SIZE; i++)

     {

         for (int j = ; j < SIZE; j++)

             d[i][j] = 0.0;

     }

     t1 = high_resolution_clock::now();

 #pragma omp parallel for default(none) shared(a, b, d)  // 使用 OpenMP

     for (int i = ; i < SIZE; i++)

     {

         for (int j = ; j < SIZE; j++)

         {

             for (int k = ; k < SIZE; k++)

                 d[i][j] += a[i][k] * b[k][j];

         }

     }

     t2 = high_resolution_clock::now();

     time = duration_cast<duration<double>>(t2 - t1);

     printf("Time OpenMP: %.6lf s.\n\n", time.count());

     for (int i = ; i < SIZE; i++)                      // 检查结果

     {

         for (int j = ; j < SIZE; j++)

         {

             if (c[i][j] != d[i][j])

                 printf("\nError at [%d, %d],c = %f d = %f \n", i, j, c[i][j], d[i][j]);

         }

     }

     return ;

 }

● 输出结果（数据管理优化前）

cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Time OpenACC - Auto: 4.589736 s.

Time OpenACC - Independent Seq: 4.823721 s.

Time OpenACC - Independent Reduction: 3.669336 s.

Time OpenACC - Gang Vector: 3.611391 s.

Time OpenACC - tile: 3.609573 s.

Time OpenACC - Collapse: 3.605792 s.

Time OpenMP: 4.345018 s.

● 输出结果（数据管理优化后）

cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++11 -acc -mp -Minfo -o main.exe

main:

      , include "chrono"

          , include "chrono"

              , Parallel region activated

              , Parallel region terminated

              , Parallel region activated

              , Parallel region terminated

     , Generating enter data create(b[:][:],c[:][:],a[:][:])

         Generating present(a[:][:],b[:][:],c[:][:])

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

     , Memory zero idiom, loop replaced by call to __c_mzero8

     , Generating present(a[:][:],c[:][:],b[:][:])

     , Loop is parallelizable

     , Loop is parallelizable

     , Complex loop carried dependence of c prevents parallelization

         Loop carried dependence of c prevents parallelization

         Loop carried backward dependence of c prevents vectorization

         Inner sequential loop scheduled on accelerator

         Generating Tesla code

         , #pragma acc loop gang /* blockIdx.y */

         , #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */

         , #pragma acc loop seq

     , Complex loop carried dependence of c prevents parallelization

         Loop carried backward dependence of c prevents vectorization

     , Generating present(c[:][:])

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

     , Memory zero idiom, loop replaced by call to __c_mzero8

     , Generating present(a[:][:],c[:][:],b[:][:])

     , Loop is parallelizable

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang /* blockIdx.z */

         , #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */

         , #pragma acc loop gang /* blockIdx.y */

     , Generating present(c[:][:])

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

     , Memory zero idiom, loop replaced by call to __c_mzero8

     , Generating present(a[:][:],c[:][:],b[:][:])

     , Loop is parallelizable

    , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

        , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

        , #pragma acc loop seq

    , FMA (fused multiply-add) instruction(s) generated

    , Loop is parallelizable

    , Generating present(c[:][:])

    , Loop is parallelizable

    , Loop is parallelizable

         Generating Tesla code

        , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

        , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

    , Memory zero idiom, loop replaced by call to __c_mzero8

    , Generating present(a[:][:],c[:][:],b[:][:])

    , Loop is parallelizable

    , Loop is parallelizable

         Generating Tesla code

        , #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */

        , #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */

        , #pragma acc loop seq

    , Loop is parallelizable

    , Generating present(c[:][:])

    , Loop is parallelizable

    , Loop is parallelizable

         Generating Tesla code

        , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

        , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

    , Memory zero idiom, loop replaced by call to __c_mzero8

    , Generating present(a[:][:],c[:][:],b[:][:])

    , Loop is parallelizable

    , Loop is parallelizable

         Generating Tesla code

        , #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */

        ,   /* blockIdx.x threadIdx.x tiled */

        , #pragma acc loop seq

    , Loop is parallelizable

    , Generating present(c[:][:])

    , Loop is parallelizable

    , Loop is parallelizable

         Generating Tesla code

        , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

        , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

    , Memory zero idiom, loop replaced by call to __c_mzero8

    , Generating present(a[:][:],c[:][:],b[:][:])

    , Loop is parallelizable

    , Loop is parallelizable

         Generating Tesla code

        , #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */

        ,   /* blockIdx.x threadIdx.x collapsed */

        , #pragma acc loop seq

    , Loop is parallelizable

    , Generating exit data copyout(c[:][:],b[:][:],a[:][:])

         Parallel loop activated with static block schedule

    , Memory zero idiom, loop replaced by call to __c_mzero8

    , Barrier

    , Parallel loop activated with static block schedule

         FMA (fused multiply-add) instruction(s) generated

    , Barrier

cuan@CUAN:~/acc$ ./main.exe

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024 block=

Time OpenACC - Auto: 0.018726 s.

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=8x1024x4 block=

Time OpenACC - Independent Seq: 0.040719 s.

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

Time OpenACC - Independent Reduction: 0.012491 s.

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=64x32 block=16x8

Time OpenACC - Gang Vector: 0.012314 s.

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=

Time OpenACC - tile: 0.013609 s.

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x256 block=32x4

launch CUDA kernel  file=/home/cuan/acc/main.cpp function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=

Time OpenACC - Collapse: 0.012676 s.

Time OpenMP: 0.504436 s.

Accelerator Kernel Timing data

/home/cuan/acc/main.cpp

  main  NVIDIA  devicenum=

    time(us): ,

    : compute region reached  time

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [8x1024]  block: []

             device time(us): total=, max=, min=, avg=,

            elapsed time(us): total=, max=, min=, avg=,

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [8x1024x4]  block: []

             device time(us): total=, max=, min=, avg=,

            elapsed time(us): total=, max=, min=, avg=,

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total=, max=, min=, avg=,

            elapsed time(us): total=, max=, min=, avg=,

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [64x32]  block: [16x8]

             device time(us): total=, max=, min=, avg=,

            elapsed time(us): total=, max=, min=, avg=,

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: []  block: []

             device time(us): total=, max=, min=, avg=,

            elapsed time(us): total=, max=, min=, avg=,

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: [32x256]  block: [32x4]

             device time(us): total= max= min= avg=

            elapsed time(us): total= max= min= avg=

    : data region reached  times

    : compute region reached  time

        : kernel launched  time

            grid: []  block: []

             device time(us): total=, max=, min=, avg=,

            elapsed time(us): total=, max=, min=, avg=,

    : data region reached  times

    : data region reached  time

        : data copyout transfers:

             device time(us): total=, max= min= avg=

OpenACC 优化矩阵乘法的更多相关文章

Strassen优化矩阵乘法(复杂度O(n^lg7))
按照算法导论写的还没有测试复杂度到底怎么样不过这个真的很卡内存,挖个坑,以后写空间优化还有Matthew Anderson, Siddharth Barman写了一个关于矩阵乘法的论文 < ...
poj3613：Cow Relays（倍增优化+矩阵乘法floyd+快速幂）
Cow Relays Time Limit: 1000MS Memory Limit: 65536K Total Submissions: 7825 Accepted: 3068 Descri ...
利用Cayley-Hamilton theorem 优化矩阵线性递推
平时有关线性递推的题,很多都可以利用矩阵乘法来解决. 时间复杂度一般是O(K3logn)因此对矩阵的规模限制比较大. 下面介绍一种利用利用Cayley-Hamilton theorem加速矩阵乘法的方 ...
[BZOJ 1009] [HNOI2008] GT考试【AC自动机 + 矩阵乘法优化DP】
题目链接:BZOJ - 1009 题目分析题目要求求出不包含给定字符串的长度为 n 的字符串的数量. 既然这样,应该就是 KMP + DP ,用 f[i][j] 表示长度为 i ,匹配到模式串第 j ...
bzoj 3240: [Noi2013]矩阵游戏矩阵乘法+十进制快速幂+常数优化
3240: [Noi2013]矩阵游戏 Time Limit: 10 Sec Memory Limit: 256 MBSubmit: 613 Solved: 256[Submit][Status] ...
HDU 4914 Linear recursive sequence(矩阵乘法递推的优化)
题解见X姐的论文矩阵乘法递推的优化.仅仅是mark一下. .
[转]OpenBLAS项目与矩阵乘法优化
课程内容 OpenBLAS项目介绍矩阵乘法优化算法一步步调优实现以下为公开课完整视频,共64分钟: 以下为公开课内容的文字及 PPT 整理. 雷锋网的朋友们大家好,我是张先轶,今天主要介绍一下我 ...
矩阵乘法优化DP
本文讲一下一些基本的矩阵优化DP的方法技巧. 定义三个矩阵A,B,C,其中行和列分别为$m\times n,n \times p,m\times p$,(其中行是从上往下数的,列是从左往右数的) $C ...
【BZOJ 3326】[Scoi2013]数数数位dp+矩阵乘法优化
挺好的数位dp……先说一下我个人的做法:经过观察,发现这题按照以往的思路从后往前递增,不怎么好推,然后我就大胆猜想,从前往后推,发现很好推啊,维护四个变量,从开始位置到现在有了i个数 f[i]:所有数 ...

随机推荐

xdoj-1149(多重集合+容斥原理+组合数取模）
#include <iostream> #include <algorithm> #include <cstdio> using namespace std; ty ...
VS2013、VS2015中，新建项目没有看到解决方案的问题（已解决）
问题描述:装好VS2013 专业版 for Update5不知怎么弄的,突然新建项目没有了解决方案,于是各种折腾,最后终于解决了! ================================== ...
php 使用 file_exists 还是 is_file
Jesns 提出 file_exists 比较老了,建议使用 is_file 来判断文件. 经过我的测试,is_file 果然快很多,以后可以改 is_file 来判断文件. 还有相关链接: is_f ...
IdentityHashMap 与 HashMap 的区别
IdentityHashMap 中的 key 允许重复 IdentityHashMap 使用的是 == 比较 key 的值(比较内存地址),而 HashMap 使用的是 equals()(比较存储值) ...
Java JDBC连接Oracle
1. 安装Oracle数据库,我这里使用的是Oracle 12c 2. 创建Java工程 connection-oracle 注意:使用的JavaSE-1.8 3. 在Oracle的安装目录里,将dj ...
Action<T>和Func<T>委托
Action<T>和Func<T>委托泛型Action<T>委托和Func<T>委托是系统定义的两个泛型委托. Action<T>委托表示 ...
<dedecms>织梦内页调用会员信息
1.织梦CMS v5.7调用文章所属会员信息标签打开官方默认模板article_artcile.htm,我们可以提取出如下代码: {dede:memberinfos} 会员头像:<a h ...
js实现动态球形标签云
HTML 原文演示地址:http://www.17sucai.com/pins/demoshow/8108 <!DOCTYPE html PUBLIC "-//W3C//DTD XHT ...
Linux下解压缩命令
1. tar格式解包:[*******]$ tar xvf FileName.tar 打包[---]:[*******]$ tar cvf FileName.tar DirName(注:tar是打包 ...
volatile关键字的作用、原理
在只有双重检查锁,没有volatile的懒加载单例模式中,由于指令重排序的问题,我确实不会拿到两个不同的单例了,但我会拿到"半个"单例. 而发挥神奇作用的volatile,可以当之 ...

OpenACC 优化矩阵乘法

OpenACC 优化矩阵乘法的更多相关文章

随机推荐

热门专题