OpenACC 书上的范例代码（Jacobi 迭代），part 1

▶ 使用Jacobi 迭代求泊松方程的数值解

● 原始串行版本

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #if defined(_WIN32) || defined(_WIN64)                                                      // Platform-independent timer: gettime() samples a timestamp, usec() yields the difference of two samples

 // NOTE(review): absolute path into a PGI 19.4 installation; presumably <sys/timeb.h> is the portable spelling -- confirm before changing
 #include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>

 #define gettime(a)  _ftime(a)

 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))        // Despite the name, the result on this branch is in milliseconds

 typedef struct _timeb timestruct;

 #else

 #include <sys/time.h>

 #define gettime(a)  gettimeofday(a, NULL)

 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)   // Result on this branch is in microseconds

 typedef struct timeval timestruct;

 #endif

 #define IMPROV                                          // When defined, the per-sweep correction is additionally used as a loop-exit condition

 /*
  * Squared distance of the point (x, y) from the origin: x^2 + y^2.
  *
  * Declared static inline: a plain `inline` definition provides no external
  * definition in C99/C11, so any call the compiler chooses not to inline
  * (typical without optimisation) would fail at link time.
  */
 static inline float uval(float x, float y)
 {
     return x * x + y * y;
 }

 /*
  * Serial Jacobi iteration for the Poisson problem on a (row + 1) x (col + 1) grid.
  *
  * The boundary is fixed to g(x, y) = x^2 + y^2 via uval(), the interior starts at
  * zero, and each sweep writes the new values into u1 from the old values in u0
  * before the two buffers are swapped. The elapsed time of the iteration loop is
  * printed in the unit produced by the usec() macro for the current platform.
  *
  * Returns 0 on success, EXIT_FAILURE if the grids cannot be allocated.
  */
 int main(void)
 {
     /* NOTE(review): the literal values of row, col and maxIter were lost from the
      * published listing; the values used here are placeholders -- confirm them
      * against the original example before comparing timings. */
     const int row = 8191, col = 1023;                   // number of grid rows and columns
     const float height = 1.0, width = 2.0;              // physical extent; not proportional to row/col, so the cells are rectangular
     const float hx = height / row, wy = width / col;    // height and width of one cell
     const float fij = -4.0f;                            // source term f(x,y) = -4; per the original author the solution is then z = x^2 + y^2
     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2)); // stencil coefficients
     const int maxIter = 100;                            // upper bound for the sweep counter
     const int colPlus = col + 1;                        // actual number of stored columns

 #ifdef IMPROV
     const float errControl = 0.0f;                      // correction threshold; 0 means the early exit never triggers
     float err = 0.0f;                                   // largest correction seen in the current sweep
 #endif

     // Two tables of (row + 1) x (col + 1) values: u0 holds the previous sweep, u1 the new one.
     const size_t cells = (size_t)(row + 1) * (size_t)colPlus;
     float *u0 = malloc(cells * sizeof *u0);
     float *u1 = malloc(cells * sizeof *u1);
     float *utemp = NULL;                                // scratch pointer for swapping u0 and u1

     if (u0 == NULL || u1 == NULL)
     {
         fprintf(stderr, "Failed to allocate the grids.\n");
         free(u0);
         free(u1);
         return EXIT_FAILURE;
     }

     // Boundary condition g(x,y) = x^2 + y^2, written to both tables.
     for (int ix = 0; ix <= row; ix++)                   // first and last column
     {
         u0[ix * colPlus] = u1[ix * colPlus] = uval(ix * hx, 0.0f);
         u0[ix * colPlus + col] = u1[ix * colPlus + col] = uval(ix * hx, col * wy);
     }
     for (int jy = 0; jy <= col; jy++)                   // first and last row
     {
         u0[jy] = u1[jy] = uval(0.0f, jy * wy);
         u0[row * colPlus + jy] = u1[row * colPlus + jy] = uval(row * hx, jy * wy);
     }

     // Interior of u0 starts at zero; u1's interior is fully written by the first sweep before it is read.
     for (int ix = 1; ix < row; ix++)
     {
         for (int jy = 1; jy < col; jy++)
         {
             u0[ix * colPlus + jy] = 0.0f;
         }
     }

     // Timed iteration loop.
     timestruct t1, t2;
     gettime(&t1);
     for (int iter = 1; iter < maxIter; iter++)
     {
         for (int ix = 1; ix < row; ix++)
         {
             for (int jy = 1; jy < col; jy++)
             {
                 u1[ix * colPlus + jy] = (c1 * fij + wy2 * (u0[(ix - 1) * colPlus + jy] + u0[(ix + 1) * colPlus + jy]) +
                     hx2 * (u0[ix * colPlus + jy - 1] + u0[ix * colPlus + jy + 1])) * c2;
 #ifdef IMPROV
                 // Track the largest correction of this sweep (fmaxf/fabsf are the standard float forms; `max` is not standard C).
                 err = fmaxf(fabsf(u0[ix * colPlus + jy] - u1[ix * colPlus + jy]), err);
 #endif
             }
         }
 #ifdef IMPROV
         if (err < errControl)                           // correction below the threshold: converged
         {
             break;
         }
         err = 0.0f;                                     // start the next sweep with a fresh maximum, otherwise err could only grow
 #endif
         utemp = u0, u0 = u1, u1 = utemp;                // the new table becomes the input of the next sweep
     }
     gettime(&t2);

     // usec() yields milliseconds on Windows and microseconds elsewhere, so the label follows the platform.
 #if defined(_WIN32) || defined(_WIN64)
     const char *unit = "ms";
 #else
     const char *unit = "us";
 #endif
     long long timeElapse = usec(t1, t2);
     printf("\nElapsed time: %13lld %s.\n", timeElapse, unit);

     free(u0);
     free(u1);
     getchar();                                          // wait for a key press before exiting
     return 0;
 }

● 输出结果（使用 IMPROV），可以看到很多 not fused，这都是可以改进的地方

D:\Code\OpenACC>pgcc main.c -Minfo -o main.exe                      // 普通编译

main:

     , FMA (fused multiply-add) instruction(s) generated          // 使用乘加指令

uval:

     , FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>pgcc main.c -Minfo -o main-fast.exe -fast           // 添加 fast 选项

main:

     , uval inlined, size= (inline) file main.c ()             // 4 个内联函数

          , Loop not fused: different loop trip count             // 相邻两个循环的迭代次数不同，因此没有做循环合并

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed  // 无法确定 u0 和 u1 是否重叠，同时生成向量版和标量版，运行时检测指针冲突后再选择

              Loop not vectorized: data dependency

              Loop unrolled  times  // 循环展开

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4     // 识别出清零循环，改为调用库函数 __c_mzero4（类似 memset）来赋零值

     , Loop not vectorized/parallelized: potential early exits    // 有额外脱离循环的条件，拒绝并行

     , Loop not vectorized: data dependency

         Loop unrolled  times

         FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe

Elapsed time:           ms.

D:\Code\OpenACC>main-fast.exe

Elapsed time:           ms.                                     // 加了 fast 反而更慢

● 输出结果（不用 IMPROV），发现变快了，可见提前跳出循环的 if 语句对并行化有很大影响。在本例中我们让 errControl = 0，每次循环多一个判断（实际绝对不会跳出），就严重干扰了编译

D:\Code\OpenACC>pgcc main.c -Minfo -o main.exe

main:

     , FMA (fused multiply-add) instruction(s) generated

uval:

     , FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>pgcc main.c -Minfo -o main-fast.exe -fast

main:

     , uval inlined, size= (inline) file main.c ()

          , Loop not fused: different loop trip count

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed

              Loop not vectorized: data dependency

              Loop unrolled  times

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Loop not fused : function call before adjacent loop    // ？

     , Loop not vectorized : data dependency

      Loop unrolled  times                                     // 展开次数由 2 变成 4

         FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe

Elapsed time:           ms.                                 // 变快了 1 倍

D:\Code\OpenACC>main-fast.exe

Elapsed time:            ms.                                 // 再变快 1 倍

● 使用 OpenMP 优化（就一句导语）

 // Add below #include <math.h>
 #include <omp.h>

 // Add directly below "for (int iter = 1; iter < maxIter; iter++){", immediately before the ix loop
 #ifdef IMPROV
 // err appears only in the max reduction (a variable may not be listed in both reduction and private);
 // row, col and fij are listed explicitly because default(none) is in effect.
 #pragma omp parallel for reduction(max:err) default(none) shared(u0, u1, c1, c2, hx2, wy2, colPlus, row, col, fij)
 #else
 #pragma omp parallel for default(none) shared(u0, u1, c1, c2, hx2, wy2, colPlus, row, col, fij)
 #endif

● 输出结果

D:\Code\OpenACC>set OMP_NUM_THREADS=4                          // 使用 4 个线程

D:\Code\OpenACC>pgcc main.c -Minfo -o main4I.exe -fast -mp      // 用 IMPROV

main:

     , uval inlined, size= (inline) file main.c ()

          , Loop not fused: different loop trip count

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed

              Loop not vectorized: data dependency

              Loop unrolled  times

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Loop not vectorized/parallelized: potential early exits

     , Parallel region activated                              // OpenMP 并行区

         Parallel loop activated with static block schedule

     , Loop not vectorized: data dependency

         Loop unrolled  times

         FMA (fused multiply-add) instruction(s) generated

     , Begin critical section                                 // 脱出循环的判断导致的串行区

         End critical section

         Barrier                                                // 栅栏

         Parallel region terminated                             

D:\Code\OpenACC>main4I.exe                                      

Elapsed time:            ms.                                 // 还是快了 3.8 倍

D:\Code\OpenACC>pgcc main.c -Minfo -o main4.exe -fast -mp       // 不用 IMPROV

main:

     , uval inlined, size= (inline) file main.c ()

          , Loop not fused: different loop trip count

              Generated vector and scalar versions of the loop; pointer conflict tests determine which is executed

              Loop not vectorized: data dependency

              Loop unrolled  times

              Generated  prefetches in scalar loop

     , uval inlined, size= (inline) file main.c ()

     , uval inlined, size= (inline) file main.c ()

          , Loop not vectorized: data dependency

              Loop unrolled  times

     , uval inlined, size= (inline) file main.c ()

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Loop not vectorized/parallelized: contains a parallel region   // 有 OpenMP的并行区，拒绝并行

     , Parallel region activated

         Parallel loop activated with static block schedule

     , Loop not vectorized: data dependency

         Loop unrolled  times

         FMA (fused multiply-add) instruction(s) generated

     , Barrier                                                // 没有了串行区

         Parallel region terminated

D:\Code\OpenACC>main4.exe

Elapsed time:            ms.                                 // 还能再快点，加速比 1.4

D:\Code\OpenACC>set OMP_NUM_THREADS=8                          // 使用 8 个线程

D:\Code\OpenACC>pgcc main.c -Minfo -o main8.exe -fast -mp

...// 跟 4 线程时一模一样

D:\Code\OpenACC>main8.exe

Elapsed time:            ms.                                 // 不再线性加速，加速比 1.5

▶ 在 Ubuntu 下跑的结果，加速前比 win10 慢很多，关闭 IMPROV 并开启 OpenMP 和 fast 选项后速度接近

mainI.exe            us

mainI-fast.exe       us  // 加速比 3.1

main.exe             us  // 加速比 2.1

main-fast.exe         us  // 加速比 6.4

cuan@CUAN:~$ pgcc mainI.c -Minfo -o main4I-fast.exe -fast -mp // 要求我将 row，col，fij 放入 OpenMP 的 shared 导语中，在 win10 下没有显式放入也行

PGC-S--row must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: )

PGC-S--col must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: )

PGC-S--fij must appear in a proper data sharing clause (e.g., PRIVATE) (mainI.c: )

PGC/x86- Linux 19.4-: compilation completed with severe errors

main4I-fast.exe       us  

main4-fast.exe        us  // 加速比 8.8 

main8-fast.exe        us  // 不能继续线性加速

OpenACC 书上的范例代码（Jacobi 迭代），part 1的更多相关文章

OpenACC 书上的范例代码（Jacobi 迭代），part 3
▶ 使用Jacobi 迭代求泊松方程的数值解 ● 使用 data 构件,强行要求 u0 仅拷入和拷出 GPU 各一次,u1 仅拷入GPU 一次 #include <stdio.h> #in ...
OpenACC 书上的范例代码（Jacobi 迭代），part 2
▶ 使用Jacobi 迭代求泊松方程的数值解 ● 首次使用 OpenACC 进行加速,使用动态数组,去掉了误差控制 #include <stdio.h> #include <stdl ...
C#高级编程（第9版） -C#5.0&.Net4.5.1 书上的示例代码下载链接
http://www.wrox.com/WileyCDA/WroxTitle/Professional-C-5-0-and-NET-4-5-1.productCd-1118833031,descCd- ...
uva 213 - Message Decoding (我认为我的方法要比书上少非常多代码，不保证好……)
#include<stdio.h> #include<math.h> #include<string.h> char s[250]; char a[10][250] ...
java代码流类。。程序怎么跟书上的结果不一样？？？
总结:这个程序很容易懂.的那是这个结果我觉得有问题啊..怎么“stop”后,输出的内容是输入过的呢? 应该是没有关系的呀,与输入的值是不同的....怎么书上运行的结果和我的不一样啊 package c ...
面试必备：高频算法题终章「图文解析 + 范例代码」之矩阵二进制 + 位运算 + LRU 合集
Attention 秋招接近尾声,我总结了牛客.WanAndroid 上,有关笔试面经的帖子中出现的算法题,结合往年考题写了这一系列文章,所有文章均与 LeetCode 进行核对.测试.欢迎食用本 ...
JAVA理解逻辑程序的书上全部重要的习题
今天随便翻翻看以前学过JAVA理解逻辑程序的书上全部练习,为了一些刚学的学弟学妹,所以呢就把这些作为共享了. 希望对初学的学弟学妹有所帮助! 例子:升级“我行我素购物管理系统”,实现购物结算功能代码 ...
OK 开始实践书上的项目一：即使标记
OK 开始实践书上的项目一:及时标记然而....又得往前面看啦! ----------------------我是分割线------------------------ 代码改变世界
关于node的基础理论，书上看来的
最近看了一本书,说了一些Node.js的东西,现在来记录一下,让自己记得更牢靠一点. 在书上,是这样介绍的:Node.js模型是源于Ruby的Event Machine 和 Python的Twiste ...

随机推荐

Dubbo 版 Helloworld
使用工具:MAVEN.IDEA.Spring.Dubbo.Zookeeper 直接上代码项目结构: 步骤如下: 搭建MAVEN项目,添加相关依赖 pom.xml <!--Zookeeper-- ...
hdu1208 dp
题意:给了一个 n * n 的方格图,要从图的左上角走到右下角 ,每次只能向右或者向下走,走的格数为当前格子上的数字,问共有多少中走法. 一开始我看到之后觉得这题完全可以用记忆化搜索来做,dfs 一遍 ...
@ModelAttribute的用法
C# NPOI导出Excel和EPPlus导出Excel
转自:http://www.cnblogs.com/tanpeng/p/6155749.html 系统中经常会使用导出Excel的功能.之前使用的是NPOI,但是导出数据行数多就报内存溢出. 最近看到 ...
MySQL Transaction--RC和RR区别
在MySQL中,事务隔离级别RC(read commit)和RR(repeatable read)两种事务隔离级别基于多版本并发控制MVCC(multi-version concurrency con ...
leetcode:Pascal's Triangle【Python版】
1.这道题一次提交就AC了: 2.以前用C语言实现的话,初始化二维数组全部为0,然后每行第一个元素为1,只需要用a[i][j] = a[i-1][j]+a[i-1][j-1]就可以了: 3.在Pyth ...
Python–logging模块知多少
我们在写程序的时候经常会打一些日志来帮助我们查找问题,这次学习一下logging模块,在python里面如何操作日志. 介绍一下logging模块,logging模块就是python里面用来操作日志的 ...
tomcat源码阅读之过滤器
一.Servlet过滤器: 1.介绍: Servlet过滤器本身并不生成请求和响应对象,它只提供过滤作用. Servlet过滤器能够在Servlet被调用之前检查Request对象,修改Request ...
netty答题
1,介绍一下netty netty封装了Java原生的nio,是一个异步和数据驱动的网络编程框架, 与tcp: netty -> Java Runtime Socket (io.nio.nio2 ...
jquery ajax 上传文件
html: <div class="tab-pane" id="head_portrait"> & ...

OpenACC 书上的范例代码（Jacobi 迭代），part 1

OpenACC 书上的范例代码（Jacobi 迭代），part 1的更多相关文章

随机推荐

热门专题