OpenACC 书上的范例代码（Jacobi 迭代），part 3

▶ 使用Jacobi 迭代求泊松方程的数值解

● 使用 data 构件，强行要求 u0 仅拷入和拷出 GPU 各一次，u1 仅拷入GPU 一次

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #include <time.h>

 #include <openacc.h>

 // Timing helpers shared by main(): timestruct holds one time stamp, gettime(&t) fills it,
 // and usec(t1, t2) yields the interval from t1 to t2.
 #if defined(_WIN32) || defined(_WIN64)
     #include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>
     #define timestruct clock_t
     #define gettime(a) (*(a) = clock())
     // Interval in clock() ticks. main() prints it as "ms".
     // NOTE(review): that label is only right where CLOCKS_PER_SEC == 1000 - confirm.
     // Arguments and the whole expansion are parenthesized so the macro is safe inside larger expressions.
     #define usec(t1,t2) ((t2) - (t1))
 #else
     #include <sys/time.h>
     #define gettime(a)  gettimeofday(a, NULL)
     // Interval in microseconds. The seconds difference is widened to long long before the
     // multiplication so it cannot overflow on platforms where time_t is 32 bits wide.
     #define usec(t1,t2) (((long long)(t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)
     typedef struct timeval timestruct;
 #endif

 // Returns x*x + y*y; main() uses it to fill the boundary values of both grids.
 // `static` gives the function internal linkage: a plain C99 `inline` definition does not
 // emit an external definition, so any call the compiler chooses not to inline would
 // otherwise fail at link time.
 static inline float uval(float x, float y)
 {
     return x * x + y * y;
 }

 // Jacobi iteration on a (row + 1) x (col + 1) grid, offloaded with OpenACC.
 // A single data region keeps both grids resident on the device for all iterations:
 // u0 is copied in and out once, u1 is copied in once.
 // Returns 0 on success, 1 if the host grids cannot be allocated.
 int main()
 {
     // NOTE(review): the numeric literals for row, col and maxIter were missing from this
     // listing. The values below are placeholders chosen to be consistent with the logged
     // launch configuration (grid=32x1024, block=32x4) - confirm against the original code.
     const int row = 4096, col = 1024;
     const float height = 1.0, width = 2.0;
     const float hx = height / row, wy = width / col;
     const float fij = -4.0f;
     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));
     const int maxIter = 100;
     const int colPlus = col + 1;                       // row stride: col + 1 grid points per row
     const size_t gridBytes = sizeof(float) * (row + 1) * colPlus;

     float *restrict u0 = malloc(gridBytes);
     float *restrict u1 = malloc(gridBytes);
     // Declared out here on purpose: the accompanying notes report CUDA runtime errors
     // (700 / 715 / 719) when utemp is declared at the swap site or added with create(utemp).
     float *utemp = NULL;

     if (u0 == NULL || u1 == NULL)
     {
         fprintf(stderr, "Failed to allocate the solution grids.\n");
         free(u0);
         free(u1);
         return 1;
     }

     // Initialisation: boundary values go into both grids, the interior of u0 is zeroed.
     for (int ix = 0; ix <= row; ix++)
     {
         u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);
         u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
     }
     for (int jy = 0; jy <= col; jy++)
     {
         u0[jy] = u1[jy] = uval(0.0f, jy * wy);
         u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
     }
     for (int ix = 1; ix < row; ix++)
     {
         for (int jy = 1; jy < col; jy++)
             u0[ix*colPlus + jy] = 0.0f;
     }

     // Computation
     timestruct t1, t2;
     acc_init(acc_device_nvidia);
     gettime(&t1);
     // NOTE(review): u0 and u1 are swapped on the host after every iteration, so at region
     // exit the name u0 may refer to a different buffer than the one named in copy(...) at
     // region entry - confirm which array receives the copy-out if the result is ever read.
 #pragma acc data copy(u0[0:(row + 1) * colPlus]) copyin(u1[0:(row + 1) * colPlus])      // data construct outside the loop: one device data region spanning all iterations (kernels)
     {
         for (int iter = 0; iter < maxIter; iter++)
         {
 #pragma acc kernels present(u0[0:((row + 1) * colPlus)], u1[0:((row + 1) * colPlus)])   // tell each kernel launch that u0 and u1 are already on the device: no further copies
             {
 #pragma acc loop independent
                 for (int ix = 1; ix < row; ix++)
                 {
 #pragma acc loop independent
                     for (int jy = 1; jy < col; jy++)
                     {
                         // Five-point stencil: new value from the four neighbours in u0.
                         u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) +
                             hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
                     }
                 }
             }
             // Swap the roles of the two grids for the next iteration.
             utemp = u0, u0 = u1, u1 = utemp;
         }
     }
     gettime(&t2);

     long long timeElapse = usec(t1, t2);
     // %lld matches the long long argument (the listing used %ld, a format/argument mismatch).
 #if defined(_WIN32) || defined(_WIN64)
     printf("\nElapsed time: %13lld ms.\n", timeElapse);
 #else
     printf("\nElapsed time: %13lld us.\n", timeElapse);
 #endif

     free(u0);
     free(u1);
     acc_shutdown(acc_device_nvidia);
     //getchar();
     return 0;
 }

● 输出结果，win10 中运行结果，关闭 PGI_ACC_NOTIFY 后可以达到 67 ms

D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc

main:

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Generating copy(u0[:colPlus*(row+)])

         Generating copyin(u1[:colPlus*(row+)])

     , Generating present(u1[:colPlus*(row+)],u0[:colPlus*(row+)])

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

     , FMA (fused multiply-add) instruction(s) generated

uval:

     , FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe

launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4

...

launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4

Elapsed time:            ms.

● nvvp 结果，可见大部分时间都花在了初始化设备上，计算用时已经比较少了，拷贝用时更少，只有开头和结尾有一点

● 输出结果，Ubuntu 中运行结果，含开启 PGI_ACC_TIME 的数据

cuan@CUAN:~$ pgcc data.c -o data.exe -c99 -Minfo -acc

main:

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Generating copy(u0[:colPlus*(row+)])

         Generating copyin(u1[:colPlus*(row+)])

     , Generating present(utemp[:],u1[:colPlus*(row+)],u0[:colPlus*(row+)])

         FMA (fused multiply-add) instruction(s) generated

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

uval:

     , FMA (fused multiply-add) instruction(s) generated

cuan@CUAN:~$ ./data.exe

launch CUDA kernel  file=/home/cuan/data.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4

...

launch CUDA kernel  file=/home/cuan/data.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4

Elapsed time:          us.

Accelerator Kernel Timing data

/home/cuan/data.c

  main  NVIDIA  devicenum=

    time(us): ,

    : data region reached  times

        : data copyin transfers:

             device time(us): total=, max=, min=, avg=,

        : data copyout transfers:

             device time(us): total=, max=, min= avg=

    : data region reached  times

    : compute region reached  times

        : kernel launched  times

            grid: [32x1024]  block: [32x4]

             device time(us): total=, max= min= avg=

            elapsed time(us): total=, max=, min= avg=

● 将 utemp 放到了更里一层循环，报运行时错误 715 或 719，参考【https://stackoverflow.com/questions/41366915/openacc-create-data-while-running-inside-a-kernels】，大意是关于内存泄露

D:\Code\OpenACC>main.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=74 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
call to cuStreamSynchronize returned error 715: Illegal instruction

call to cuMemFreeHost returned error 715: Illegal instruction

D:\Code\OpenACC>main.exe

launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4

launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=

Failing in Thread:

call to cuStreamSynchronize returned error : Launch failed (often invalid pointer dereference)

Failing in Thread:

call to cuMemFreeHost returned error : Launch failed (often invalid pointer dereference)

● 尝试在 data 构件中添加 create(utemp) 或在交换指针的位置临时定义 float *utemp 都会报运行时错误 700

D:\Code\OpenACC>main.exe

launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4

launch CUDA kernel  file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=

Failing in Thread:

call to cuStreamSynchronize returned error : Illegal address during kernel execution

Failing in Thread:

call to cuMemFreeHost returned error : Illegal address during kernel execution

▶ 恢复错误控制，添加 reduction 导语用来计量改进量

 #include <stdio.h>

 #include <stdlib.h>

 #include <math.h>

 #include <time.h>

 #include <openacc.h>

 // Timing helpers shared by main(): timestruct holds one time stamp, gettime(&t) fills it,
 // and usec(t1, t2) yields the interval from t1 to t2.
 #if defined(_WIN32) || defined(_WIN64)
 #include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>
 #define timestruct clock_t
 #define gettime(a) (*(a) = clock())
 // Interval in clock() ticks. main() prints it as "ms".
 // NOTE(review): that label is only right where CLOCKS_PER_SEC == 1000 - confirm.
 // Arguments and the whole expansion are parenthesized so the macro is safe inside larger expressions.
 #define usec(t1,t2) ((t2) - (t1))
 #else
 #include <sys/time.h>
 #define gettime(a)  gettimeofday(a, NULL)
 // Interval in microseconds. The seconds difference is widened to long long before the
 // multiplication so it cannot overflow on platforms where time_t is 32 bits wide.
 #define usec(t1,t2) (((long long)(t2).tv_sec - (t1).tv_sec) * 1000000 + (t2).tv_usec - (t1).tv_usec)
 typedef struct timeval timestruct;
 #endif
 // max() is used by the reduction in main(). It was previously defined only in the
 // non-Windows branch (presumably a system header supplied it on Windows); defining it
 // here under #ifndef makes it available everywhere without clashing with such a header.
 #ifndef max
 #define max(x,y) ((x) > (y) ? (x) : (y))
 #endif

 // Returns x*x + y*y; main() uses it to fill the boundary values of both grids.
 // `static` gives the function internal linkage: a plain C99 `inline` definition does not
 // emit an external definition, so any call the compiler chooses not to inline would
 // otherwise fail at link time.
 static inline float uval(float x, float y)
 {
     return x * x + y * y;
 }

 // Jacobi iteration on a (row + 1) x (col + 1) grid, offloaded with OpenACC, with the
 // per-iteration change measured through a max reduction and printed each iteration.
 // A single data region keeps both grids resident on the device for all iterations.
 // Returns 0 on success, 1 if the host grids cannot be allocated.
 int main()
 {
     // NOTE(review): the numeric literals for row, col and maxIter were missing from this
     // listing. The values below are placeholders chosen to be consistent with the logged
     // launch configuration (grid=32x1024, block=32x4) - confirm against the original code.
     const int row = 4096, col = 1024;
     const float height = 1.0, width = 2.0;
     const float hx = height / row, wy = width / col;
     const float fij = -4.0f;
     // errControl is 0.0f and uerr is never negative, so the early exit below never fires
     // and the loop always runs maxIter iterations.
     const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2)), errControl = 0.0f;
     const int maxIter = 100;
     const int colPlus = col + 1;                       // row stride: col + 1 grid points per row
     const size_t gridBytes = sizeof(float) * (row + 1) * colPlus;

     float *restrict u0 = malloc(gridBytes);
     float *restrict u1 = malloc(gridBytes);
     // Declared out here on purpose: the accompanying notes report CUDA runtime errors
     // (700 / 715 / 719) when utemp is declared at the swap site or added with create(utemp).
     float *utemp = NULL;

     if (u0 == NULL || u1 == NULL)
     {
         fprintf(stderr, "Failed to allocate the solution grids.\n");
         free(u0);
         free(u1);
         return 1;
     }

     // Initialisation: boundary values go into both grids, the interior of u0 is zeroed.
     for (int ix = 0; ix <= row; ix++)
     {
         u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);
         u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
     }
     for (int jy = 0; jy <= col; jy++)
     {
         u0[jy] = u1[jy] = uval(0.0f, jy * wy);
         u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
     }
     for (int ix = 1; ix < row; ix++)
     {
         for (int jy = 1; jy < col; jy++)
             u0[ix*colPlus + jy] = 0.0f;
     }

     // Computation
     timestruct t1, t2;
     acc_init(acc_device_nvidia);
     gettime(&t1);
     // NOTE(review): u0 and u1 are swapped on the host after every iteration, so at region
     // exit the name u0 may refer to a different buffer than the one named in copy(...) at
     // region entry - confirm which array receives the copy-out if the result is ever read.
 #pragma acc data copy(u0[0:(row + 1) * colPlus]) copyin(u1[0:(row + 1) * colPlus])
     {
         for (int iter = 0; iter < maxIter; iter++)
         {
             float uerr = 0.0f;                                                              // must be declared before the kernels block, otherwise its value is undefined after the block (the book gets this wrong)
 #pragma acc kernels present(u0[0:(row + 1) * colPlus]) present(u1[0:(row + 1) * colPlus])
             {
 #pragma acc loop independent reduction(max:uerr)                                            // reduction clause: collect the largest per-point change of this iteration
                 for (int ix = 1; ix < row; ix++)
                 {
                     for (int jy = 1; jy < col; jy++)
                     {
                         // Five-point stencil: new value from the four neighbours in u0.
                         u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) +
                             hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
                         uerr = max(uerr, fabs(u0[ix * colPlus + jy] - u1[ix * colPlus + jy]));
                     }
                 }
             }
             printf("\niter = %d, uerr = %e\n", iter, uerr);
             if (uerr < errControl)
             {
                 break;
             }
             // Swap the roles of the two grids for the next iteration.
             utemp = u0, u0 = u1, u1 = utemp;
         }
     }
     gettime(&t2);

     long long timeElapse = usec(t1, t2);
     // %lld matches the long long argument (the listing used %ld, a format/argument mismatch).
 #if defined(_WIN32) || defined(_WIN64)
     printf("\nElapsed time: %13lld ms.\n", timeElapse);
 #else
     printf("\nElapsed time: %13lld us.\n", timeElapse);
 #endif

     free(u0);
     free(u1);
     acc_shutdown(acc_device_nvidia);
     //getchar();
     return 0;
 }

● 输出结果，win10 相比没有错误控制的情形整整慢了一倍，nvvp 没有明显变化，不放上来了

D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc

main:

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Generating copy(u0[:colPlus*(row+)])

         Generating copyin(u1[:colPlus*(row+)])

     , Generating present(u0[:colPlus*(row+)])

         Generating implicit copy(uerr)

         Generating present(u1[:colPlus*(row+)])

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

             Generating reduction(max:uerr)                                 // 多了 reduction 的信息

     , FMA (fused multiply-add) instruction(s) generated

uval:

     , FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC>main.exe

iter = , uerr = 2.496107e+00

...

iter = , uerr = 2.202189e-02

Elapsed time:            ms.

● 输出结果，Ubuntu

cuan@CUAN:~$ pgcc data+reduction.c -o data+reduction.exe -c99 -Minfo -acc

main:

     , Memory zero idiom, loop replaced by call to __c_mzero4

     , Generating copyin(u1[:colPlus*(row+)])

         Generating copy(u0[:colPlus*(row+)])

     , FMA (fused multiply-add) instruction(s) generated

     , Generating present(u0[:colPlus*(row+)])

         Generating implicit copy(uerr)

         Generating present(u1[:colPlus*(row+)])

     , Loop is parallelizable

     , Loop is parallelizable

         Generating Tesla code

         , #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */

         , #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */

             Generating reduction(max:uerr)

uval:

     , FMA (fused multiply-add) instruction(s) generated

cuan@CUAN:~$ ./data+reduction.exe

launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 shared memory=

launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block= shared memory=

iter = , uerr = 2.496107e+00

launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 shared memory=

launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block= shared memory=

...

iter = , uerr = 2.214956e-02

launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 shared memory=

launch CUDA kernel  file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block= shared memory=

iter = , uerr = 2.202189e-02

Elapsed time:         us.

Accelerator Kernel Timing data

/home/cuan/data+reduction.c

  main  NVIDIA  devicenum=

    time(us): ,

    : data region reached  times

        : data copyin transfers:

             device time(us): total=, max=, min=, avg=,

        : data copyout transfers:

             device time(us): total=, max=, min= avg=

    : compute region reached  times

        : kernel launched  times

            grid: [32x1024]  block: [32x4]

             device time(us): total=, max= min= avg=

            elapsed time(us): total=, max=, min= avg=

        : reduction kernel launched  times

            grid: []  block: []

             device time(us): total=, max= min= avg=

            elapsed time(us): total=, max=, min= avg=

    : data region reached  times

        : data copyin transfers:

             device time(us): total= max= min= avg=

        : data copyout transfers:

             device time(us): total= max= min= avg=

▶ 尝试在计算的循环导语上加上 collapse(2) 子句，意思是合并两个较小的循环为一个较大的循环。发现效果不显著，不放上来了

OpenACC 书上的范例代码（Jacobi 迭代），part 3的更多相关文章

OpenACC 书上的范例代码（Jacobi 迭代），part 2
▶ 使用Jacobi 迭代求泊松方程的数值解 ● 首次使用 OpenACC 进行加速,使用动态数组,去掉了误差控制 #include <stdio.h> #include <stdl ...
OpenACC 书上的范例代码（Jacobi 迭代），part 1
▶ 使用Jacobi 迭代求泊松方程的数值解 ● 原始串行版本,运行时间 2272 ms #include <stdio.h> #include <stdlib.h> #inc ...
C#高级编程（第9版） -C#5.0&.Net4.5.1 书上的示例代码下载链接
http://www.wrox.com/WileyCDA/WroxTitle/Professional-C-5-0-and-NET-4-5-1.productCd-1118833031,descCd- ...
uva 213 - Message Decoding (我认为我的方法要比书上少非常多代码，不保证好……)
#include<stdio.h> #include<math.h> #include<string.h> char s[250]; char a[10][250] ...
java代码流类。。程序怎么跟书上的结果不一样？？？
总结:这个程序很容易懂.的那是这个结果我觉得有问题啊..怎么“stop”后,输出的内容是输入过的呢? 应该是没有关系的呀,与输入的值是不同的....怎么书上运行的结果和我的不一样啊 package c ...
面试必备：高频算法题终章「图文解析 + 范例代码」之矩阵二进制 + 位运算 + LRU 合集
Attention 秋招接近尾声,我总结了牛客.WanAndroid 上,有关笔试面经的帖子中出现的算法题,结合往年考题写了这一系列文章,所有文章均与 LeetCode 进行核对.测试.欢迎食用本 ...
JAVA理解逻辑程序的书上全部重要的习题
今天随便翻翻看以前学过JAVA理解逻辑程序的书上全部练习,为了一些刚学的学弟学妹,所以呢就把这些作为共享了. 希望对初学的学弟学妹有所帮助! 例子:升级“我行我素购物管理系统”,实现购物结算功能代码 ...
OK 开始实践书上的项目一：即使标记
OK 开始实践书上的项目一:及时标记然而....又得往前面看啦! ----------------------我是分割线------------------------ 代码改变世界
关于node的基础理论，书上看来的
最近看了一本书,说了一些Node.js的东西,现在来记录一下,让自己记得更牢靠一点. 在书上,是这样介绍的:Node.js模型是源于Ruby的Event Machine 和 Python的Twiste ...

随机推荐

Immutable集合
转自:https://blog.csdn.net/michaellufhl/article/details/6314333 大家都知道JDK提供了Collections.UnmodifiableLis ...
【JVM】jvm至jstack命令
一.介绍 jstack是java虚拟机自带的一种堆栈跟踪工具.jstack用于打印出给定的java进程ID或core file或远程调试服务的Java堆栈信息,如果是在64位机器上,需要指定选项&qu ...
day 2克隆虚拟机器minimal需要注意的问题和制作本地yum源和常用的Linux的命令
------- 克隆bee2 PS:因为复制机器后,又多了一个网卡eth1.本来只有一个网卡eth0,下面是解决方案. 解决克隆后eth0不见的问题 1.直接修改vi /etc/sysconfig/ ...
smarty中调用php内置函数
http://blog.csdn.net/clevercode/article/details/50373633
第一章 spring起步
点击网址 http://start.spring.io/ 就可以获得spring-boot的项目结构. 如下: 将项目解压到自己的项目中,然后找到mian函数所在启动类.运行.出现: 表示已经运行了 ...
ThinkPHP 5 insertall 只插入最后一条数据的问题
问题来源: Steed 2018/1/5 11:30:25 @流年我用fetchsql查看的sql,发现数据都是最后一条 Steed 2018/1/5 11:30:39 我也不知道是什么问题,我打印 ...
使用 phpStudy + VSCODE 进行 PHP 断点调试
使用 phpStudy + VSCODE 进行 PHP 断点调试自己摸索过程有点曲折,但还是配置成功了,现分享如下. 原料 phpStudy 2018 VSCODE 配置过程安装 phpStudy ...
hadoop之参数调优
一. hdfs-site.xml 配置文件 1. dfs.blocksize 参数:hadoop文件块大小描述:新文件的默认块大小,以字节为单位,默认 134217728 字节.可以使用以下后缀(大小 ...
HttpFileCollection类
最近在学HttpRequest类搞文件上传的时候看到Request.Files返回了HttpFileCollection 这个类的一个对象,这个类用于获取浏览器上传的文件集合,在文件上传的时候可以通过 ...
VBox修改uuid
1.使用VBoxManage命令时,需要先在命令行中切换到VirtualBox的安装目录下 2.修改vdi的uuid:VBoxManage internalcommands sethduuid D: ...

OpenACC 书上的范例代码（Jacobi 迭代），part 3

OpenACC 书上的范例代码（Jacobi 迭代），part 3的更多相关文章

随机推荐

热门专题