▶ 使用Jacobi 迭代求泊松方程的数值解

● 使用 data 构件,强行要求 u0 仅拷入和拷出 GPU 各一次,u1 仅拷入GPU 一次

 /*
  * Jacobi iteration for a Poisson problem on a (row + 1) x (col + 1) grid,
  * offloaded with OpenACC.
  *
  * An enclosing `data` construct keeps both grids resident on the device for
  * the whole iteration loop: u0 is copied in at entry and out at exit, u1 is
  * only copied in.  Each `kernels` region declares the arrays `present`, so no
  * transfer is requested per iteration.
  */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <openacc.h>

#if defined(_WIN32) || defined(_WIN64)
/* NOTE(review): machine-specific absolute path kept from the original listing;
 * the timer below only uses clock() from <time.h> -- confirm it is needed. */
#include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>
#define timestruct clock_t
#define gettime(a) (*(a) = clock())
/* Difference in clock() ticks; reported as "ms" by main(). */
#define usec(t1,t2) ((t2) - (t1))
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a, NULL)
/* Difference between two struct timeval values, in microseconds. */
#define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000LL + (t2).tv_usec - (t1).tv_usec)
typedef struct timeval timestruct;
#endif

/* Value x^2 + y^2, used to fill the boundary of the grid. */
static inline float uval(float x, float y)
{
    return x * x + y * y;
}

int main()
{
    /* NOTE(review): the integer literals were lost from the published listing.
     * 4096 x 1024 is inferred from the logged launch shape
     * (grid=32x1024, block=32x4); maxIter and the iteration start value are
     * guesses -- confirm against the original source. */
    const int row = 4096, col = 1024;
    const float height = 1.0, width = 2.0;
    const float hx = height / row, wy = width / col;
    const float fij = -4.0f;
    const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2));
    const int maxIter = 100;
    const int colPlus = col + 1;                    /* row stride of the flattened grid */
    const size_t cells = (size_t)(row + 1) * colPlus;

    float *restrict u0 = malloc(sizeof *u0 * cells);
    float *restrict u1 = malloc(sizeof *u1 * cells);
    float *utemp = NULL;
    if (u0 == NULL || u1 == NULL)
    {
        fprintf(stderr, "Failed to allocate %zu grid cells.\n", cells);
        free(u0);
        free(u1);
        return EXIT_FAILURE;
    }

    /* Initialisation: boundary values in both buffers (left/right columns,
     * then top/bottom rows). */
    for (int ix = 0; ix <= row; ix++)
    {
        u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);
        u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
    }
    for (int jy = 0; jy <= col; jy++)
    {
        u0[jy] = u1[jy] = uval(0.0f, jy * wy);
        u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
    }
    /* Interior of u0 starts at zero; the interior of u1 is written by the
     * first sweep before it is ever read. */
    for (int ix = 1; ix < row; ix++)
    {
        for (int jy = 1; jy < col; jy++)
            u0[ix*colPlus + jy] = 0.0f;
    }

    /* Computation */
    timestruct t1, t2;
    acc_init(acc_device_nvidia);
    gettime(&t1);
    /* data construct outside the iteration loop: one device data region shared
     * by every kernel launch. */
#pragma acc data copy(u0[0:(row + 1) * colPlus]) copyin(u1[0:(row + 1) * colPlus])
    {
        for (int iter = 0; iter < maxIter; iter++)
        {
            /* Declare u0 and u1 as already present for every kernel launch so
             * that they are not copied again. */
#pragma acc kernels present(u0[0:((row + 1) * colPlus)], u1[0:((row + 1) * colPlus)])
            {
#pragma acc loop independent
                for (int ix = 1; ix < row; ix++)
                {
#pragma acc loop independent
                    for (int jy = 1; jy < col; jy++)
                    {
                        /* Five-point stencil: interior indices only, so the
                         * +/-1 neighbours stay inside the grid. */
                        u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) +
                            hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
                    }
                }
            }
            /* Swap the host pointers so the next sweep reads the new values. */
            utemp = u0, u0 = u1, u1 = utemp;
        }
    }
    gettime(&t2);

    long long timeElapse = usec(t1, t2);
    /* %lld matches long long; %ld is wrong where long is 32 bits (Windows). */
#if defined(_WIN32) || defined(_WIN64)
    printf("\nElapsed time: %13lld ms.\n", timeElapse);
#else
    printf("\nElapsed time: %13lld us.\n", timeElapse);
#endif
    free(u0);
    free(u1);
    acc_shutdown(acc_device_nvidia);
    //getchar();
    return 0;
}

● 输出结果,win10 中运行结果,关闭 PGI_ACC_NOTIFY 后可以达到 67 ms

D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc
main:
, Memory zero idiom, loop replaced by call to __c_mzero4
, Generating copy(u0[:colPlus*(row+)])
Generating copyin(u1[:colPlus*(row+)])
, Generating present(u1[:colPlus*(row+)],u0[:colPlus*(row+)])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
, FMA (fused multiply-add) instruction(s) generated
uval:
, FMA (fused multiply-add) instruction(s) generated D:\Code\OpenACC>main.exe
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 ... launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 Elapsed time: ms.

● nvvp 结果,可见大部分时间都花在了初始化设备上,计算用时已经比较少了,拷贝用时更少,只有开头和结尾有一点

● 输出结果,Ubuntu 中运行结果,含开启 PGI_ACC_TIME 的数据

cuan@CUAN:~$ pgcc data.c -o data.exe -c99 -Minfo -acc
main:
, Memory zero idiom, loop replaced by call to __c_mzero4
, Generating copy(u0[:colPlus*(row+)])
Generating copyin(u1[:colPlus*(row+)])
, Generating present(utemp[:],u1[:colPlus*(row+)],u0[:colPlus*(row+)])
FMA (fused multiply-add) instruction(s) generated
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
uval:
, FMA (fused multiply-add) instruction(s) generated
cuan@CUAN:~$ ./data.exe
launch CUDA kernel file=/home/cuan/data.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 ... launch CUDA kernel file=/home/cuan/data.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 Elapsed time: us. Accelerator Kernel Timing data
/home/cuan/data.c
main NVIDIA devicenum=
time(us): ,
: data region reached times
: data copyin transfers:
device time(us): total=, max=, min=, avg=,
: data copyout transfers:
device time(us): total=, max=, min= avg=
: data region reached times
: compute region reached times
: kernel launched times
grid: [32x1024] block: [32x4]
device time(us): total=, max= min= avg=
elapsed time(us): total=, max=, min= avg=

● 将 utemp(交换指针用的临时变量)放到了更里一层循环,报运行时错误 715 或 719,参考【https://stackoverflow.com/questions/41366915/openacc-create-data-while-running-inside-a-kernels】,大意是关于内存泄露

D:\Code\OpenACC>main.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=74 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
call to cuStreamSynchronize returned error 715: Illegal instruction call to cuMemFreeHost returned error 715: Illegal instruction D:\Code\OpenACC>main.exe
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Failing in Thread:
call to cuStreamSynchronize returned error : Launch failed (often invalid pointer dereference) Failing in Thread:
call to cuMemFreeHost returned error : Launch failed (often invalid pointer dereference)

● 尝试 在 data 构件中添加 create(utemp) 或在交换指针的位置临时定义 float *utemp 都会报运行时错误 700

D:\Code\OpenACC>main.exe
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4
launch CUDA kernel file=D:\Code\OpenACC\main.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block=
Failing in Thread:
call to cuStreamSynchronize returned error : Illegal address during kernel execution Failing in Thread:
call to cuMemFreeHost returned error : Illegal address during kernel execution

▶ 恢复错误控制,添加 reduction 子句用来计量改进量

 /*
  * Jacobi iteration for a Poisson problem on a (row + 1) x (col + 1) grid,
  * offloaded with OpenACC, with error control restored.
  *
  * Same data layout as the data-construct-only version; in addition each sweep
  * computes the largest point-wise change `uerr` with a max reduction and the
  * loop stops early once uerr drops below errControl.
  */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <openacc.h>

#if defined(_WIN32) || defined(_WIN64)
/* NOTE(review): machine-specific absolute path kept from the original listing;
 * the timer below only uses clock() from <time.h> -- confirm it is needed. */
#include <C:\Program Files\PGI\win64\19.4\include\wrap\sys\timeb.h>
#define timestruct clock_t
#define gettime(a) (*(a) = clock())
/* Difference in clock() ticks; reported as "ms" by main(). */
#define usec(t1,t2) ((t2) - (t1))
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a, NULL)
/* Difference between two struct timeval values, in microseconds. */
#define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000000LL + (t2).tv_usec - (t1).tv_usec)
typedef struct timeval timestruct;
#endif

/* The original defined max only in the non-Windows branch; guard it so the
 * macro exists on every platform without clashing with a vendor definition. */
#ifndef max
#define max(x,y) ((x) > (y) ? (x) : (y))
#endif

/* Value x^2 + y^2, used to fill the boundary of the grid. */
static inline float uval(float x, float y)
{
    return x * x + y * y;
}

int main()
{
    /* NOTE(review): the integer literals were lost from the published listing.
     * 4096 x 1024 is inferred from the logged launch shape
     * (grid=32x1024, block=32x4); maxIter and the iteration start value are
     * guesses -- confirm against the original source. */
    const int row = 4096, col = 1024;
    const float height = 1.0, width = 2.0;
    const float hx = height / row, wy = width / col;
    const float fij = -4.0f;
    /* errControl is 0, and uerr is a maximum of absolute values, so the early
     * exit below never fires and all maxIter sweeps run. */
    const float hx2 = hx * hx, wy2 = wy * wy, c1 = hx2 * wy2, c2 = 1.0f / (2.0 * (hx2 + wy2)), errControl = 0.0f;
    const int maxIter = 100;
    const int colPlus = col + 1;                    /* row stride of the flattened grid */
    const size_t cells = (size_t)(row + 1) * colPlus;

    float *restrict u0 = malloc(sizeof *u0 * cells);
    float *restrict u1 = malloc(sizeof *u1 * cells);
    float *utemp = NULL;
    if (u0 == NULL || u1 == NULL)
    {
        fprintf(stderr, "Failed to allocate %zu grid cells.\n", cells);
        free(u0);
        free(u1);
        return EXIT_FAILURE;
    }

    /* Initialisation: boundary values in both buffers (left/right columns,
     * then top/bottom rows). */
    for (int ix = 0; ix <= row; ix++)
    {
        u0[ix*colPlus + 0] = u1[ix*colPlus + 0] = uval(ix * hx, 0.0f);
        u0[ix*colPlus + col] = u1[ix*colPlus + col] = uval(ix*hx, col * wy);
    }
    for (int jy = 0; jy <= col; jy++)
    {
        u0[jy] = u1[jy] = uval(0.0f, jy * wy);
        u0[row*colPlus + jy] = u1[row*colPlus + jy] = uval(row*hx, jy * wy);
    }
    /* Interior of u0 starts at zero; the interior of u1 is written by the
     * first sweep before it is ever read. */
    for (int ix = 1; ix < row; ix++)
    {
        for (int jy = 1; jy < col; jy++)
            u0[ix*colPlus + jy] = 0.0f;
    }

    /* Computation */
    timestruct t1, t2;
    acc_init(acc_device_nvidia);
    gettime(&t1);
#pragma acc data copy(u0[0:(row + 1) * colPlus]) copyin(u1[0:(row + 1) * colPlus])
    {
        for (int iter = 0; iter < maxIter; iter++)
        {
            /* uerr must be declared before the kernels region; otherwise it is
             * undefined once the block is left (the book's listing gets this
             * wrong). */
            float uerr = 0.0f;
#pragma acc kernels present(u0[0:(row + 1) * colPlus]) present(u1[0:(row + 1) * colPlus])
            {
                /* reduction clause added to measure the per-sweep change. */
#pragma acc loop independent reduction(max:uerr)
                for (int ix = 1; ix < row; ix++)
                {
                    for (int jy = 1; jy < col; jy++)
                    {
                        /* Five-point stencil: interior indices only, so the
                         * +/-1 neighbours stay inside the grid. */
                        u1[ix*colPlus + jy] = (c1*fij + wy2 * (u0[(ix - 1)*colPlus + jy] + u0[(ix + 1)*colPlus + jy]) +
                            hx2 * (u0[ix*colPlus + jy - 1] + u0[ix*colPlus + jy + 1])) * c2;
                        /* fabsf keeps the comparison in single precision. */
                        uerr = max(uerr, fabsf(u0[ix * colPlus + jy] - u1[ix * colPlus + jy]));
                    }
                }
            }
            printf("\niter = %d, uerr = %e\n", iter, uerr);
            if (uerr < errControl)
                break;
            /* Swap the host pointers so the next sweep reads the new values. */
            utemp = u0, u0 = u1, u1 = utemp;
        }
    }
    gettime(&t2);

    long long timeElapse = usec(t1, t2);
    /* %lld matches long long; %ld is wrong where long is 32 bits (Windows). */
#if defined(_WIN32) || defined(_WIN64)
    printf("\nElapsed time: %13lld ms.\n", timeElapse);
#else
    printf("\nElapsed time: %13lld us.\n", timeElapse);
#endif
    free(u0);
    free(u1);
    acc_shutdown(acc_device_nvidia);
    //getchar();
    return 0;
}

● 输出结果,win10 相比没有错误控制的情形整整慢了一倍,nvvp 没有明显变化,不放上来了

D:\Code\OpenACC>pgcc main.c -o main.exe -c99 -Minfo -acc
main:
, Memory zero idiom, loop replaced by call to __c_mzero4
, Generating copy(u0[:colPlus*(row+)])
Generating copyin(u1[:colPlus*(row+)])
, Generating present(u0[:colPlus*(row+)])
Generating implicit copy(uerr)
Generating present(u1[:colPlus*(row+)])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
Generating reduction(max:uerr) // 多了 reduction 的信息
, FMA (fused multiply-add) instruction(s) generated
uval:
, FMA (fused multiply-add) instruction(s) generated D:\Code\OpenACC>main.exe iter = , uerr = 2.496107e+00 ... iter = , uerr = 2.202189e-02 Elapsed time: ms.

● 输出结果,Ubuntu

cuan@CUAN:~$ pgcc data+reduction.c -o data+reduction.exe -c99 -Minfo -acc
main:
, Memory zero idiom, loop replaced by call to __c_mzero4
, Generating copyin(u1[:colPlus*(row+)])
Generating copy(u0[:colPlus*(row+)])
, FMA (fused multiply-add) instruction(s) generated
, Generating present(u0[:colPlus*(row+)])
Generating implicit copy(uerr)
Generating present(u1[:colPlus*(row+)])
, Loop is parallelizable
, Loop is parallelizable
Generating Tesla code
, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
Generating reduction(max:uerr)
uval:
, FMA (fused multiply-add) instruction(s) generated
cuan@CUAN:~$ ./data+reduction.exe
launch CUDA kernel file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 shared memory=
launch CUDA kernel file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block= shared memory= iter = , uerr = 2.496107e+00
launch CUDA kernel file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 shared memory=
launch CUDA kernel file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block= shared memory= ... iter = , uerr = 2.214956e-02
launch CUDA kernel file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid=32x1024 block=32x4 shared memory=
launch CUDA kernel file=/home/cuan/data+reduction.c function=main line= device= threadid= num_gangs= num_workers= vector_length= grid= block= shared memory= iter = , uerr = 2.202189e-02 Elapsed time: us. Accelerator Kernel Timing data
/home/cuan/data+reduction.c
main NVIDIA devicenum=
time(us): ,
: data region reached times
: data copyin transfers:
device time(us): total=, max=, min=, avg=,
: data copyout transfers:
device time(us): total=, max=, min= avg=
: compute region reached times
: kernel launched times
grid: [32x1024] block: [32x4]
device time(us): total=, max= min= avg=
elapsed time(us): total=, max=, min= avg=
: reduction kernel launched times
grid: [] block: []
device time(us): total=, max= min= avg=
elapsed time(us): total=, max=, min= avg=
: data region reached times
: data copyin transfers:
device time(us): total= max= min= avg=
: data copyout transfers:
device time(us): total= max= min= avg=

▶ 尝试在计算的循环导语上加上 collapse(2) 子句,意思是合并两个较小的循环为一个较大的循环。发现效果不显著,不放上来了

OpenACC 书上的范例代码(Jacobi 迭代),part 3的更多相关文章

  1. OpenACC 书上的范例代码(Jacobi 迭代),part 2

    ▶ 使用Jacobi 迭代求泊松方程的数值解 ● 首次使用 OpenACC 进行加速,使用动态数组,去掉了误差控制 #include <stdio.h> #include <stdl ...

  2. OpenACC 书上的范例代码(Jacobi 迭代),part 1

    ▶ 使用Jacobi 迭代求泊松方程的数值解 ● 原始串行版本,运行时间 2272 ms #include <stdio.h> #include <stdlib.h> #inc ...

  3. C#高级编程(第9版) -C#5.0&.Net4.5.1 书上的示例代码下载链接

    http://www.wrox.com/WileyCDA/WroxTitle/Professional-C-5-0-and-NET-4-5-1.productCd-1118833031,descCd- ...

  4. uva 213 - Message Decoding (我认为我的方法要比书上少非常多代码,不保证好……)

    #include<stdio.h> #include<math.h> #include<string.h> char s[250]; char a[10][250] ...

  5. java代码流类。。程序怎么跟书上的结果不一样???

    总结:这个程序很容易懂.的那是这个结果我觉得有问题啊..怎么“stop”后,输出的内容是输入过的呢? 应该是没有关系的呀,与输入的值是不同的....怎么书上运行的结果和我的不一样啊 package c ...

  6. 面试必备:高频算法题终章「图文解析 + 范例代码」之 矩阵 二进制 + 位运算 + LRU 合集

    Attention 秋招接近尾声,我总结了 牛客.WanAndroid 上,有关笔试面经的帖子中出现的算法题,结合往年考题写了这一系列文章,所有文章均与 LeetCode 进行核对.测试.欢迎食用 本 ...

  7. JAVA理解逻辑程序的书上全部重要的习题

    今天随便翻翻看以前学过JAVA理解逻辑程序的书上全部练习,为了一些刚学的学弟学妹,所以呢就把这些作为共享了. 希望对初学的学弟学妹有所帮助! 例子:升级“我行我素购物管理系统”,实现购物结算功能 代码 ...

  8. OK 开始实践书上的项目一:即使标记

    OK 开始实践书上的项目一:及时标记 然而....又得往前面看啦! ----------------------我是分割线------------------------ 代码改变世界

  9. 关于node的基础理论,书上看来的

    最近看了一本书,说了一些Node.js的东西,现在来记录一下,让自己记得更牢靠一点. 在书上,是这样介绍的:Node.js模型是源于Ruby的Event Machine 和 Python的Twiste ...

随机推荐

  1. hibernate连接oracle12c数据库报:java.sql.SQLException: ORA-01017: 用户名/口令无效; 登录被拒绝。(用户名/口令在oracle客户端以及cmd命令都能登入)

    报错信息: 2017-09-22 15:40:07,354 WARN [org.hibernate.cfg.SettingsFactory] - Could not obtain connection ...

  2. 【java规则引擎】《Drools7.0.0.Final规则引擎教程》第3章 3.2 KIE API解析

    转载至:https://blog.csdn.net/wo541075754/article/details/75004575 3.2.4 KieServices 该接口提供了很多方法,可以通过这些方法 ...

  3. 玩转ptrace (一)

    转自http://www.cnblogs.com/catch/p/3476280.html [本文翻译自这里: http://www.linuxjournal.com/article/6100?pag ...

  4. 如何调优JVM

    堆设置 -Xmx3550m:设置JVM最大堆内存 为3550M. -Xms3550m:设置JVM初始堆内存 为3550M.此值可以设置与-Xmx相同,以避免每次垃圾回收完成后JVM重新分配内存. -X ...

  5. Sencha Touch+PhoneGap打造超级奶爸之喂养记(一) 源码免费提供(转)

    起源 非常高兴我的宝宝健康平安的出生了.对于初次做奶爸的我,喜悦过后,面临着各中担心,担心宝宝各项指标是否正常.最初几天都是在医院待着,从出生那一天开始,护士妹妹隔一段时间就会来问宝宝的喂奶,大小便, ...

  6. java基本数据类型和引用类型

    这些基本的数据类型是点不出东西来的 3种引用类型  类class 接口interface 数组array 第一个  : 类 Integer  Long  Boolean  Byte  Characte ...

  7. sql server 数字字符串的排序

    假如我们有这样的数据格式1#XXXXX 20#CCCCC等的一系列数据那么我们如何可以排序出1 ,2,3,4,5,6,7.......这样的数据呢 我们知道整数是可以排序成这样的格式的,但是字符串就不 ...

  8. matplotlib y轴标注显示不全以及subplot调整的问题

    matplotlib y轴标注显示不全以及subplot调整的问题 问题: 我想在y轴显示的标注太长,想把它变成两行显示,发现生成的图形只显示的第二行的字,把第一行的字挤出去了 想要的是显示两行这样子 ...

  9. 谈windows中的句柄

    谈windows中的句柄   每当一个进程打开一个对象,系统就返回一个句柄作为凭证,由此可以想到,句柄是依赖于具体的进程的,换句话说,句柄一定属于某个进程,以后在访问这个对象时就要使用这个凭证!   ...

  10. python下的类的部分特点

    #coding=utf-8 class data: def __init__(self): #构造函数 self.name=' def pp(self): print self.name class ...