Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 2

▶ 第四章，逐步优化了一个三维卷积计算的过程

● 基准代码

 #include <stdio.h>

 #include <stdlib.h>

 #include <string.h>

 #include <math.h>

 #include <time.h>

 #include <sys/time.h>

 #include <omp.h>

 #include <assert.h>

 #include <sys/mman.h>

 #define REAL float

 #define NX (64)

 #ifndef M_PI

     #define M_PI (3.1415926535897932384626)

 #endif

 // Fill a grid with a closed-form field evaluated at a given time.
 // main() calls this with time = 0 to build the initial condition and with
 // time = count * dt to build the reference result the kernel is compared to.
 //
 // buff        destination grid; point (jx, jy, jz) is stored at
 //             (jz * ny + jy) * NX + jx, i.e. the row pitch is the compile-time
 //             constant NX, not nx
 // nx, ny, nz  number of grid points filled along x, y and z
 // kx, ky, kz  wave numbers of the cosine term along each axis
 // dx, dy, dz  grid spacing along each axis
 // kappa       coefficient in the exponential decay factor
 // time        time at which the field is evaluated
 void init(REAL *buff, const int nx, const int ny, const int nz, const REAL kx, const REAL ky, const REAL kz,
     const REAL dx, const REAL dy, const REAL dz, const REAL kappa, const REAL time)
 {
     // Per-axis decay factors; they depend only on time, so compute them once.
     const REAL ax = exp(-kappa * time * (kx * kx));
     const REAL ay = exp(-kappa * time * (ky * ky));
     const REAL az = exp(-kappa * time * (kz * kz));

     for (int jz = 0; jz < nz; jz++)
     {
         for (int jy = 0; jy < ny; jy++)
         {
             for (int jx = 0; jx < nx; jx++)
             {
                 const int j = (jz * ny + jy) * NX + jx;

                 // The +0.5 samples the field at the centre of each cell.
                 const REAL x = dx * ((REAL)(jx + 0.5));
                 const REAL y = dy * ((REAL)(jy + 0.5));
                 const REAL z = dz * ((REAL)(jz + 0.5));

                 buff[j] = (REAL)0.125 * (1.0 - ax * cos(kx * x)) * (1.0 - ay * cos(ky * y)) * (1.0 - az * cos(kz * z));
             }
         }
     }
 }

 // Baseline (single-threaded) kernel: apply a 7-point stencil `count` times.
 //
 // f1, f2      input and output grids; their roles are swapped after every step.
 //             Only the local copies of the pointers are swapped, so after the
 //             call the newest data is in the caller's f2 when count is odd and
 //             in the caller's f1 when count is even.
 // nx, ny, nz  number of grid points along x, y and z; the row pitch of both
 //             grids is the compile-time constant NX
 // cw, ce      weights of the x-1 / x+1 neighbours
 // cn, cs      weights of the y-1 / y+1 neighbours
 // cb, ct      weights of the z-1 / z+1 neighbours
 // cc          weight of the centre point
 // dt          not used by the kernel; kept so all kernel variants share a signature
 // count       number of stencil applications
 void diffusion(REAL *f1, REAL *f2, int nx, int ny, int nz,
     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
 {
     for (int i = 0; i < count; ++i)
     {
         for (int z = 0; z < nz; z++)
         {
             for (int y = 0; y < ny; y++)
             {
                 for (int x = 0; x < nx; x++)
                 {
                     // On a boundary the missing neighbour is replaced by the centre point.
                     // The east test uses nx (the loop bound) rather than the pitch NX,
                     // so the last visited column never reads past the filled region.
                     const int c = (z * ny + y) * NX + x;
                     const int w = (x == 0) ? c : c - 1;
                     const int e = (x == nx - 1) ? c : c + 1;
                     const int n = (y == 0) ? c : c - NX;
                     const int s = (y == ny - 1) ? c : c + NX;
                     const int b = (z == 0) ? c : c - NX * ny;
                     const int t = (z == nz - 1) ? c : c + NX * ny;

                     f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e] + cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t];
                 }
             }
         }

         // The grid just written becomes the input of the next step.
         REAL *swap = f1;
         f1 = f2;
         f2 = swap;
     }
 }

 // Timer helper: returns the time reported by gettimeofday() as a number of
 // seconds, with the microsecond field contributing the fractional part.
 static double cur_second(void)
 {
     struct timeval now;
     gettimeofday(&now, NULL);

     const double whole_seconds = (double)now.tv_sec;
     const double fraction = (double)now.tv_usec / 1000000.0;
     return whole_seconds + fraction;
 }

 REAL accuracy(const REAL *b1, REAL *b2, const int len)              //计算两个数组的差距

 {

     REAL err = 0.0;

     for (int i = ; i < len; i++)

         err += (b1[i] - b2[i]) * (b1[i] - b2[i]);

     return (REAL)sqrt(err / len);

 }

 // Write nx * ny * nz elements of `f` to the file `out_path` as raw binary.
 //
 // f           first element to write; elements are written contiguously
 // nx, ny, nz  extents whose product is the number of elements written
 // out_path    path of the output file; an existing file is truncated
 //
 // Failing to open the file terminates the program, as the original assert did,
 // but the check now also works when NDEBUG is defined. A short write or a failed
 // close is reported on stderr and the function returns.
 void dump_result(const REAL *f, int nx, int ny, int nz, const char *out_path)
 {
     // "wb": the payload is binary, so text-mode newline translation must be avoided.
     FILE *out = fopen(out_path, "wb");
     if (out == NULL)
     {
         perror(out_path);
         exit(EXIT_FAILURE);
     }

     // Widen before multiplying so the element count cannot overflow int.
     const size_t cells = (size_t)nx * (size_t)ny * (size_t)nz;
     const size_t written = fwrite(f, sizeof *f, cells, out);

     // fclose flushes buffered data, so its result matters for a write stream.
     const int close_rc = fclose(out);

     if (written != cells || close_rc != 0)
         fprintf(stderr, "%s: incomplete write (%zu of %zu elements)\n", out_path, written, cells);
 }

 // Benchmark driver: builds the initial grid, runs the diffusion kernel `count`
 // times, compares the result with the closed-form field from init() and prints
 // elapsed time, MFlops, memory throughput and the RMS error.
 // Returns 0 on success and EXIT_FAILURE when a grid cannot be allocated.
 int main(int argc, char *argv[])
 {
     int nx = NX, ny = NX, nz = NX;

     // All three grids have the same size: NX is the row pitch used by init() and diffusion().
     const size_t cells = (size_t)NX * (size_t)ny * (size_t)nz;
     REAL *f1 = malloc(sizeof *f1 * cells);
     REAL *f2 = malloc(sizeof *f2 * cells);
     REAL *f3 = malloc(sizeof *f3 * cells);

     // malloc reports failure with NULL; MAP_FAILED is the failure value of mmap, not malloc.
     if (f1 == NULL || f2 == NULL || f3 == NULL)
     {
         fprintf(stderr, "failed to allocate the grids\n");
         free(f1);
         free(f2);
         free(f3);
         return EXIT_FAILURE;
     }

     REAL dx, dy, dz, kx, ky, kz;
     dx = dy = dz = 1.0 / nx;                // the domain has edge length 1.0
     kx = ky = kz = 2.0 * M_PI;

     REAL kappa = 0.1;
     REAL dt = 0.1 * dx * dx / kappa;
     int count = 0.1 / dt;                   // truncated to a whole number of steps

     init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, 0.0);

     // Stencil weights; cc makes the seven weights sum to 1.
     REAL ce, cw, cn, cs, ct, cb, cc;
     ce = cw = kappa * dt / (dx * dx);
     cn = cs = kappa * dt / (dy * dy);
     ct = cb = kappa * dt / (dz * dz);
     cc = 1.0 - (ce + cw + cn + cs + ct + cb);

     printf("Running diffusion kernel %d times\n", count);
     fflush(stdout);

     struct timeval time_b, time_e;
     gettimeofday(&time_b, NULL);
     diffusion(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, dt, count);
     gettimeofday(&time_e, NULL);

     // diffusion() swaps its two buffers after every step, so the newest data is
     // in f2 after an odd number of steps and in f1 after an even number.
     REAL *result = (count % 2) ? f2 : f1;

     //dump_result(result, nx, ny, nz, "diffusion_result.dat");

     init(f3, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, count * dt);            // reference result
     REAL err = accuracy(result, f3, nx * ny * nz);

     double elapsed_time = (time_e.tv_sec - time_b.tv_sec) + (time_e.tv_usec - time_b.tv_usec) * 1.0e-6;

     // 13 floating-point operations per point per step: 7 multiplies and 6 adds.
     REAL mflops = (nx * ny * nz) * 13.0 * count / elapsed_time * 1.0e-06;
     double thput = (nx * ny * nz) * sizeof(REAL) * 3.0 * count / elapsed_time * 1.0e-09;

     printf("Elapsed time : %.3f (s)\nFLOPS        : %.3f (MFlops)\n", elapsed_time, mflops);
     printf("Throughput   : %.3f (GB/s)\nAccuracy     : %e\n", thput, err);

     free(f1);
     free(f2);
     free(f3);
     return 0;
 }

■ 输出结果

Running diffusion kernel  times

Elapsed time : 177.015 (s)

FLOPS        : 252.276 (MFlops)

Throughput   : 0.233 (GB/s)

Accuracy     : 5.068947e-06

● 计算内核加入 OpenMP

 // OpenMP version of the kernel: apply a 7-point stencil `count` times, sharing
 // the (z, y) rows of every step among the threads of one parallel region.
 //
 // f1, f2      input and output grids; restrict tells the compiler they do not
 //             overlap. Their roles are swapped after every step, so the newest
 //             data is in f2 when count is odd and in f1 when count is even.
 // nx, ny, nz  number of grid points along x, y and z; the row pitch of both
 //             grids is the compile-time constant NX
 // cw, ce      weights of the x-1 / x+1 neighbours
 // cn, cs      weights of the y-1 / y+1 neighbours
 // cb, ct      weights of the z-1 / z+1 neighbours
 // cc          weight of the centre point
 // dt          not used by the kernel; kept so all kernel variants share a signature
 // count       number of stencil applications
 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
 {
     #pragma omp parallel                    // one parallel region spans all time steps
     {
         // Thread-private copies of the pointers: every thread performs the same swaps.
         REAL *f1_t = f1, *f2_t = f2;

         for (int i = 0; i < count; ++i)
         {
             // collapse(2) merges the z and y loops into one iteration space that is
             // divided among the threads. The implicit barrier at the end of the
             // worksharing loop keeps a thread from starting the next step early.
             #pragma omp for collapse(2)
             for (int z = 0; z < nz; z++)
             {
                 for (int y = 0; y < ny; y++)
                 {
                     for (int x = 0; x < nx; x++)
                     {
                         // On a boundary the missing neighbour is replaced by the centre point.
                         // The east test uses nx (the loop bound) rather than the pitch NX.
                         const int c = (z * ny + y) * NX + x;
                         const int w = (x == 0) ? c : c - 1;
                         const int e = (x == nx - 1) ? c : c + 1;
                         const int n = (y == 0) ? c : c - NX;
                         const int s = (y == ny - 1) ? c : c + NX;
                         const int b = (z == 0) ? c : c - NX * ny;
                         const int t = (z == nz - 1) ? c : c + NX * ny;

                         f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
                     }
                 }
             }

             // The grid just written becomes the input of the next step.
             REAL *swap = f1_t;
             f1_t = f2_t;
             f2_t = swap;
         }
     }
 }

■ 输出结果

Running diffusion kernel  times

Elapsed time : 2.936 (s)

FLOPS        : 15209.439 (MFlops)

Throughput   : 14.039 (GB/s)

Accuracy     : 4.789139e-06

● 保证向量化

 // OpenMP version of the kernel with the innermost loop explicitly marked for
 // vectorisation: apply a 7-point stencil `count` times.
 //
 // f1, f2      input and output grids; restrict tells the compiler they do not
 //             overlap. Their roles are swapped after every step, so the newest
 //             data is in f2 when count is odd and in f1 when count is even.
 // nx, ny, nz  number of grid points along x, y and z; the row pitch of both
 //             grids is the compile-time constant NX
 // cw, ce      weights of the x-1 / x+1 neighbours
 // cn, cs      weights of the y-1 / y+1 neighbours
 // cb, ct      weights of the z-1 / z+1 neighbours
 // cc          weight of the centre point
 // dt          not used by the kernel; kept so all kernel variants share a signature
 // count       number of stencil applications
 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
 {
     #pragma omp parallel
     {
         // Thread-private copies of the pointers: every thread performs the same swaps.
         REAL *f1_t = f1, *f2_t = f2;

         for (int i = 0; i < count; ++i)
         {
             // The z and y loops form one iteration space divided among the threads.
             #pragma omp for collapse(2)
             for (int z = 0; z < nz; z++)
             {
                 for (int y = 0; y < ny; y++)
                 {
                     // Intel compiler pragma that requests vectorisation of the x loop
                     // without the compiler having to prove that f1_t and f2_t are independent.
                     #pragma simd
                     for (int x = 0; x < nx; x++)
                     {
                         // On a boundary the missing neighbour is replaced by the centre point.
                         // The east test uses nx (the loop bound) rather than the pitch NX.
                         const int c = (z * ny + y) * NX + x;
                         const int w = (x == 0) ? c : c - 1;
                         const int e = (x == nx - 1) ? c : c + 1;
                         const int n = (y == 0) ? c : c - NX;
                         const int s = (y == ny - 1) ? c : c + NX;
                         const int b = (z == 0) ? c : c - NX * ny;
                         const int t = (z == nz - 1) ? c : c + NX * ny;

                         f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
                     }
                 }
             }

             // The grid just written becomes the input of the next step.
             REAL *swap = f1_t;
             f1_t = f2_t;
             f2_t = swap;
         }
     }
 }

■ 输出结果

Running diffusion kernel  times

Elapsed time : 0.865 (s)

FLOPS        : 51651.863 (MFlops)

Throughput   : 47.679 (GB/s)

Accuracy     : 4.427611e-06

● 手动剥离边界

 // OpenMP + simd version of the kernel with the x boundaries peeled off by hand,
 // so the vectorised inner loop contains no boundary tests.
 //
 // f1, f2      input and output grids; restrict tells the compiler they do not
 //             overlap. Their roles are swapped after every step, so the newest
 //             data is in f2 when count is odd and in f1 when count is even.
 // nx, ny, nz  number of grid points along x, y and z; the row pitch of both
 //             grids is the compile-time constant NX. nx must be at least 2:
 //             the first and the last point of a row are written separately.
 // cw, ce      weights of the x-1 / x+1 neighbours
 // cn, cs      weights of the y-1 / y+1 neighbours
 // cb, ct      weights of the z-1 / z+1 neighbours
 // cc          weight of the centre point
 // dt          not used by the kernel; kept so all kernel variants share a signature
 // count       number of stencil applications
 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
 {
     #pragma omp parallel
     {
         // Thread-private copies of the pointers: every thread performs the same swaps.
         REAL *f1_t = f1, *f2_t = f2;

         for (int i = 0; i < count; ++i)
         {
             // The z and y loops form one iteration space divided among the threads.
             #pragma omp for collapse(2)
             for (int z = 0; z < nz; z++)
             {
                 for (int y = 0; y < ny; y++)
                 {
                     // The y and z neighbours are resolved once per row; along the row
                     // all five indices simply advance by one.
                     int x = 0;
                     int c = (z * ny + y) * NX + x;
                     int n = (y == 0) ? c : c - NX;
                     int s = (y == ny - 1) ? c : c + NX;
                     int b = (z == 0) ? c : c - NX * ny;
                     int t = (z == nz - 1) ? c : c + NX * ny;

                     // First point of the row: there is no west neighbour, so cw weights the centre.
                     f2_t[c] = cc * f1_t[c] + cw * f1_t[c] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];

                     // Interior points. The loop header is kept in the canonical form the
                     // pragma expects (simple init, bound test and increment).
                     #pragma simd
                     for (x = 1; x < nx - 1; x++)
                     {
                         c++;
                         n++;
                         s++;
                         b++;
                         t++;
                         f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
                     }

                     // Last point of the row: there is no east neighbour, so ce weights the centre.
                     c++;
                     n++;
                     s++;
                     b++;
                     t++;
                     f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
                 }
             }

             // The grid just written becomes the input of the next step.
             REAL *swap = f1_t;
             f1_t = f2_t;
             f2_t = swap;
         }
     }
 }

■ 输出结果

Running diffusion kernel  times

Elapsed time : 0.565 (s)

FLOPS        : 79071.250 (MFlops)

Throughput   : 72.989 (GB/s)

Accuracy     : 4.577150e-06

● 数据切片

 #define YBF 16                              // number of y rows in one block

 // OpenMP + simd version of the kernel with peeled x boundaries and the y
 // dimension processed in blocks of YBF rows: for each y block the kernel walks
 // through all z planes before moving on to the next block.
 //
 // f1, f2      input and output grids; restrict tells the compiler they do not
 //             overlap. Their roles are swapped after every step, so the newest
 //             data is in f2 when count is odd and in f1 when count is even.
 // nx, ny, nz  number of grid points along x, y and z; the row pitch of both
 //             grids is the compile-time constant NX. nx must be at least 2:
 //             the first and the last point of a row are written separately.
 // cw, ce      weights of the x-1 / x+1 neighbours
 // cn, cs      weights of the y-1 / y+1 neighbours
 // cb, ct      weights of the z-1 / z+1 neighbours
 // cc          weight of the centre point
 // dt          not used by the kernel; kept so all kernel variants share a signature
 // count       number of stencil applications
 void diffusion(REAL *restrict f1, REAL *restrict f2, int nx, int ny, int nz,
     REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, int count)
 {
     #pragma omp parallel
     {
         // Thread-private copies of the pointers: every thread performs the same swaps.
         REAL *f1_t = f1, *f2_t = f2;

         for (int i = 0; i < count; ++i)
         {
             // The (y block, z) pairs form one iteration space divided among the threads.
             #pragma omp for collapse(2)
             for (int yy = 0; yy < ny; yy += YBF)                // first row of the block
             {
                 for (int z = 0; z < nz; z++)
                 {
                     // One past the last row of this block, clamped to the grid.
                     const int y_end = (yy + YBF) >= ny ? ny : (yy + YBF);

                     for (int y = yy; y < y_end; y++)
                     {
                         // The y and z neighbours are resolved once per row; along the
                         // row all five indices simply advance by one.
                         int x = 0;
                         int c = (z * ny + y) * NX + x;
                         int n = (y == 0) ? c : c - NX;
                         int s = (y == ny - 1) ? c : c + NX;
                         int b = (z == 0) ? c : c - NX * ny;
                         int t = (z == nz - 1) ? c : c + NX * ny;

                         // First point of the row: there is no west neighbour, so cw weights the centre.
                         f2_t[c] = cc * f1_t[c] + cw * f1_t[c] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];

                         // Interior points, free of boundary tests.
                         #pragma simd
                         for (x = 1; x < nx - 1; x++)
                         {
                             c++;
                             n++;
                             s++;
                             b++;
                             t++;
                             f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c + 1] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
                         }

                         // Last point of the row: there is no east neighbour, so ce weights the centre.
                         c++;
                         n++;
                         s++;
                         b++;
                         t++;
                         f2_t[c] = cc * f1_t[c] + cw * f1_t[c - 1] + ce * f1_t[c] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
                     }
                 }
             }

             // The grid just written becomes the input of the next step.
             REAL *swap = f1_t;
             f1_t = f2_t;
             f2_t = swap;
         }
     }
 }

■ 输出结果，没有明显优化

Running diffusion kernel  times

Elapsed time : 0.594 (s)

FLOPS        : 75224.680 (MFlops)

Throughput   : 69.438 (GB/s)

Accuracy     : 4.577150e-06

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 2的更多相关文章

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 1
▶ 第三章,逐步优化了一个二维卷积计算的过程 ● 基准代码 #include <stdio.h> #include <stdlib.h> #include <string ...
Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 4
▶ 第五章,几个优化 ● 代码 #include <stdio.h> #include <stdlib.h> #include <math.h> #define S ...
Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 3
▶ 第二章,几个简单的程序 ● 代码,单线程 #include <stdio.h> #include <stdlib.h> #include <string.h> ...
Xeon Phi 编程备忘
▶ 闲鱼的 Xeon Phi 3120A 配办公室的新 Xeon 服务器,记录一下环境安装过程. ● 原本尝试搭 Ubuntu 服务器,参考[https://software.intel.com/en ...
Python猫荐书系列之五：Python高性能编程
稍微关心编程语言的使用趋势的人都知道,最近几年,国内最火的两种语言非 Python 与 Go 莫属,于是,隔三差五就会有人问:这两种语言谁更厉害/好找工作/高工资…… 对于编程语言的争论,就是猿界的生 ...
《高性能javascript》一书要点和延伸（上）
前些天收到了HTML5中国送来的<高性能javascript>一书,便打算将其做为假期消遣,顺便也写篇文章记录下书中一些要点. 个人觉得本书很值得中低级别的前端朋友阅读,会有很多意想不到的 ...
高质量C++/C编程指南（林锐）
推荐-高质量C++/C编程指南(林锐) 版本/状态作者参与者起止日期备注 V 0.9 草稿文件林锐 2001-7-1至 2001-7-18 林锐起草 V 1.0 正式文件林锐 20 ...
物联网操作系统HelloX应用编程指南
HelloX操作系统应用编程指南 HelloX应用开发概述可以通过三种方式,在HelloX操作系统基础上开发应用: 1．以内部命令方式实现应用,直接编译链接到HelloX的内核she ...
JDK 高性能编程之容器
高性能编程在对不同场景下对于容器的选择有着非常苛刻的条件,这里记录下前人总结的经验,并对源码进行调试 JDK高性能编程之容器读书笔记内容部分来源书籍深入理解JVM.互联网等先放一个类图util,点 ...

随机推荐

4 Django应用第3部分（视图部分）
接着昨天写的那篇笔记,今天继续学习DJango中的内容.这一章主要是介绍Django中的视图部分. 4.1视图理念 4.2编写第一个视图 4.3编写更多的视图 4.4给视图编写功能 4.5render ...
Android开发 ---代码创建选项菜单、隐藏菜单项、菜单的生命周期，菜单按钮图标设置、搜索框、xml中设置子菜单
1.activity_main.xml 描述: 定义了一个按钮 <?xml version="1.0" encoding="utf-8"?> < ...
Linux每天一个命令：nc/ncat
nmap-ncat.x86_64版nc/ncat nc/ncat所做的就是在两台电脑之间建立链接并返回两个数据流,在这之后所能做的事就看你的想像力了.你能建立一个服务器,传输文件,与朋友聊天,传输流媒 ...
.NET并行计算和并发8-QueueUserWorkItem异步
QueueUserWorkItem方法将非常简单的任务排入队列下面这个简单的代码,涉及到资源竞争问题,如果主线程先争取到资源,如果没有等待一段时间,那么QueueUserWorkItem申请的 ...
wc语法2
wc命令的功能为统计指定文件中的字节数.字数.行数, 并将统计结果显示输出. 语法:wc [选项] 文件… 说明:该命令统计给定文件中的字节数.字数.行数.如果没有给出文件名,则从标准输入读取.wc同 ...
使用Jackson解析首字母大写的json字符串
Jackson在解析返回的json字符串时始终报错,纠结很久之后才找到原因,原来是是由于json字符串中的字母都是首字母大写,导致jackson找不到相应的KEY. 在项目中经常使用从服务器获取的数据 ...
Linux 堆溢出原理分析
堆溢出与堆的内存布局有关,要搞明白堆溢出,首先要清楚的是malloc()分配的堆内存布局是什么样子,free()操作后又变成什么样子. 解决第一个问题:通过malloc()分配的堆内存,如何布局? 上 ...
535种使用JavaScript重新加载页面的方法
除了location = location之外还有534中方法重新加载页面 location = location location = location.href location = window ...
java.net.BindException: Address already in use: JVM_Bind:80 异常的解决办法
今天遇见了这个端口被占用问题然后各种百度先是说用命令 netstat -a -n -o 最后一个选项表示连接所在进程id. 找到8080端口的PID然后打开任务管理器, 切换到进程选项卡, 在菜 ...
activiti学习第一天
公司项目组在考虑工作流,首选了activiti,首先我们要明确为什么要使用activiti,有什么好处. 在工作中有些项目会用到工作流,如果简单的项目,我们就无需使用类似activiti.jbpm等工 ...

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 2

Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 2的更多相关文章

随机推荐

热门专题