ICL Auto Vectorization

简介

本文简单介绍如何使用 Intel C++ 编译器（icl）实现自动向量化加速。

全文如下安排：

base ：待优化的源代码。
vectorization ：第一个向量化版本。
aligned ：内存对齐对向量化的影响。

base

base版本代码：

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;

int64_t cpu_freq;
int64_t cpu_counter(){
  int64_t clock;
    QueryPerformanceCounter((LARGE_INTEGER*)&clock);
  return clock;
}

// Timing helpers. TB__ stores a start tick; TE__ stores an end tick and prints
// "<line> : <elapsed> seconds", where <line> is the source line on which TE__
// is expanded. Switch the condition below to 0 to compile both macros away.
#if 1
  int64_t gloabel_timer_begin;  // tick count captured by TB__
  int64_t gloabel_timer_end;    // tick count captured by TE__
  // Parenthesized expression, so `TB__;` is always exactly one statement.
  #define TB__ (gloabel_timer_begin = cpu_counter())
  // Wrapped in do { ... } while (0) so that `TE__;` is exactly one statement
  // and remains correct inside an unbraced if/else. The tick delta is divided
  // by cpu_freq to convert it to seconds. std:: is spelled out so the macro
  // does not rely on a using-directive at the expansion site.
  #define TE__ do { \
    gloabel_timer_end = cpu_counter(); \
    std::cout << __LINE__ << " : " \
              << double(gloabel_timer_end - gloabel_timer_begin) / double(cpu_freq) \
              << " seconds" << std::endl; \
  } while (0)
#else
  #define TB__
  #define TE__
#endif

// repeat times
#define REPEATTIMES 100000

// Fills the rows x cols region of `data` with pseudo-random values computed as
// rand()/RAND_MAX. Consecutive rows are true_cols elements apart; elements in
// columns [cols, true_cols) of each row are not written.
void init(float *data, int rows, int cols, int true_cols){
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;  // offset of the first element of row r
    for (int c = 0; c < cols; ++c){
      data[base + c] = static_cast<float>(rand())/static_cast<float>(RAND_MAX);
    }
  }
}

void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);

// Adds up the rows x cols region of `data` (row stride: true_cols elements) in
// row-major order using a float accumulator, then writes the sum followed by a
// newline to standard output.
void print_sum(float *data, int rows, int cols, int true_cols){
  float sum = 0;
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;  // offset of the first element of row r
    for (int c = 0; c < cols; ++c){
      sum += data[base + c];
    }
  }
  std::cout << sum << std::endl;
}

int main(){
  QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);

  int rows = 100;
  int cols = 101;

  int true_cols = cols;
  float *A = (float*)malloc(rows*true_cols*sizeof(float));
  float *B = (float*)malloc(rows*sizeof(float));
  float *C = (float*)malloc(rows*sizeof(float));

  init(A, rows, cols, true_cols);
  init(B, rows, 1, 1);

  // computing
  TB__;
  for (int k = 0; k < REPEATTIMES; k++){
    multiply(C, A, B, rows, cols, true_cols);
  }
  TE__;

  // print result.
  print_sum(C, rows, 1, 1);

  free(A); A = NULL;
  free(B); B = NULL;
  free(C); C = NULL;

  return 0;
}

// filename : multiply.cpp
// Matrix-vector product: for every i in [0, rows),
//   C[i] = sum over j in [0, cols) of A[i*true_cols + j] * B[j].
// A is addressed with a row stride of true_cols elements.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols){
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;  // offset of row r inside A
    float &out = C[r];             // accumulate directly into the output slot
    out = 0;
    for (int c = 0; c < cols; ++c){
      out += A[base + c]*B[c];
    }
  }
}

编译：

user@machine> icl /O1 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

user@machine> main.exe
73 : 0.877882 seconds
2483.53

vectorization

源代码保持不变

编译：

user@machine> icl /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

user@machine> main.exe
73 : 0.205989 seconds
2483.53

执行速度提升了 4倍左右。

aligned

源代码修改。（注意：下面的代码有问题，结果可能有错误，原因可能是内存的问题。）

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;

int64_t cpu_freq;
int64_t cpu_counter(){
  int64_t clock;
    QueryPerformanceCounter((LARGE_INTEGER*)&clock);
  return clock;
}

// Timing helpers. TB__ stores a start tick; TE__ stores an end tick and prints
// "<line> : <elapsed> seconds", where <line> is the source line on which TE__
// is expanded. Switch the condition below to 0 to compile both macros away.
#if 1
  int64_t gloabel_timer_begin;  // tick count captured by TB__
  int64_t gloabel_timer_end;    // tick count captured by TE__
  // Parenthesized expression, so `TB__;` is always exactly one statement.
  #define TB__ (gloabel_timer_begin = cpu_counter())
  // Wrapped in do { ... } while (0) so that `TE__;` is exactly one statement
  // and remains correct inside an unbraced if/else. The tick delta is divided
  // by cpu_freq to convert it to seconds. std:: is spelled out so the macro
  // does not rely on a using-directive at the expansion site.
  #define TE__ do { \
    gloabel_timer_end = cpu_counter(); \
    std::cout << __LINE__ << " : " \
              << double(gloabel_timer_end - gloabel_timer_begin) / double(cpu_freq) \
              << " seconds" << std::endl; \
  } while (0)
#else
  #define TB__
  #define TE__
#endif

// repeat times
#define REPEATTIMES 100000

// Fills the rows x cols region of `data` with pseudo-random values computed as
// rand()/RAND_MAX. Consecutive rows are true_cols elements apart; elements in
// columns [cols, true_cols) of each row are not written.
void init(float *data, int rows, int cols, int true_cols){
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;  // offset of the first element of row r
    for (int c = 0; c < cols; ++c){
      data[base + c] = static_cast<float>(rand())/static_cast<float>(RAND_MAX);
    }
  }
}

void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);

// Adds up the rows x cols region of `data` (row stride: true_cols elements) in
// row-major order using a float accumulator, then writes the sum followed by a
// newline to standard output.
void print_sum(float *data, int rows, int cols, int true_cols){
  float sum = 0;
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;  // offset of the first element of row r
    for (int c = 0; c < cols; ++c){
      sum += data[base + c];
    }
  }
  std::cout << sum << std::endl;
}

int main(){
  QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);

  int rows = 100;
  int cols = 101;

#ifdef ALIGNED
  #define ALLIGNED_LEN 32
  int true_cols = ((((cols*sizeof(float))+ALLIGNED_LEN-1)/ALLIGNED_LEN)*ALLIGNED_LEN)/sizeof(float);
  //cout << true_cols << endl;
  float *A = (float*)_aligned_malloc(rows*true_cols*sizeof(float), ALLIGNED_LEN);
  float *B = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
  float *C = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
#else
  int true_cols = cols;
  float *A = (float*)malloc(rows*true_cols*sizeof(float));
  float *B = (float*)malloc(rows*sizeof(float));
  float *C = (float*)malloc(rows*sizeof(float));
#endif

  init(A, rows, cols, true_cols);
  init(B, rows, 1, 1);

  // computing
  TB__;
  for (int k = 0; k < REPEATTIMES; k++){
    multiply(C, A, B, rows, cols, true_cols);
  }
  TE__;

  // print result.
  print_sum(C, rows, 1, 1);

#ifdef ALIGNED
  _aligned_free(A); A = NULL;
  _aligned_free(B); B = NULL;
  _aligned_free(C); C = NULL;
#else
  free(A); A = NULL;
  free(B); B = NULL;
  free(C); C = NULL;
#endif

  return 0;
}

// filename : multiply.cpp
// Matrix-vector product: for every i in [0, rows),
//   C[i] = sum over j in [0, cols) of A[i*true_cols + j] * B[j].
// A is addressed with a row stride of true_cols elements. When ALIGNED is
// defined, the inner loop carries `#pragma vector aligned`, which tells the
// compiler it may assume aligned memory accesses in that loop.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols){
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;  // offset of row r inside A
    float &out = C[r];             // accumulate directly into the output slot
    out = 0;
    #ifdef ALIGNED
    #pragma vector aligned
    #endif
    for (int c = 0; c < cols; ++c){
      out += A[base + c]*B[c];
    }
  }
}

编译：

user@machine> icl /DALIGNED /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

82 : 0.17747 seconds
2483.53

相对第一个优化的版本又提升了一点速度。

结论

vectorization版本：不需要改变源代码，通过修改编译器选项直接实现向量化。

aligned版本：需要修改代码，使得内存对齐，可以进一步提升性能。

ICL Auto Vectorization的更多相关文章

使用Auto TensorCore CodeGen优化Matmul
使用Auto TensorCore CodeGen优化Matmul 本文将演示如何使用TVM Auto TensorCore CodeGen在Volta / Turing GPU上编写高性能matmu ...
C++11特性——变量部分（using类型别名、constexpr常量表达式、auto类型推断、nullptr空指针等）
#include <iostream> using namespace std; int main() { using cullptr = const unsigned long long ...
overflow:hidden与margin:0 auto之间的冲突
相对于父容器水平居中的代码margin:0 auto与overflow:hidden之间存在冲突.当这两个属性同时应用在一个DIV上时,在chrome浏览器中将无法居中.至于为啥我也不明白.
Android Auto开发之一《开始学习Auto 》
共同学习,共同进步, 转载请注明出处.欢迎微信交流:sfssqs,申请注明"Android Car"字样＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝＝ ...
width:100%;与width:auto;的区别
<div> <p>1111</p> </div> div{ width:980px; background-color: #ccc; height:30 ...
SQl 2005 For XMl 简单查询(Raw，Auto，Path模式)(1)
很多人对Xpath可能比较熟悉,但不知道有没有直接操作过数据库,我们都知道在Sql2005里公支持的几种查询有Raw,Auto模式,页并没有Path和Elements用法等,如果在2000里使用过 ...
margin:0 auto;不居中
margin:0 auto:不居中可能有以下两个的原因; 1.没有设置宽度<div style="margin:0 auto;"></div>看看上面的代码 ...
初学C++ 之 auto关键字（IDE：VS2013）
/*使用auto关键字,需要先赋初值,auto关键字是会根据初值来判断类型*/ auto i = ; auto j = ; cout << "auto i = 5" & ...
C++11 - 类型推导auto关键字
在C++11中,auto关键字被作为类型自动类型推导关键字 (1)基本用法 C++98:类型变量名 = 初值; int i = 10; C++11:auto 变量名 = 初值; auto i ...

随机推荐

Go VS Code 调式常见问题处理
GO VS Code 调式配置 launch.json{ "version": "0.2.0", "configurations": [ { ...
Sevrlet 工作原理解析-转
从 Servlet 容器说起要介绍 Servlet 必须要先把 Servlet 容器说清楚,Servlet 与 Servlet 容器的关系有点像枪和子弹的关系,枪是为子弹而生,而子弹又让枪有了杀伤力 ...
window10下安装linux虚拟机
一.准备工具虚拟机.镜像.putty 1.安装虚拟机 VMware Workstation Pro 安装成功之后需要输入密钥,请点击以下链接 http://www.360doc.com/conten ...
github的简单使用
查了好多入门教程(图文并茂可以了解一些基本步骤),感觉逻辑欠缺,(很多东西跟着教程了解会用了,不了解逻辑,只是会了这一个,其他的还是很蒙),来一起理一理把 1.第一步下载并注册(这个自己解决) 2.用 ...
[ Java学习基础 ] Java的封装性与访问控制
Java面向对象的封装性是通过对成员变量和方法进行访问控制实现的,访问控制分为4个等级:私有.默认.保护和公有,具体规则如下表: 1.私有级别私有级别的关键字是private,私有级别的成员变量和方 ...
[AHOI 2012]树屋阶梯
Description 暑假期间,小龙报名了一个模拟野外生存作战训练班来锻炼体魄,训练的第一个晚上,教官就给他们出了个难题.由于地上露营湿气重,必须选择在高处的树屋露营.小龙分配的树屋建立在一颗高度为 ...
[PA 2014]Kuglarz
Description 魔术师的桌子上有n个杯子排成一行,编号为1,2,…,n,其中某些杯子底下藏有一个小球,如果你准确地猜出是哪些杯子,你就可以获得奖品.花费c_ij元,魔术师就会告诉你杯子i,i+ ...
[HNOI2015]菜肴制作
题目描述知名美食家小 A被邀请至ATM 大酒店,为其品评菜肴. ATM 酒店为小 A 准备了 N 道菜肴,酒店按照为菜肴预估的质量从高到低给予1到N的顺序编号,预估质量最高的菜肴编号为1. 由于菜肴 ...
【BZOJ2186】【SDOI2008】沙拉公主的困惑
Description 大富翁国因为通货膨胀,以及假钞泛滥,政府决定推出一项新的政策:现有钞票编号范围为1到N的阶乘,但是,政府只发行编号与M!互质的钞票.房地产第一大户沙拉公主决定预测一下大富翁 ...
[cf453e]Little Pony and Lord Tirek
来自FallDream的博客,未经允许,请勿转载,谢谢. 更博客= = 有n个数,每个数字都有一个初始大小ai和最大值mi,然后每秒会增加ri,你需要回答m个发生时间依此增大的询问,每次询问区间和并且 ...