A tiny program to benchmark image transpose algorithms
Here is the code:
#include <stdio.h>
#include <xmmintrin.h>
#include <windows.h>

/*
 * Benchmark of two image-transpose implementations (plain and tiled) on
 * images whose pixels are 128-bit SSE vectors, followed by a correctness
 * check of the tiled version.
 *
 * NOTE(review): every numeric literal of this listing was lost when the page
 * was extracted. Loop start values (0), the lane indices used by the
 * validation (0 and 1) and the exit code (0) are the only values that make
 * the code meaningful. The tile size, the image sizes and the repetition
 * count are placeholders -- confirm them against the original program.
 */

/* Pixel type: one 128-bit SSE vector. */
typedef __m128 Vec;

/* Raw tick count as returned by QueryPerformanceCounter. */
typedef unsigned long long value_t;

/* Edge length, in pixels, of the square tiles used by img_transpose_block.
 * NOTE(review): placeholder value, the original literal is unknown. */
static const int TRANSPOSE_BLOCK_SIZE = 16;

/* Returns the current value of the high-resolution performance counter,
 * in ticks. Convert to time with QueryPerformanceFrequency. */
__forceinline value_t now()
{
    LARGE_INTEGER n;
    QueryPerformanceCounter(&n);
    return (value_t)n.QuadPart;
}

/*
 * Allocates a 16-byte aligned, zero-filled buffer of w * h pixels.
 *
 * _mm_malloc is used instead of new[] so the alignment required by __m128
 * loads/stores does not depend on the allocator's default alignment.
 * Zero-filling gives the buffers defined contents and touches every page
 * before any timed loop runs.
 *
 * Returns NULL when the allocation fails. Release with _mm_free.
 */
static Vec *alloc_image(const int w, const int h)
{
    const size_t count = (size_t)w * (size_t)h;
    Vec *img = (Vec *)_mm_malloc(count * sizeof(Vec), 16);

    if (img != NULL)
    {
        const Vec zero = _mm_setzero_ps();
        for (size_t k = 0; k < count; ++k)
        {
            img[k] = zero;
        }
    }
    return img;
}

/*
 * Straightforward transpose.
 *
 * src_img is a row-major image of src_h rows by src_w columns; dst_img
 * receives the row-major transpose (src_w rows by src_h columns), i.e.
 * dst_img[j * src_h + i] = src_img[i * src_w + j].
 * The two buffers must not overlap. Rows of the destination are distributed
 * over threads by the OpenMP pragma when OpenMP is enabled.
 */
inline void img_transpose(
    Vec *dst_img,
    const Vec *src_img,
    const int src_w,
    const int src_h)
{
#pragma omp parallel for
    for (int j = 0; j < src_w; ++j)
    {
        for (int i = 0; i < src_h; ++i)
        {
            dst_img[j * src_h + i] = src_img[i * src_w + j];
        }
    }
}

/*
 * Tiled transpose: same contract and result as img_transpose, but the image
 * is walked in TRANSPOSE_BLOCK_SIZE x TRANSPOSE_BLOCK_SIZE tiles
 * (presumably to improve cache locality -- that is what the benchmark in
 * main measures). Tiles on the right/bottom edge are clamped to the image
 * size, so dimensions need not be multiples of the tile size.
 */
inline void img_transpose_block(
    Vec *dst_img,
    const Vec *src_img,
    const int src_w,
    const int src_h)
{
#pragma omp parallel for
    for (int j = 0; j < src_w; j += TRANSPOSE_BLOCK_SIZE)
    {
        for (int i = 0; i < src_h; i += TRANSPOSE_BLOCK_SIZE)
        {
            /* Explicit clamping instead of the min() macro from <windows.h>,
             * which is not available when NOMINMAX is defined. */
            const int nsize = (j + TRANSPOSE_BLOCK_SIZE < src_w) ? j + TRANSPOSE_BLOCK_SIZE : src_w;
            const int msize = (i + TRANSPOSE_BLOCK_SIZE < src_h) ? i + TRANSPOSE_BLOCK_SIZE : src_h;

            for (int n = j; n < nsize; ++n)
            {
                for (int m = i; m < msize; ++m)
                {
                    dst_img[n * src_h + m] = src_img[m * src_w + n];
                }
            }
        }
    }
}

/*
 * Times both transposes on a large image, then validates the tiled version
 * on a small image whose pixels encode their own coordinates.
 * Returns 0 on completion, 1 if a buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    //// performance benchmark ////

    /* NOTE(review): placeholder sizes and repetition count. */
    const int w = 1920;
    const int h = 1080;
    const int repeat = 10;

    Vec *a = alloc_image(w, h);
    Vec *b = alloc_image(w, h);
    if (a == NULL || b == NULL)
    {
        printf("Out of memory\n");
        _mm_free(a);
        _mm_free(b);
        return 1;
    }

    value_t start_time, end_time;

    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    double ms_per_tick = 1000.0 / (double)freq.QuadPart;

    start_time = now();
    for (int t = 0; t < repeat; ++t)
    {
        /* Transpose there and back so `a` keeps its w x h layout. */
        img_transpose(b, a, w, h);
        img_transpose(a, b, h, w);
    }
    end_time = now();
    printf("img_transpose: %f ms\n", (double)(end_time - start_time) * ms_per_tick);

    start_time = now();
    for (int t = 0; t < repeat; ++t)
    {
        img_transpose_block(b, a, w, h);
        img_transpose_block(a, b, h, w);
    }
    end_time = now();
    printf("img_transpose_block: %f ms\n", (double)(end_time - start_time) * ms_per_tick);

    _mm_free(a);
    _mm_free(b);

    //// algorithm validation ////

    /* NOTE(review): placeholder sizes; deliberately not multiples of the
     * tile size so the clamped edge tiles are exercised too. */
    const int width = 100;
    const int height = 37;

    Vec *src_img = alloc_image(width, height);
    Vec *dst_img = alloc_image(height, width);
    if (src_img == NULL || dst_img == NULL)
    {
        printf("Out of memory\n");
        _mm_free(src_img);
        _mm_free(dst_img);
        return 1;
    }

    /* Store each pixel's (column, row) in integer lanes 0 and 1.
     * m128_i32 is the MSVC-specific integer view of __m128. */
    for (int j = 0; j < height; ++j)
    {
        for (int i = 0; i < width; ++i)
        {
            src_img[j * width + i].m128_i32[0] = i;
            src_img[j * width + i].m128_i32[1] = j;
        }
    }

    img_transpose_block(dst_img, src_img, width, height);

    /* After the transpose, destination pixel (row j, column i) must hold
     * the source coordinates (column j, row i). Stop at the first mismatch. */
    bool ok = true;
    for (int j = 0; ok && j < width; ++j)
    {
        for (int i = 0; i < height; ++i)
        {
            int pi = dst_img[j * height + i].m128_i32[0];
            int pj = dst_img[j * height + i].m128_i32[1];

            if (pi != j || pj != i)
            {
                printf("Algorithm is wrong!!!\n");
                ok = false;
                break;
            }
        }
    }

    _mm_free(src_img);
    _mm_free(dst_img);

    printf("All done\n");
    return 0;
}
A tiny program to benchmark image transpose algorithms的更多相关文章
- hey is a tiny program that sends some load to a web application.
hey is a tiny program that sends some load to a web application. DOS attack DOS攻击生成 https://github.c ...
- 自己动手写一个编译器Tiny语言解析器实现
然后,上一篇文章简介Tiny词法分析,实现语言.本文将介绍Tiny的语法分析器的实现. 1 Tiny语言的语法 下图是Tiny在BNF中的文法. 从文法的定义能够看出,TINY语言有以下特点: 1 程序 ...
- Reading List on Automated Program Repair
Some resources: https://www.monperrus.net/martin/automatic-software-repair 2017 [ ] DeepFix: Fixing ...
- [io benchmark]常用磁盘基准/压力测试工具
Unix Disk I/O Benchmarks fio - NEW! fio is an I/O tool meant to be used both for benchmark and stres ...
- UVA - 10895 Matrix Transpose
UVA - 10895 Matrix Transpose Time Limit:3000MS Memory Limit:Unknown 64bit IO Format:%lld & % ...
- Awesome Go
A curated list of awesome Go frameworks, libraries and software. Inspired by awesome-python. Contrib ...
- Go 语言相关的优秀框架,库及软件列表
If you see a package or project here that is no longer maintained or is not a good fit, please submi ...
- Awesome Go (http://awesome-go.com/)
A curated list of awesome Go frameworks, libraries and software. Inspired by awesome-python. Contrib ...
- Awesome Go精选的Go框架,库和软件的精选清单.A curated list of awesome Go frameworks, libraries and software
Awesome Go financial support to Awesome Go A curated list of awesome Go frameworks, libraries a ...
随机推荐
- 【Spring】Spring boot多数据源历险记
一.问题描述 笔者根据需求在开发过程中,需要在原项目的基础上(单数据源),新增一个数据源C,根据C数据源来实现业务.至于为什么不新建一个项目,大概是因为这只是个小功能,访问量不大,不需要单独申请个服务 ...
- kafka 报Failed to load class "org.slf4j.impl.StaticLoggerBinder".[z]
转:http://blog.chinaunix.net/uid-25135004-id-4172954.html 测试kafka producer发送消息 和 consumer 接受消息报错 ...
- 20172325 2017-2018-2 《Java程序设计》第八周学习总结
20172325 2017-2018-2 <Java程序设计>第八周学习总结 教材学习内容总结 1.关于绑定 绑定:在执行程序时产生一个请求事件,需要执行一段代码来来完成方法调用,即一个方 ...
- 编译器C1001问题
https://ask.csdn.net/questions/184495 http://blog.sina.com.cn/s/blog_7822ce750100szed.html
- Java NIO系列教程(十)DatagramChannel
Java NIO系列教程(十)DatagramChannel 转载自并发编程网 – ifeve.com,本文链接地址: Java NIO系列教程(十) Java NIO DatagramChannel
- jqgrid单元格合并
<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="WebForm1.aspx. ...
- KBMMW 4.6 正式版发布
喜大普奔迎新年! Merry Christmas! We are happy to announce the release of kbmMW v. 4.60.00 Professional and ...
- 前端之JavaScript笔记2
一 数组对象 <!DOCTYPE html> <html lang="en"> <head> <meta charset="UT ...
- 2018.10.14 NOIP训练 直线(二分答案+st表+切比雪夫距离转化)
传送门 二分答案好题. 这已经是当年普及组模拟时挖的坑了233. 这道题还是很不错的. 考虑把坐标系转个45度再操作. 为了不爆精度可以直接转切比雪夫距离. 然后就直接二分答案. 其中竖线就按二分的答 ...
- 2018.10.09 NOIP模拟 世界杯(图论+set优化)
传送门 貌似是防akakak题? 不是很清楚. 事实上如果两个人没有严格的大小关系,我们给他们两个连一条边. 这样可以构成很多连通块. 而且对于连通块a,ba,ba,b,aia_iai和bjb_jb ...