0_Simple__simplePitchLinearTexture

对比设备线性二维数组和 CUDA 二维数组在纹理引用中的效率

▶ 源代码。分别将相同大小的设备线性二维数组（pitch linear）和 CUDA 二维数组绑定到纹理引用，做简单的平移操作，重复若干次后计算带宽和纹理读取速率。

 #include <stdio.h>

 #ifdef _WIN32

 #  define WINDOWS_LEAN_AND_MEAN

 #  define NOMINMAX

 #  include <windows.h>

 #endif

 #include <cuda_runtime.h>

 #include "device_launch_parameters.h"

 #include <helper_functions.h>

 #include <helper_cuda.h>

 #define NUM_REPS 100  // number of timed kernel launches per measurement
 #define TILE_DIM 16   // thread-block edge length; blocks are TILE_DIM x TILE_DIM

 // 2D float texture references read by the shift kernels through tex2D().
 // texRefPL is bound to pitch-linear device memory, texRefArray to a CUDA array.
 // The dimensionality argument must be 2 because both kernels fetch with tex2D().
 // NOTE(review): the texture *reference* API is deprecated and was removed in
 // CUDA 12; building this file requires an older toolkit (or a port to
 // cudaTextureObject_t).
 texture<float, 2, cudaReadModeElementType> texRefPL;
 texture<float, 2, cudaReadModeElementType> texRefArray;

 // Writes a shifted copy of the image bound to texRefPL into odata.
 //
 // Launch layout: 2D grid of 2D blocks, one thread per output element;
 // thread (xid, yid) writes odata[yid * pitch + xid].
 //
 //   odata          output buffer in device memory
 //   pitch          row stride of odata, in float elements (not bytes)
 //   width, height  image size in elements; also used to normalise the
 //                  texture coordinates
 //   shiftX, shiftY offset, in elements, added to the fetch position
 //
 // The fetch coordinates are divided by width/height, so texRefPL is expected
 // to be configured for normalized coordinates (host-side setup; wrap
 // addressing is what makes the shifted reads past the edge well defined).
 __global__ void shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY)
 {
     const int xid = blockIdx.x * blockDim.x + threadIdx.x;
     const int yid = blockIdx.y * blockDim.y + threadIdx.y;

     // Guard against grids that overhang the image so that a launch whose
     // dimensions are not an exact multiple of the block size cannot write
     // outside the output buffer.
     if (xid >= width || yid >= height)
         return;

     const float u = (xid + shiftX) / (float)width;
     const float v = (yid + shiftY) / (float)height;

     odata[yid * pitch + xid] = tex2D(texRefPL, u, v);
 }

 // Writes a shifted copy of the image bound to texRefArray into odata.
 //
 // Launch layout: 2D grid of 2D blocks, one thread per output element;
 // thread (xid, yid) writes odata[yid * pitch + xid].
 //
 //   odata          output buffer in device memory
 //   pitch          row stride of odata, in float elements (not bytes)
 //   width, height  image size in elements; also used to normalise the
 //                  texture coordinates
 //   shiftX, shiftY offset, in elements, added to the fetch position
 //
 // The fetch coordinates are divided by width/height, so texRefArray is
 // expected to be configured for normalized coordinates (host-side setup; wrap
 // addressing is what makes the shifted reads past the edge well defined).
 __global__ void shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY)
 {
     const int xid = blockIdx.x * blockDim.x + threadIdx.x;
     const int yid = blockIdx.y * blockDim.y + threadIdx.y;

     // Guard against grids that overhang the image so that a launch whose
     // dimensions are not an exact multiple of the block size cannot write
     // outside the output buffer.
     if (xid >= width || yid >= height)
         return;

     const float u = (xid + shiftX) / (float)width;
     const float v = (yid + shiftY) / (float)height;

     odata[yid * pitch + xid] = tex2D(texRefArray, u, v);
 }

 // Runs the shift benchmark twice -- once fetching through a texture bound to
 // pitch-linear memory, once through a texture bound to a CUDA array -- checks
 // both results against a host-computed reference, and prints the achieved
 // bandwidth and texture fetch rate.
 //
 // Returns true when both device results match the reference, false otherwise
 // (including when the size precondition or a host allocation fails).
 // CUDA API failures are reported through checkCudaErrors / getLastCudaError
 // from helper_cuda.h, which terminate the process.
 //
 // NOTE(review): the numeric literals in this copy of the file were missing.
 // The values below (2048 x 2048 image, shift of (5, 7), normalized = 1,
 // factor 2.f / 1000.f / 1.e+9f / 1.e+6f in the rate formulas) are restored
 // from NVIDIA's simplePitchLinearTexture sample; confirm against the author's
 // original if exact numbers matter.
 bool test()
 {
     // Image size and the shift applied in x and y.
     const int nx = 2048;
     const int ny = 2048;
     const int x_shift = 5;
     const int y_shift = 7;

     if ((nx % TILE_DIM) || (ny % TILE_DIM))
     {
         printf("nx and ny must be multiples of TILE_DIM\n");
         // This function returns bool: EXIT_FAILURE (non-zero) would convert
         // to true and be reported as "Passed".
         return false;
     }

     const dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);
     const size_t rowBytes = nx * sizeof(float);   // tightly packed host row
     const size_t numElems = (size_t)nx * ny;

     // Host buffers: input image, device result, host reference.
     float *h_idata = (float *)malloc(numElems * sizeof(float));
     float *h_odata = (float *)malloc(numElems * sizeof(float));
     float *h_ref = (float *)malloc(numElems * sizeof(float));

     if (h_idata == NULL || h_odata == NULL || h_ref == NULL)
     {
         printf("host memory allocation failed\n");
         free(h_idata);
         free(h_odata);
         free(h_ref);
         return false;
     }

     for (size_t k = 0; k < numElems; ++k)
         h_idata[k] = (float)k;

     // Host reference: out(row, col) = in((row + y_shift) mod ny, (col + x_shift) mod nx).
     for (int row = 0; row < ny; ++row)
     {
         const int srcRow = (row + y_shift) % ny;

         for (int col = 0; col < nx; ++col)
             h_ref[row * nx + col] = h_idata[srcRow * nx + (col + x_shift) % nx];
     }

     cudaEvent_t start, stop;
     checkCudaErrors(cudaEventCreate(&start));
     checkCudaErrors(cudaEventCreate(&stop));

     // Device buffers. Input and output keep separate pitches so that neither
     // allocation silently relies on the other returning the same value.
     float *d_idataPL = NULL;
     size_t d_inPitchBytes = 0;
     checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_inPitchBytes, rowBytes, ny));

     cudaArray *d_idataArray = NULL;
     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
     checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));

     float *d_odata = NULL;
     size_t d_outPitchBytes = 0;
     checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_outPitchBytes, rowBytes, ny));

     const int outPitchElems = (int)(d_outPitchBytes / sizeof(float));

     // Upload the input image to both storage layouts.
     checkCudaErrors(cudaMemcpy2D(d_idataPL, d_inPitchBytes, h_idata, rowBytes, rowBytes, ny, cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, numElems * sizeof(float), cudaMemcpyHostToDevice));

     // Bind both textures. The kernels pass coordinates divided by the image
     // size, so normalized addressing is required; wrap mode makes the shifted
     // fetches past the edge come back around, matching the modulo reference.
     texRefPL.normalized = 1;
     texRefPL.filterMode = cudaFilterModePoint;
     texRefPL.addressMode[0] = cudaAddressModeWrap;
     texRefPL.addressMode[1] = cudaAddressModeWrap;
     checkCudaErrors(cudaBindTexture2D(0, &texRefPL, d_idataPL, &channelDesc, nx, ny, d_inPitchBytes));

     texRefArray.normalized = 1;
     texRefArray.filterMode = cudaFilterModePoint;
     texRefArray.addressMode[0] = cudaAddressModeWrap;
     texRefArray.addressMode[1] = cudaAddressModeWrap;
     checkCudaErrors(cudaBindTextureToArray(texRefArray, d_idataArray, channelDesc));

     bool result = true;

     // ---- Pitch-linear texture ----
     checkCudaErrors(cudaMemset2D(d_odata, d_outPitchBytes, 0, rowBytes, ny));
     checkCudaErrors(cudaEventRecord(start, 0));

     for (int rep = 0; rep < NUM_REPS; ++rep)
         shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata, outPitchElems, nx, ny, x_shift, y_shift);

     getLastCudaError("shiftPitchLinear launch failed");
     checkCudaErrors(cudaEventRecord(stop, 0));
     checkCudaErrors(cudaEventSynchronize(stop));

     float timePL = 0.0f;   // milliseconds for NUM_REPS launches
     checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));

     checkCudaErrors(cudaMemcpy2D(h_odata, rowBytes, d_odata, d_outPitchBytes, rowBytes, ny, cudaMemcpyDeviceToHost));

     if (!compareData(h_ref, h_odata, nx * ny, 0.0f, 0.15f))
     {
         printf("\n\t ShiftPitchLinear failed\n");
         result = false;
     }

     // ---- CUDA-array texture ----
     checkCudaErrors(cudaMemset2D(d_odata, d_outPitchBytes, 0, rowBytes, ny));
     checkCudaErrors(cudaEventRecord(start, 0));

     for (int rep = 0; rep < NUM_REPS; ++rep)
         shiftArray<<<dimGrid, dimBlock>>>(d_odata, outPitchElems, nx, ny, x_shift, y_shift);

     getLastCudaError("shiftArray launch failed");
     checkCudaErrors(cudaEventRecord(stop, 0));
     checkCudaErrors(cudaEventSynchronize(stop));

     float timeArray = 0.0f;   // milliseconds for NUM_REPS launches
     checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));

     checkCudaErrors(cudaMemcpy2D(h_odata, rowBytes, d_odata, d_outPitchBytes, rowBytes, ny, cudaMemcpyDeviceToHost));

     if (!compareData(h_ref, h_odata, nx * ny, 0.0f, 0.15f))
     {
         printf("\n\tShiftArray failed\n");
         result = false;
     }

     // Bandwidth counts one read plus one write per element (factor 2);
     // times are converted from milliseconds per NUM_REPS to seconds per launch.
     float bandwidthPL = 2.f * nx * ny * sizeof(float) / (timePL / 1000.f / NUM_REPS * 1.e+9f);
     float bandwidthArray = 2.f * nx * ny * sizeof(float) / (timeArray / 1000.f / NUM_REPS * 1.e+9f);
     printf("\n\tBandwidth for pitch linear: %.2f GB/s; for array: %.2f GB/s\n", bandwidthPL, bandwidthArray);

     float fetchRatePL = nx * ny / 1.e+6f / (timePL / 1000.0f / NUM_REPS);
     float fetchRateArray = nx * ny / 1.e+6f / (timeArray / 1000.0f / NUM_REPS);
     printf("\n\tTexture fetch rate for pitch linear: %.2f Mpix/s; for array: %.2f Mpix/s\n", fetchRatePL, fetchRateArray);

     // Cleanup.
     free(h_idata);
     free(h_odata);
     free(h_ref);
     checkCudaErrors(cudaUnbindTexture(texRefPL));
     checkCudaErrors(cudaUnbindTexture(texRefArray));
     checkCudaErrors(cudaFree(d_idataPL));
     checkCudaErrors(cudaFreeArray(d_idataArray));
     checkCudaErrors(cudaFree(d_odata));
     checkCudaErrors(cudaEventDestroy(start));
     checkCudaErrors(cudaEventDestroy(stop));

     return result;
 }

 // Program entry point: runs test(), prints whether it passed, then waits for
 // a key press (getchar) before exiting so a console window stays open.
 // Command-line arguments are not used.
 int main(int argc, char **argv)
 {
     (void)argc;
     (void)argv;

     printf("\n\tStart\n");

     const bool passed = test();
     printf("\n\tFinished, %s\n", passed ? "Passed" : "Failed");

     getchar();

     // The return value was missing in this copy of the file; 0 is restored.
     return 0;
 }

▶ 输出结果

    Start

    Bandwidth for pitch linear: 12.58 GB/s; for array: 14.64 GB/s

    Texture fetch rate for pitch linear: 1573.09 Mpix/s; for array: 1829.39 Mpix/s

    Finished, Passed

▶ 涨姿势

● 用到的函数都已在以前有关线性二维数组和纹理内存使用方法的博客中讨论过了。

● 由运行结果可知，使用二维纹理引用时，CUDA 二维数组的效率比线性二维数组更高。

0_Simple__simplePitchLinearTexture的更多相关文章

随机推荐

test20190408(十二省联考)
做了十二省联考的题.暂时只更几个比较可做的题目. 异或粽子考试的时候乱搞了个做法.结果以每个大数据点 \(1900+\ ms\) 的优秀效率通过了此题... 乱搞建一颗 \(Trie\) 树,显然 ...
虚拟机lamp环境下，Apache配置虚拟主机
1.在Apache配置文件中开启虚拟主机功能:即:Include etc//extra/httpd-vhosts.conf把前面的#去掉: 2.在extra目录下找到文件httpd-vhosts.co ...
Passing the Message 单调栈两次
What a sunny day! Let’s go picnic and have barbecue! Today, all kids in “Sun Flower” kindergarten ar ...
Heap Operations 优先队列
Petya has recently learned data structure named "Binary heap". The heap he is now operatin ...
一键分享到各个SNS插件
使用百度分享:http://share.baidu.com/code/advance#toid 例: HTML: <div class="bdsharebuttonbox" ...
mySQL 教程第5章插入更新与删除数据
使用SQL Manager管理工具连接到schoolDB.由于三张表都设置了主键,因此,以下练习中插入的记录,主键不能重. 插入数据 1. 练习:为表的所有字段插入数据为表中所有字段插入数据,可以不 ...
java 解析office文件大全
原文地址:http://ansjsun.iteye.com/blog/791142 读取OFFICE文件纯文本 package org.css.resource.businesssoft.search ...
android设备唯一码的获取，cpu号，mac地址
抄自http://blog.csdn.net/hpccn/article/details/7872141 开发Android应用中,我们常常需要设备的唯一码来确定客户端. Android 中的几中方法 ...
【Spring学习笔记-MVC-7】Spring MVC模型对象-模型属性讲解
作者:ssslinppp 来自为知笔记(Wiz) 附件列表处理模型数据.png
bzoj1085 骑士精神
Description 在一个5×5的棋盘上有12个白色的骑士和12个黑色的骑士, 且有一个空位.在任何时候一个骑士都能按照骑士的走法(它可以走到和它横坐标相差为1,纵坐标相差为2或者横坐标相差为2, ...

0_Simple__simplePitchLinearTexture

0_Simple__simplePitchLinearTexture的更多相关文章

随机推荐

热门专题