0_Simple__simpleLayeredTexture
2D layered textures
▶ Source code. The kernel reads each element of a CUDA 3D array in its original order through a layered texture fetch, negates it, adds the layer index, and writes the result to global memory; the host then prints the result.
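One detail worth noting before the listing: the kernel samples with normalized coordinates, and the half-texel offset in (x + 0.5f) / width lands every fetch exactly on a texel center, so even with cudaFilterModeLinear the hardware returns the stored value unblended and the output matches the reference exactly. A worked instance of the mapping (the 512 assumes the width used below):

// Normalized coordinate of the center of texel x in a row of width 512:
//   u = (x + 0.5f) / 512.0f
// x = 0    ->  u = 0.5f / 512.0f    (center of texel 0)
// x = 511  ->  u = 511.5f / 512.0f  (center of texel 511)
// At a texel center the linear-interpolation weight of the neighboring texel
// is zero, so the filtered fetch returns the texel value itself.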
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

#define MIN_EPSILON_ERROR 5e-3f
#define OUTPUT 5

texture<float, cudaTextureType2DLayered> tex;

__global__ void transformKernel(float *g_odata, int width, int height, int layer)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;
    g_odata[layer*width*height + y*width + x] = -tex2DLayered(tex, u, v, layer) + layer;
}

int main(int argc, char **argv)
{
    unsigned int width = 512, height = 512, num_layers = 5;
    unsigned int size = width * height * num_layers * sizeof(float);

    float *h_data = (float *)malloc(size);
    float *h_data_ref = (float *)malloc(size);
    float *d_data = NULL;
    cudaMalloc((void **)&d_data, size);

    // Fill each layer with 0, 1, 2, ... and compute the expected result on the host
    for (unsigned int layer = 0; layer < num_layers; layer++)
    {
        for (int i = 0; i < (int)(width * height); i++)
            h_data[layer*width*height + i] = (float)i;
    }
    for (unsigned int layer = 0; layer < num_layers; layer++)
    {
        for (int i = 0; i < (int)(width * height); i++)
            h_data_ref[layer*width*height + i] = -h_data[layer*width*height + i] + layer;
    }

    printf("\n\tInput data\n\t");
    for (int i = 0; i < num_layers; i++)
    {
        for (int j = 0; j < OUTPUT; j++)
        {
            for (int k = 0; k < OUTPUT; k++)
                printf("%2.1f ", h_data[i*width*height + j*width + k]);
            printf("\n\t");
        }
        printf("\n\t");
    }

    printf("\n\tIdeal output data\n\t");
    for (int i = 0; i < num_layers; i++)
    {
        for (int j = 0; j < OUTPUT; j++)
        {
            for (int k = 0; k < OUTPUT; k++)
                printf("%2.1f ", h_data_ref[i*width*height + j*width + k]);
            printf("\n\t");
        }
        printf("\n\t");
    }

    // Set up the CUDA 3D array and copy the host data into it
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cu_3darray;
    cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered);

    cudaMemcpy3DParms myparms = { 0 };
    myparms.srcPos = make_cudaPos(0, 0, 0);
    myparms.dstPos = make_cudaPos(0, 0, 0);
    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
    myparms.dstArray = cu_3darray;
    myparms.extent = make_cudaExtent(width, height, num_layers);
    myparms.kind = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&myparms);

    // Set texture parameters and bind the texture to the array
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;
    cudaBindTextureToArray(tex, cu_3darray, channelDesc);

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
printf("Covering 2D data of %d * %d * %d: Grid size is %d x %d, each block has 8 x 8 threads\n", width, height, num_layers, dimGrid.x, dimGrid.y);
transformKernel << < dimGrid, dimBlock >> >(d_data, width, height, );// 预跑
cudaDeviceSynchronize(); StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkStartTimer(&timer); for (unsigned int layer = ; layer < num_layers; layer++)// 启用多个核,每个核完成一层
transformKernel << < dimGrid, dimBlock, >> >(d_data, width, height, layer);
cudaDeviceSynchronize(); sdkStopTimer(&timer);
printf("\n\Time: %.3f msec, %.2f Mtexlookups/sec\n", sdkGetTimerValue(&timer), (width *height *num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
sdkDeleteTimer(&timer); // 返回计算结果并检验
memset(h_data, , size);
    cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

    if (checkCmdLineFlag(argc, (const char **)argv, "regression"))
        sdkWriteFile<float>("./data/regression.dat", h_data, width * width, 0.0f, false);
    else
        printf("Comparing kernel output to expected data return %d\n", compareData(h_data, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f));

    printf("\n\tActual output data\n\t");
    for (int i = 0; i < num_layers; i++)
    {
        for (int j = 0; j < OUTPUT; j++)
        {
            for (int k = 0; k < OUTPUT; k++)
                printf("%2.1f ", h_data[i*width*height + j*width + k]);
            printf("\n\t");
        }
        printf("\n\t");
    }

    free(h_data);
    free(h_data_ref);
    cudaFree(d_data);
    cudaFreeArray(cu_3darray);

    getchar();
    return 0;
}
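The listing uses the legacy texture-reference API (the global texture<...> plus cudaBindTextureToArray), which newer CUDA releases deprecate and eventually remove. Below is a minimal sketch, not part of the original sample, of how the same binding would look with the texture-object API; the variable names are assumptions carried over from the listing:

// Sketch: replacing the texture reference with a texture object (CUDA 5.0+)
cudaResourceDesc resDesc = { };
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cu_3darray;          // the layered array from the listing

cudaTextureDesc texDesc = { };
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;

cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// The kernel then takes texObj as an extra parameter and fetches with
// tex2DLayered<float>(texObj, u, v, layer) instead of the global reference.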
▶ Output
Input data
0.0 1.0 2.0 3.0 4.0
512.0 513.0 514.0 515.0 516.0
1024.0 1025.0 1026.0 1027.0 1028.0
1536.0 1537.0 1538.0 1539.0 1540.0
2048.0 2049.0 2050.0 2051.0 2052.0

0.0 1.0 2.0 3.0 4.0
512.0 513.0 514.0 515.0 516.0
1024.0 1025.0 1026.0 1027.0 1028.0
1536.0 1537.0 1538.0 1539.0 1540.0
2048.0 2049.0 2050.0 2051.0 2052.0

0.0 1.0 2.0 3.0 4.0
512.0 513.0 514.0 515.0 516.0
1024.0 1025.0 1026.0 1027.0 1028.0
1536.0 1537.0 1538.0 1539.0 1540.0
2048.0 2049.0 2050.0 2051.0 2052.0

0.0 1.0 2.0 3.0 4.0
512.0 513.0 514.0 515.0 516.0
1024.0 1025.0 1026.0 1027.0 1028.0
1536.0 1537.0 1538.0 1539.0 1540.0
2048.0 2049.0 2050.0 2051.0 2052.0

0.0 1.0 2.0 3.0 4.0
512.0 513.0 514.0 515.0 516.0
1024.0 1025.0 1026.0 1027.0 1028.0
1536.0 1537.0 1538.0 1539.0 1540.0
2048.0 2049.0 2050.0 2051.0 2052.0

Ideal output data
0.0 -1.0 -2.0 -3.0 -4.0
-512.0 -513.0 -514.0 -515.0 -516.0
-1024.0 -1025.0 -1026.0 -1027.0 -1028.0
-1536.0 -1537.0 -1538.0 -1539.0 -1540.0
-2048.0 -2049.0 -2050.0 -2051.0 -2052.0

1.0 0.0 -1.0 -2.0 -3.0
-511.0 -512.0 -513.0 -514.0 -515.0
-1023.0 -1024.0 -1025.0 -1026.0 -1027.0
-1535.0 -1536.0 -1537.0 -1538.0 -1539.0
-2047.0 -2048.0 -2049.0 -2050.0 -2051.0

2.0 1.0 0.0 -1.0 -2.0
-510.0 -511.0 -512.0 -513.0 -514.0
-1022.0 -1023.0 -1024.0 -1025.0 -1026.0
-1534.0 -1535.0 -1536.0 -1537.0 -1538.0
-2046.0 -2047.0 -2048.0 -2049.0 -2050.0

3.0 2.0 1.0 0.0 -1.0
-509.0 -510.0 -511.0 -512.0 -513.0
-1021.0 -1022.0 -1023.0 -1024.0 -1025.0
-1533.0 -1534.0 -1535.0 -1536.0 -1537.0
-2045.0 -2046.0 -2047.0 -2048.0 -2049.0

4.0 3.0 2.0 1.0 0.0
-508.0 -509.0 -510.0 -511.0 -512.0
-1020.0 -1021.0 -1022.0 -1023.0 -1024.0
-1532.0 -1533.0 -1534.0 -1535.0 -1536.0
-2044.0 -2045.0 -2046.0 -2047.0 -2048.0

Covering 2D data of 512 * 512 * 5: Grid size is 64 x 64, each block has 8 x 8 threads

Time: 0.995 msec, 1317.00 Mtexlookups/sec
Comparing kernel output to expected data return 1

Actual output data
0.0 -1.0 -2.0 -3.0 -4.0
-512.0 -513.0 -514.0 -515.0 -516.0
-1024.0 -1025.0 -1026.0 -1027.0 -1028.0
-1536.0 -1537.0 -1538.0 -1539.0 -1540.0
-2048.0 -2049.0 -2050.0 -2051.0 -2052.0

1.0 0.0 -1.0 -2.0 -3.0
-511.0 -512.0 -513.0 -514.0 -515.0
-1023.0 -1024.0 -1025.0 -1026.0 -1027.0
-1535.0 -1536.0 -1537.0 -1538.0 -1539.0
-2047.0 -2048.0 -2049.0 -2050.0 -2051.0

2.0 1.0 0.0 -1.0 -2.0
-510.0 -511.0 -512.0 -513.0 -514.0
-1022.0 -1023.0 -1024.0 -1025.0 -1026.0
-1534.0 -1535.0 -1536.0 -1537.0 -1538.0
-2046.0 -2047.0 -2048.0 -2049.0 -2050.0

3.0 2.0 1.0 0.0 -1.0
-509.0 -510.0 -511.0 -512.0 -513.0
-1021.0 -1022.0 -1023.0 -1024.0 -1025.0
-1533.0 -1534.0 -1535.0 -1536.0 -1537.0
-2045.0 -2046.0 -2047.0 -2048.0 -2049.0

4.0 3.0 2.0 1.0 0.0
-508.0 -509.0 -510.0 -511.0 -512.0
-1020.0 -1021.0 -1022.0 -1023.0 -1024.0
-1532.0 -1533.0 -1534.0 -1535.0 -1536.0
-2044.0 -2045.0 -2046.0 -2047.0 -2048.0
▶ Takeaways
● Unlike the earlier cubemap-texture sample, the CUDA 3D array is allocated with the flag cudaArrayLayered instead of cudaArrayCubemap, and the extent parameters must be adjusted to match; see the sketch below.
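A minimal sketch of that difference, assuming the same float format and the 512 × 512 × 5 layout used above (the cubemap extent is illustrative):

cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

// Layered 2D array: the extent's depth is the layer count (any positive value)
cudaArray *layeredArray;
cudaMalloc3DArray(&layeredArray, &desc, make_cudaExtent(512, 512, 5), cudaArrayLayered);

// Cubemap array: width must equal height, and depth must be exactly 6 (one slice per face)
cudaArray *cubemapArray;
cudaMalloc3DArray(&cubemapArray, &desc, make_cudaExtent(512, 512, 6), cudaArrayCubemap);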