《GPU高性能编程CUDA实战》第八章图形互操作性

▶ OpenGL与DirectX，等待填坑。

● basic_interop

 #include <stdio.h>

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include "cuda.h"

 #include "cuda_gl_interop.h"

 #include "D:\Code\CUDA\book\common\book.h"

 #include "D:\Code\CUDA\book\common\cpu_bitmap.h"

 // OpenGL buffer-object entry points. They start out NULL and are resolved at
 // run time in main() through GET_PROC_ADDRESS once a GL context exists.
 PFNGLBINDBUFFERARBPROC    glBindBuffer = NULL;

 PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;

 PFNGLGENBUFFERSARBPROC    glGenBuffers = NULL;

 PFNGLBUFFERDATAARBPROC    glBufferData = NULL;

 // Edge length in pixels: used for the window size, the pixel buffer size and
 // the kernel launch grid.
 #define     DIM    512

 // OpenGL name of the pixel buffer object (filled in by glGenBuffers in main).
 GLuint  bufferObj;

 // CUDA-side handle for bufferObj (filled in by cudaGraphicsGLRegisterBuffer).
 cudaGraphicsResource *resource;

 // Fills an RGBA image with a green interference pattern. Based on the ripple
 // code, but writes uchar4, which is the pixel type graphics interop uses
 // (see screenshot - basic2.png).
 //
 // ptr: output buffer, one uchar4 per launched thread.
 //
 // Launch layout: 2D grid of 2D blocks. The row pitch is derived from
 // blockDim.x * gridDim.x and there is no bounds check, so the launch must
 // cover the image exactly (DIM x DIM threads in total).
 __global__ void kernel(uchar4 *ptr)
 {
     // map from threadIdx/blockIdx to pixel position
     int x = threadIdx.x + blockIdx.x * blockDim.x;
     int y = threadIdx.y + blockIdx.y * blockDim.y;
     int offset = x + y * blockDim.x * gridDim.x;

     // normalise the pixel position to [-0.5, 0.5)
     float fx = x / (float)DIM - 0.5f;
     float fy = y / (float)DIM - 0.5f;

     // 128 + 127 * sin(...) stays within [1, 255], so the cast cannot wrap
     unsigned char green = (unsigned char)(128.0f + 127.0f *
         sinf(fabsf(fx * 100.0f) - fabsf(fy * 100.0f)));

     // accessing uchar4 vs unsigned char*: one struct per pixel
     ptr[offset].x = 0;
     ptr[offset].y = green;
     ptr[offset].z = 0;
     ptr[offset].w = 255;    // fully opaque
 }

 // GLUT keyboard callback: on ESC, releases the CUDA/OpenGL resources and
 // terminates the process. All other keys are ignored.
 //
 // key:  ASCII code of the pressed key.
 // x, y: mouse position at the time of the key press (unused).
 static void key_func(unsigned char key, int x, int y)
 {
     switch (key)
     {
     case 27:    // ESC
         // clean up OpenGL and CUDA: unregister from CUDA first, then unbind
         // (buffer name 0) and delete the GL buffer object
         // HANDLE_ERROR is expected to come from the included book.h
         HANDLE_ERROR(cudaGraphicsUnregisterResource(resource));
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
         glDeleteBuffers(1, &bufferObj);
         exit(0);
     }
 }

 // GLUT display callback: draws the DIM x DIM RGBA image from the currently
 // bound pixel unpack buffer and presents it.
 static void draw_func(void)
 {
     // we pass zero as the last parameter, because our bufferObj is now
     // the source, and the field switches from being a pointer to a
     // bitmap to now mean an offset into a bitmap object
     glDrawPixels(DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0);
     glutSwapBuffers();
 }

 // basic_interop entry point: creates a GLUT window and a GL pixel buffer
 // object, registers the buffer with CUDA, fills it once with kernel(), and
 // then hands control to GLUT, which displays the buffer until ESC is pressed.
 //
 // HANDLE_ERROR is expected to come from the included book.h.
 int main(int argc, char **argv)
 {
     cudaDeviceProp  prop;
     int dev;

     // ask for any device with compute capability 1.0 or better
     memset(&prop, 0, sizeof(cudaDeviceProp));
     prop.major = 1;
     prop.minor = 0;
     HANDLE_ERROR(cudaChooseDevice(&dev, &prop));

     // tell CUDA which dev we will be using for graphic interop
     // from the programming guide:  Interoperability with OpenGL
     //     requires that the CUDA device be specified by
     //     cudaGLSetGLDevice() before any other runtime calls.
     HANDLE_ERROR(cudaGLSetGLDevice(dev));

     // these GLUT calls need to be made before the other OpenGL
     // calls, else we get a seg fault
     glutInit(&argc, argv);
     glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
     glutInitWindowSize(DIM, DIM);
     glutCreateWindow("bitmap");     // initialise and create the window

     // resolve the buffer-object entry points (requires a GL context)
     glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
     glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
     glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
     glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

     // the first three are standard OpenGL, the 4th is the CUDA reg
     // of the bitmap these calls exist starting in OpenGL 1.5
     glGenBuffers(1, &bufferObj);
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
     // 4 bytes per pixel (RGBA, one uchar4)
     glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB);

     // register bufferObj with CUDA as a graphics resource
     HANDLE_ERROR(cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone));

     // do work with the memory dst being on the GPU, gotten via mapping
     HANDLE_ERROR(cudaGraphicsMapResources(1, &resource, NULL));
     uchar4* devPtr;
     size_t  size;
     HANDLE_ERROR(cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource));

     // 16 x 16 threads per block; DIM is a multiple of 16, so the grid
     // covers the image exactly
     dim3    grids(DIM / 16, DIM / 16);
     dim3    threads(16, 16);
     kernel<<<grids, threads>>>(devPtr);
     HANDLE_ERROR(cudaGetLastError());   // catch launch-configuration errors

     // hand the buffer back to OpenGL before it is drawn
     HANDLE_ERROR(cudaGraphicsUnmapResources(1, &resource, NULL));

     // set up GLUT and kick off main loop
     glutKeyboardFunc(key_func);
     glutDisplayFunc(draw_func);
     glutMainLoop();

     getchar();
     return 0;
 }

● ripple

 #include <stdio.h>

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include "cuda.h"

 #include "cuda_gl_interop.h"

 #include "D:\Code\CUDA\book\common\book.h"

 #include "D:\Code\CUDA\book\common\gpu_anim.h"

 // Edge length in pixels of the animated bitmap; also sizes the launch grid.
 #define DIM 1024

 // Renders one frame of a grey-scale ripple centred in a DIM x DIM image.
 //
 // ptr:   output buffer, one uchar4 per launched thread.
 // ticks: animation time step; shifts the phase of the ripple.
 //
 // Launch layout: 2D grid of 2D blocks. The row pitch is derived from
 // blockDim.x * gridDim.x and there is no bounds check, so the launch must
 // cover the image exactly (DIM x DIM threads in total).
 __global__ void kernel(uchar4 *ptr, int ticks)
 {
     // map from threadIdx/blockIdx to pixel position
     int x = threadIdx.x + blockIdx.x * blockDim.x;
     int y = threadIdx.y + blockIdx.y * blockDim.y;
     int offset = x + y * blockDim.x * gridDim.x;

     // position relative to the image centre and its distance from it
     float fx = x - DIM / 2;
     float fy = y - DIM / 2;
     float d = sqrtf(fx * fx + fy * fy);

     // cosine wave in distance and time, damped by 1 / (d/10 + 1)
     unsigned char grey = (unsigned char)(128.0f + 127.0f *
         cosf(d / 10.0f - ticks / 7.0f) / (d / 10.0f + 1.0f));

     ptr[offset].x = grey;
     ptr[offset].y = grey;
     ptr[offset].z = grey;
     ptr[offset].w = 255;    // fully opaque
 }

 // Per-frame callback handed to GPUAnimBitmap::anim_and_exit: launches the
 // ripple kernel over the whole image.
 //
 // pixels: output buffer passed on to kernel().
 // (unnamed void*): user data pointer, unused here.
 // ticks:  animation time step forwarded to the kernel.
 void generate_frame(uchar4 *pixels, void*, int ticks)
 {
     // 16 x 16 threads per block; DIM is a multiple of 16, so the grid
     // covers the image exactly
     dim3    grids(DIM / 16, DIM / 16);
     dim3    threads(16, 16);
     kernel<<<grids, threads>>>(pixels, ticks);
 }

 // ripple entry point: creates a DIM x DIM animated bitmap with no user data
 // and runs the animation with generate_frame as the per-frame callback and
 // no cleanup callback.
 int main(void)
 {
     GPUAnimBitmap  bitmap(DIM, DIM, NULL);

     bitmap.anim_and_exit((void(*)(uchar4*, void*, int))generate_frame, NULL);

     getchar();
     return 0;
 }

● heat

 #include <stdio.h>

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include "cuda.h"

 #include "cuda_gl_interop.h"

 #include "D:\Code\CUDA\book\common\book.h"

 #include "D:\Code\CUDA\book\common\gpu_anim.h"

 // Edge length of the square simulation grid and of the displayed bitmap.
 #define DIM 1024

 // Temperatures written into the constant (heater) grid in main().
 #define MAX_TEMP 1.0f

 #define MIN_TEMP 0.0001f

 // Weight of the neighbour difference term in blend_kernel.
 #define SPEED   0.25f

 // these exist on the GPU side
 // Texture references bound in main() to dev_constSrc, dev_inSrc and
 // dev_outSrc respectively.

 texture<float>  texConstSrc;

 texture<float>  texIn;

 texture<float>  texOut;

 // One heat-diffusion step: each cell is updated by a scaled value based on
 // itself and its four nearest neighbours, read through a 1D texture.
 //
 // dst:    output grid, one float per launched thread.
 // dstOut: selects the input texture - true reads texIn, false reads texOut.
 //
 // Launch layout: 2D grid of 2D blocks covering exactly DIM x DIM threads.
 // The neighbour offsets assume a row pitch of DIM, so blockDim.x * gridDim.x
 // must equal DIM. There is no bounds check.
 __global__ void blend_kernel(float *dst, bool dstOut)
 {
     // map from threadIdx/blockIdx to pixel position
     int x = threadIdx.x + blockIdx.x * blockDim.x;
     int y = threadIdx.y + blockIdx.y * blockDim.y;
     int offset = x + y * blockDim.x * gridDim.x;

     // horizontal neighbours; at the edges fall back to the cell itself
     int left = offset - 1;
     int right = offset + 1;
     if (x == 0)       left++;
     if (x == DIM - 1) right--;

     // vertical neighbours; at the edges fall back to the cell itself
     int top = offset - DIM;
     int bottom = offset + DIM;
     if (y == 0)       top += DIM;
     if (y == DIM - 1) bottom -= DIM;

     float   t, l, c, r, b;
     if (dstOut) {
         t = tex1Dfetch(texIn, top);
         l = tex1Dfetch(texIn, left);
         c = tex1Dfetch(texIn, offset);
         r = tex1Dfetch(texIn, right);
         b = tex1Dfetch(texIn, bottom);
     }
     else {
         t = tex1Dfetch(texOut, top);
         l = tex1Dfetch(texOut, left);
         c = tex1Dfetch(texOut, offset);
         r = tex1Dfetch(texOut, right);
         b = tex1Dfetch(texOut, bottom);
     }

     // move the cell towards the average of its four neighbours
     dst[offset] = c + SPEED * (t + b + r + l - 4.0f * c);
 }

 // Copies the non-zero cells of the constant (heater) grid, read through
 // texConstSrc, into iptr; cells whose constant value is zero are left
 // untouched.
 //
 // iptr: grid to overwrite, one float per launched thread.
 //
 // Launch layout: 2D grid of 2D blocks; the row pitch is blockDim.x * gridDim.x
 // and there is no bounds check, so the launch must cover the grid exactly.
 //
 // NOTE - texOffsetConstSrc could either be passed as a
 // parameter to this function, or passed in __constant__ memory
 // if we declared it as a global above, it would be
 // a parameter here:
 // __global__ void copy_const_kernel( float *iptr,
 //                                    size_t texOffset )
 __global__ void copy_const_kernel(float *iptr)
 {
     // map from threadIdx/blockIdx to pixel position
     int x = threadIdx.x + blockIdx.x * blockDim.x;
     int y = threadIdx.y + blockIdx.y * blockDim.y;
     int offset = x + y * blockDim.x * gridDim.x;

     float c = tex1Dfetch(texConstSrc, offset);
     // zero means "no heater here"
     if (c != 0.0f)
         iptr[offset] = c;
 }

 // State shared between main(), the per-frame update routine and the cleanup
 // routine.
 struct DataBlock
 {
     // device grids: the two ping-pong buffers and the constant heater grid
     float       *dev_inSrc, *dev_outSrc, *dev_constSrc;

     // events bracketing one frame's GPU work
     cudaEvent_t start;
     cudaEvent_t stop;

     // running totals for the average frame-time report
     float       totalTime;
     float       frames;
 };

 // Per-frame callback: advances the heat simulation by a fixed number of steps,
 // converts the resulting grid to colours in outputBitmap, and prints the
 // running average GPU time per frame.
 //
 // outputBitmap: destination pixel buffer for float_to_color.
 // d:            simulation state (device grids, timing events, counters).
 // ticks:        animation time step (unused).
 //
 // HANDLE_ERROR is expected to come from the included book.h.
 void anim_gpu(uchar4* outputBitmap, DataBlock *d, int ticks)
 {
     // Must be even: the buffers swap roles on every step, and the colour
     // conversion below reads dev_inSrc, which only holds the newest grid
     // after an even number of steps.
     const int kStepsPerFrame = 90;

     HANDLE_ERROR(cudaEventRecord(d->start, 0));

     // 16 x 16 threads per block; DIM is a multiple of 16
     dim3    blocks(DIM / 16, DIM / 16);
     dim3    threads(16, 16);

     // since tex is global and bound, we have to use a flag to
     // select which is in/out per iteration
     volatile bool dstOut = true;
     for (int i = 0; i < kStepsPerFrame; i++)
     {
         float   *in, *out;
         if (dstOut)
         {
             in = d->dev_inSrc;
             out = d->dev_outSrc;
         }
         else
         {
             out = d->dev_inSrc;
             in = d->dev_outSrc;
         }
         // re-impose the heaters, then diffuse into the other buffer
         copy_const_kernel<<<blocks, threads>>>(in);
         blend_kernel<<<blocks, threads>>>(out, dstOut);
         dstOut = !dstOut;
     }
     float_to_color<<<blocks, threads>>>(outputBitmap, d->dev_inSrc);
     HANDLE_ERROR(cudaGetLastError());   // catch launch errors from this frame

     HANDLE_ERROR(cudaEventRecord(d->stop, 0));
     HANDLE_ERROR(cudaEventSynchronize(d->stop));

     float   elapsedTime;
     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
     d->totalTime += elapsedTime;
     ++d->frames;
     printf("Average Time per frame:  %3.1f ms\n", d->totalTime / d->frames);
 }

 // Cleanup callback: unbinds the three texture references, frees the three
 // device grids and destroys the two timing events held in *d.
 void anim_exit(DataBlock *d)
 {
     // detach the texture references first
     cudaUnbindTexture(texIn);
     cudaUnbindTexture(texOut);
     cudaUnbindTexture(texConstSrc);

     // release the device grids
     float *grids[3] = { d->dev_inSrc, d->dev_outSrc, d->dev_constSrc };
     for (int g = 0; g < 3; ++g)
     {
         cudaFree(grids[g]);
     }

     // release the timing events
     cudaEvent_t timers[2] = { d->start, d->stop };
     for (int e = 0; e < 2; ++e)
     {
         cudaEventDestroy(timers[e]);
     }
 }

 // heat entry point: allocates the simulation grids, binds them to the texture
 // references, uploads the constant heater grid and the initial temperature
 // field, then runs the animation with anim_gpu / anim_exit as callbacks.
 //
 // HANDLE_ERROR is expected to come from the included book.h.
 int main(void)
 {
     DataBlock   data;
     GPUAnimBitmap bitmap(DIM, DIM, &data);

     data.totalTime = 0;
     data.frames = 0;
     HANDLE_ERROR(cudaEventCreate(&data.start));
     HANDLE_ERROR(cudaEventCreate(&data.stop));

     // The grids reuse the bitmap's byte size as their allocation size.
     // NOTE(review): presumably 4 bytes per pixel, matching sizeof(float) -
     // confirm against GPUAnimBitmap::image_size().
     int imageSize = bitmap.image_size();

     HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, imageSize));
     HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, imageSize));
     HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, imageSize));

     HANDLE_ERROR(cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize));
     HANDLE_ERROR(cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize));
     HANDLE_ERROR(cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize));

     float *temp = (float*)malloc(imageSize);
     if (temp == NULL)
     {
         fprintf(stderr, "host allocation of %d bytes failed\n", imageSize);
         return 1;
     }

     // NOTE(review): the rectangle bounds below were missing from the source
     // this file was recovered from. The values are placeholders that keep
     // the original structure (half-open ranges, one hot and one cold heater,
     // one hot initial patch) - replace them with the intended coordinates.

     // constant-temperature (heater) cells; 0 means "no heater"
     for (int i = 0; i < DIM * DIM; i++)
     {
         temp[i] = 0;
         int x = i % DIM;
         int y = i / DIM;
         if ((x >= 300) && (x < 600) && (y >= 300) && (y < 600))
             temp[i] = MAX_TEMP;
         if ((x >= 700) && (x < 900) && (y >= 700) && (y < 900))
             temp[i] = MIN_TEMP;
     }
     HANDLE_ERROR(cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice));

     // initial temperature field
     for (int i = 0; i < DIM * DIM; i++)
     {
         temp[i] = 0.5f;
         int x = i % DIM;
         int y = i / DIM;
         if ((x >= 400) && (x < 500) && (y >= 800) && (y < 900))
             temp[i] = MAX_TEMP;
     }
     HANDLE_ERROR(cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice));
     free(temp);

     bitmap.anim_and_exit((void(*)(uchar4*, void*, int))anim_gpu, (void(*)(void*))anim_exit);

     getchar();
     return 0;
 }

《GPU高性能编程CUDA实战》第八章图形互操作性的更多相关文章

[问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应，并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
《GPU高性能编程CUDA实战》第四章简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
《GPU高性能编程CUDA实战》第十一章多GPU系统的CUDA C
▶ 本章介绍了多设备胸膛下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...
《GPU高性能编程CUDA实战》第六章常量内存
▶ 本章介绍了常量内存的使用,并给光线追踪的一个例子.介绍了结构cudaEvent_t及其在计时方面的使用. ● 章节代码,大意是有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝 ...
《GPU高性能编程CUDA实战》第五章线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...
《GPU高性能编程CUDA实战》附录二散列表
▶ 使用CPU和GPU分别实现散列表 ● CPU方法 #include <stdio.h> #include <time.h> #include "cuda_runt ...
《GPU高性能编程CUDA实战》第七章纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个个例子.分别使用了一维和二维纹理单元. ● 热传导(使用一维纹理) #include <stdio.h> #include "c ...
《GPU高性能编程CUDA实战》第三章 CUDA设备相关
▶ 这章介绍了与CUDA设备相关的参数,并给出了了若干用于查询参数的函数. ● 代码(已合并) #include <stdio.h> #include "cuda_runtime ...
《GPU高性能编程CUDA实战》附录四其他头文件
▶ cpu_bitmap.h #ifndef __CPU_BITMAP_H__ #define __CPU_BITMAP_H__ #include "gl_helper.h" st ...

随机推荐

python去掉字符串'\xa0'
AssertionError: '5\xa0e\xa0*\xa0*\xa0*\xa05' != '5e***5'mystr = '5\xa0e\xa0*\xa0*\xa0*\xa05'mystr = ...
杨恒说李的算法好-我问你听谁说的-龙哥说的（java中常见的List就2个）（list放入的是原子元素）
1.List中常用的方法集合: 函数原型 ******************************************* ********************************** ...
repo学习笔记
1. 遍历所有的git仓库,并在每个仓库执行-c所指定的命令(被执行的命令不限于git命令,而是任何被系统支持的命令,比如:ls . pwd .cp 等 . $ repo forall -c &quo ...
Apache和Nginx的Rewrite规则对比
一.Apache的rewrite 1.Rewrite规则简介: Rewirte主要的功能就是实现URL的跳转,它的正则表达式是基于Perl语言.可基于服务器级的(httpd.conf)和目录级的(.h ...
pycharm -- 导入主题(theme) and 修改背景颜色（护眼色）
前情提要众所周知,随着python语言的不断流行,越来越多的程序员开始用python来开发自己的项目以及产品. pycharm作为一款流行的IDE,被越来越多的程序员所接受和使用. 尽管pychar ...
TFTP error: 'Only absolute filenames allowed' (2)
hisilicon # tftp 0x82000000 u-boot-hi3518ev200.bin Hisilicon ETH net controler MAC: ----- eth0 : phy ...
基于 FastAdmin 开发后台流程（持续更新）
使用 git init 初始化增加一个自己的git 原始仓库,用于存放自己的代码. 增加一个 fastadmin 的仓库,为了方便以后与官方同步. 自己修改的代码 git Push 到自己的仓库将 ...
Open Flash Chart 之线图
天公司要求开发一个曲线图,简单看了一下之前公司的一个系统,发现一个曲线图效果还不错,查了一下叫OpenFlashChart,还是很不错的,很多人用.研究了一下,发现还不错,特地写了个DEMO测试下. ...
Asp.Net Core MVC框架内置过滤器
第一部分.MVC框架内置过滤器下图展示了Asp.Net Core MVC框架默认实现的过滤器的执行顺序: Authorization Filters:身份验证过滤器,处在整个过滤器通道的最顶层.对应 ...
JAVA架构师面试题一
基础题目 Java线程的状态进程和线程的区别,进程间如何通讯,线程间如何通讯 HashMap的数据结构是什么?如何实现的.和HashTable,ConcurrentHashMap的区别 Cookie ...

《GPU高性能编程CUDA实战》第八章 图形互操作性

《GPU高性能编程CUDA实战》第八章 图形互操作性的更多相关文章

随机推荐

热门专题

《GPU高性能编程CUDA实战》第八章图形互操作性

《GPU高性能编程CUDA实战》第八章图形互操作性的更多相关文章