《GPU高性能编程CUDA实战》第八章 图形互操作性
▶ OpenGL与DirectX,等待填坑。
● basic_interop
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_bitmap.h"

// OpenGL buffer-object entry points. They start out NULL and are resolved at
// run time in main() through GET_PROC_ADDRESS, after the GLUT window exists.
PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;

// Width and height, in pixels, of both the window and the shared image.
#define DIM 512

// OpenGL name of the pixel buffer, and the CUDA handle registered for it.
GLuint bufferObj;
cudaGraphicsResource *resource;

// based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png
// Fills a DIM x DIM RGBA image with a green interference pattern.
//
// ptr: device pointer to DIM * DIM uchar4 pixels, stored row by row.
// Launch with a 2D grid covering at least DIM x DIM threads; threads that
// fall outside the image return without writing.
__global__ void kernel(uchar4 *ptr)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;
    int offset = x + y * DIM;

    // now calculate the value at that position
    float fx = x / (float)DIM - 0.5f;
    float fy = y / (float)DIM - 0.5f;
    unsigned char green =
        (unsigned char)(128.0f + 127.0f * sinf(fabsf(fx * 100.0f) - fabsf(fy * 100.0f)));

    // accessing uchar4 vs unsigned char*
    ptr[offset].x = 0;
    ptr[offset].y = green;
    ptr[offset].z = 0;
    ptr[offset].w = 255;
}

// GLUT keyboard callback: on Esc (ASCII 27) release the CUDA/OpenGL resources
// and terminate the process. Other keys are ignored; x and y are unused.
static void key_func(unsigned char key, int x, int y)
{
    switch (key)
    {
    case 27:
        // clean up OpenGL and CUDA
        HANDLE_ERROR(cudaGraphicsUnregisterResource(resource));
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
        glDeleteBuffers(1, &bufferObj);
        exit(0);
    }
}

// GLUT display callback: draws the currently bound pixel buffer and swaps.
static void draw_func(void)
{
    // we pass zero as the last parameter, because our bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels(DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glutSwapBuffers();
}

// Creates a GLUT window and an OpenGL pixel buffer, lets one CUDA kernel
// launch fill that buffer, then hands control to the GLUT main loop.
int main(int argc, char **argv)
{
    cudaDeviceProp prop;
    int dev;

    memset(&prop, 0, sizeof(cudaDeviceProp));
    prop.major = 1;
    prop.minor = 0;
    HANDLE_ERROR(cudaChooseDevice(&dev, &prop));

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide: Interoperability with OpenGL
    // requires that the CUDA device be specified by
    // cudaGLSetGLDevice() before any other runtime calls.
    HANDLE_ERROR(cudaGLSetGLDevice(dev));

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
    glutInitWindowSize(DIM, DIM);
    glutCreateWindow("bitmap"); // initialise GLUT and create the window

    // resolve the buffer-object entry points
    glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
    if (glBindBuffer == NULL || glDeleteBuffers == NULL ||
        glGenBuffers == NULL || glBufferData == NULL)
    {
        fprintf(stderr, "OpenGL buffer-object functions are not available\n");
        return 1;
    }

    // the first three are standard OpenGL, the 4th is the CUDA reg
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers(1, &bufferObj);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
    // 4 bytes per pixel (one uchar4)
    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB);

    // register bufferObj as a CUDA graphics resource
    HANDLE_ERROR(cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone));

    // do work with the memory dst being on the GPU, gotten via mapping
    HANDLE_ERROR(cudaGraphicsMapResources(1, &resource, NULL));
    uchar4 *devPtr;
    size_t size;
    HANDLE_ERROR(cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource));

    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(devPtr);
    HANDLE_ERROR(cudaGetLastError()); // a bad launch configuration is reported here
    HANDLE_ERROR(cudaGraphicsUnmapResources(1, &resource, NULL));

    // set up GLUT and kick off main loop
    glutKeyboardFunc(key_func);
    glutDisplayFunc(draw_func);
    glutMainLoop();

    getchar();
    return 0;
}
● ripple
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h"

// Width and height of the animated image in pixels.
#define DIM 1024

// Writes one frame of a grey ripple centred on the image.
//
// ptr:   device pointer to the frame's uchar4 pixels, one row per grid row.
// ticks: frame counter; it shifts the phase of the wave.
// The row pitch is blockDim.x * gridDim.x, so the launch is expected to
// cover the image exactly (generate_frame launches DIM x DIM threads).
__global__ void kernel(uchar4 *ptr, int ticks)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position
    float fx = x - DIM / 2;
    float fy = y - DIM / 2;
    float d = sqrtf(fx * fx + fy * fy); // distance from the image centre
    unsigned char grey = (unsigned char)(128.0f + 127.0f * cosf(d / 10.0f - ticks / 7.0f) / (d / 10.0f + 1.0f));
    ptr[offset].x = grey;
    ptr[offset].y = grey;
    ptr[offset].z = grey;
    ptr[offset].w = 255;
}

// Per-frame callback handed to GPUAnimBitmap: launches kernel() over the whole
// DIM x DIM image. The second (user data) argument is unused.
void generate_frame(uchar4 *pixels, void*, int ticks)
{
    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(pixels, ticks);
    HANDLE_ERROR(cudaGetLastError()); // a bad launch configuration is reported here
}

// Creates the animated bitmap and runs it with generate_frame as the
// per-frame callback and no cleanup callback.
int main(void)
{
    GPUAnimBitmap bitmap(DIM, DIM, NULL);

    bitmap.anim_and_exit((void(*)(uchar4*, void*, int))generate_frame, NULL);

    getchar();
    return 0;
}
● heat
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h"

// Grid is DIM x DIM cells; temperatures are floats between MIN_TEMP and
// MAX_TEMP; SPEED scales how strongly each step pulls a cell toward its
// neighbours.
#define DIM 1024
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f

// these exist on the GPU side
// (texture references, bound to linear device memory in main())
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;

// this kernel takes in a 2-d array of floats
// it updates the value-of-interest by a scaled value based
// on itself and its nearest neighbors
// One relaxation step of the heat simulation on a DIM x DIM grid.
//
// dst:    output buffer of DIM * DIM floats, row-major.
// dstOut: true  -> read the current field through texIn,
//         false -> read it through texOut.
// Each cell becomes c + SPEED * (top + bottom + left + right - 4 * c).
// Cells on the border reuse their own value in place of a missing neighbour.
// Launch with a 2D grid covering at least DIM x DIM threads.
__global__ void blend_kernel(float *dst, bool dstOut)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;
    int offset = x + y * DIM;

    // horizontal neighbours, clamped at the left/right edge
    int left = offset - 1;
    int right = offset + 1;
    if (x == 0) left++;
    if (x == DIM - 1) right--;

    // vertical neighbours, clamped at the top/bottom edge
    int top = offset - DIM;
    int bottom = offset + DIM;
    if (y == 0) top += DIM;
    if (y == DIM - 1) bottom -= DIM;

    float t, l, c, r, b;
    if (dstOut)
    {
        t = tex1Dfetch(texIn, top);
        l = tex1Dfetch(texIn, left);
        c = tex1Dfetch(texIn, offset);
        r = tex1Dfetch(texIn, right);
        b = tex1Dfetch(texIn, bottom);
    }
    else
    {
        t = tex1Dfetch(texOut, top);
        l = tex1Dfetch(texOut, left);
        c = tex1Dfetch(texOut, offset);
        r = tex1Dfetch(texOut, right);
        b = tex1Dfetch(texOut, bottom);
    }
    dst[offset] = c + SPEED * (t + b + r + l - 4.0f * c);
}
// NOTE - texOffsetConstSrc could either be passed as a
// parameter to this function, or passed in __constant__ memory
// if we declared it as a global above, it would be
// a parameter here:
// __global__ void copy_const_kernel( float *iptr,
// size_t texOffset )
// Re-imposes the fixed-temperature cells on a field.
//
// iptr: DIM * DIM floats, row-major. Every cell whose value in texConstSrc is
// non-zero is overwritten with that value; all other cells are left alone.
// Launch with a 2D grid covering at least DIM x DIM threads.
__global__ void copy_const_kernel(float *iptr)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;
    int offset = x + y * DIM;

    float c = tex1Dfetch(texConstSrc, offset);
    if (c != 0.0f) // zero means "no fixed temperature here"
        iptr[offset] = c;
}
// globals needed by the update routine
// State shared between main(), anim_gpu() and anim_exit().
struct DataBlock
{
    float *dev_inSrc;       // device buffer: one of the two ping-pong fields
    float *dev_outSrc;      // device buffer: the other ping-pong field
    float *dev_constSrc;    // device buffer: fixed-temperature cells
    cudaEvent_t start, stop; // events used to time each frame
    float totalTime;        // accumulated frame time in milliseconds
    float frames;           // number of frames rendered so far
};

// Per-frame callback: advances the simulation by 90 steps, converts the
// resulting field to colours in outputBitmap, and prints the running average
// frame time. ticks is not used.
void anim_gpu(uchar4* outputBitmap, DataBlock *d, int ticks)
{
    HANDLE_ERROR(cudaEventRecord(d->start, 0));
    dim3 blocks(DIM / 16, DIM / 16);
    dim3 threads(16, 16);

    // since tex is global and bound, we have to use a flag to
    // select which is in/out per iteration
    volatile bool dstOut = true;
    // The step count is even, so the last blend writes into dev_inSrc,
    // which is the buffer float_to_color reads below.
    for (int i = 0; i < 90; i++)
    {
        float *in, *out;
        if (dstOut)
        {
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }
        else
        {
            out = d->dev_inSrc;
            in = d->dev_outSrc;
        }
        copy_const_kernel<<<blocks, threads>>>(in);
        blend_kernel<<<blocks, threads>>>(out, dstOut);
        dstOut = !dstOut;
    }
    float_to_color<<<blocks, threads>>>(outputBitmap, d->dev_inSrc);
    HANDLE_ERROR(cudaGetLastError()); // a bad launch configuration is reported here

    HANDLE_ERROR(cudaEventRecord(d->stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(d->stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
}
// clean up memory allocated on the GPU
// Cleanup callback: unbinds the three textures, frees the three device
// buffers and destroys the timing events held in d.
void anim_exit(DataBlock *d)
{
    HANDLE_ERROR(cudaUnbindTexture(texIn));
    HANDLE_ERROR(cudaUnbindTexture(texOut));
    HANDLE_ERROR(cudaUnbindTexture(texConstSrc));
    HANDLE_ERROR(cudaFree(d->dev_inSrc));
    HANDLE_ERROR(cudaFree(d->dev_outSrc));
    HANDLE_ERROR(cudaFree(d->dev_constSrc));
    HANDLE_ERROR(cudaEventDestroy(d->start));
    HANDLE_ERROR(cudaEventDestroy(d->stop));
}

// Allocates and binds the device buffers, uploads the fixed-temperature cells
// and the initial field, then runs the animation with anim_gpu / anim_exit.
int main(void)
{
    DataBlock data;
    GPUAnimBitmap bitmap(DIM, DIM, &data);
    data.totalTime = 0;
    data.frames = 0;
    HANDLE_ERROR(cudaEventCreate(&data.start));
    HANDLE_ERROR(cudaEventCreate(&data.stop));

    // NOTE(review): the code below indexes temp[] as DIM * DIM floats, so it
    // assumes image_size() is at least DIM * DIM * sizeof(float) — confirm.
    int imageSize = bitmap.image_size();

    HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc, imageSize));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc, imageSize));
    HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc, imageSize));
    HANDLE_ERROR(cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize));
    HANDLE_ERROR(cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize));
    HANDLE_ERROR(cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize));

    float *temp = (float*)malloc(imageSize);
    if (temp == NULL)
    {
        fprintf(stderr, "host allocation of %d bytes failed\n", imageSize);
        return 1;
    }

    // NOTE(review): the numeric bounds of the three rectangles below were
    // missing from the text this code was recovered from. The values used
    // here are placeholders that lie inside the DIM x DIM grid — confirm them
    // against the intended scene before relying on the output.

    // fixed-temperature cells: 0 means "not fixed"
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 300) && (x < 600) && (y >= 310) && (y < 601))
            temp[i] = MAX_TEMP;
        if ((x >= 400) && (x < 500) && (y >= 800) && (y < 900))
            temp[i] = MIN_TEMP;
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice));

    // initial temperature field
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 0) && (x < 200) && (y >= 800) && (y < DIM))
            temp[i] = MAX_TEMP;
    }
    HANDLE_ERROR(cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice));
    free(temp);

    bitmap.anim_and_exit((void(*)(uchar4*, void*, int))anim_gpu, (void(*)(void*))anim_exit);
    getchar();
    return 0;
}
《GPU高性能编程CUDA实战》第八章 图形互操作性的更多相关文章
- [问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应,并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
- 《GPU高性能编程CUDA实战》第四章 简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
- 《GPU高性能编程CUDA实战》第十一章 多GPU系统的CUDA C
▶ 本章介绍了多设备胸膛下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...
- 《GPU高性能编程CUDA实战》第六章 常量内存
▶ 本章介绍了常量内存的使用,并给出光线追踪的一个例子.介绍了结构cudaEvent_t及其在计时方面的使用. ● 章节代码,大意是有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝 ...
- 《GPU高性能编程CUDA实战》第五章 线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...
- 《GPU高性能编程CUDA实战》附录二 散列表
▶ 使用CPU和GPU分别实现散列表 ● CPU方法 #include <stdio.h> #include <time.h> #include "cuda_runt ...
- 《GPU高性能编程CUDA实战》第七章 纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个例子.分别使用了一维和二维纹理单元. ● 热传导(使用一维纹理) #include <stdio.h> #include "c ...
- 《GPU高性能编程CUDA实战》第三章 CUDA设备相关
▶ 这章介绍了与CUDA设备相关的参数,并给出了若干用于查询参数的函数. ● 代码(已合并) #include <stdio.h> #include "cuda_runtime ...
- 《GPU高性能编程CUDA实战》附录四 其他头文件
▶ cpu_bitmap.h #ifndef __CPU_BITMAP_H__ #define __CPU_BITMAP_H__ #include "gl_helper.h" st ...
随机推荐
- python去掉字符串'\xa0'
AssertionError: '5\xa0e\xa0*\xa0*\xa0*\xa05' != '5e***5'mystr = '5\xa0e\xa0*\xa0*\xa0*\xa05'mystr = ...
- 杨恒说李的算法好-我问你听谁说的-龙哥说的(java中常见的List就2个)(list放入的是原子元素)
1.List中常用的 方法集合: 函数原型 ******************************************* ********************************** ...
- repo学习笔记
1. 遍历所有的git仓库,并在每个仓库执行-c所指定的命令(被执行的命令不限于git命令,而是任何被系统支持的命令,比如:ls . pwd .cp 等 . $ repo forall -c &quo ...
- Apache和Nginx的Rewrite规则对比
一.Apache的rewrite 1.Rewrite规则简介: Rewrite主要的功能就是实现URL的跳转,它的正则表达式是基于Perl语言.可基于服务器级的(httpd.conf)和目录级的(.h ...
- pycharm -- 导入主题(theme) and 修改背景颜色(护眼色)
前情提要 众所周知,随着python语言的不断流行,越来越多的程序员开始用python来开发自己的项目以及产品. pycharm作为一款流行的IDE,被越来越多的程序员所接受和使用. 尽管pychar ...
- TFTP error: 'Only absolute filenames allowed' (2)
hisilicon # tftp 0x82000000 u-boot-hi3518ev200.bin Hisilicon ETH net controler MAC: ----- eth0 : phy ...
- 基于 FastAdmin 开发后台流程 (持续更新)
使用 git init 初始化 增加一个自己的git 原始仓库,用于存放自己的代码. 增加一个 fastadmin 的仓库,为了方便以后与官方同步. 自己修改的代码 git Push 到自己的仓库 将 ...
- Open Flash Chart 之线图
天公司要求开发一个曲线图,简单看了一下之前公司的一个系统,发现一个曲线图效果还不错,查了一下叫OpenFlashChart,还是很不错的,很多人用.研究了一下,发现还不错,特地写了个DEMO测试下. ...
- Asp.Net Core MVC框架内置过滤器
第一部分.MVC框架内置过滤器 下图展示了Asp.Net Core MVC框架默认实现的过滤器的执行顺序: Authorization Filters:身份验证过滤器,处在整个过滤器通道的最顶层.对应 ...
- JAVA架构师面试题 一
基础题目 Java线程的状态 进程和线程的区别,进程间如何通讯,线程间如何通讯 HashMap的数据结构是什么?如何实现的.和HashTable,ConcurrentHashMap的区别 Cookie ...