《GPU高性能编程CUDA实战》第八章 图形互操作性
▶ OpenGL与DirectX,等待填坑。
● basic_interop
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_bitmap.h" PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL; #define DIM 512 GLuint bufferObj;
cudaGraphicsResource *resource; // based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png
__global__ void kernel(uchar4 *ptr)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; // now calculate the value at that position
float fx = x / (float)DIM - 0.5f;
float fy = y / (float)DIM - 0.5f;
unsigned char green = + * sin(abs(fx * ) - abs(fy * ); // accessing uchar4 vs unsigned char*
ptr[offset].x = ;
ptr[offset].y = green;
ptr[offset].z = ;
ptr[offset].w = ;
} static void key_func(unsigned char key, int x, int y)
{
switch (key)
{
case :
// clean up OpenGL and CUDA
cudaGraphicsUnregisterResource(resource);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, );
glDeleteBuffers(, &bufferObj);
exit();
}
} static void draw_func(void)
{
// we pass zero as the last parameter, because out bufferObj is now
// the source, and the field switches from being a pointer to a
// bitmap to now mean an offset into a bitmap object
glDrawPixels(DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, );
glutSwapBuffers();
} int main(int argc, char **argv)
{
cudaDeviceProp prop;
int dev; memset(&prop, , sizeof(cudaDeviceProp));
prop.major = ;
prop.minor = ;
cudaChooseDevice(&dev, &prop); // tell CUDA which dev we will be using for graphic interop
// from the programming guide: Interoperability with OpenGL
// requires that the CUDA device be specified by
// cudaGLSetGLDevice() before any other runtime calls. cudaGLSetGLDevice(dev); // these GLUT calls need to be made before the other OpenGL
// calls, else we get a seg fault
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
glutInitWindowSize(DIM, DIM);
glutCreateWindow("bitmap");//初始化并创建一个窗口 //创建缓冲区
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData"); // the first three are standard OpenGL, the 4th is the CUDA reg
// of the bitmap these calls exist starting in OpenGL 1.5
glGenBuffers(, &bufferObj);// 将bufferObj注册为图形资源
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * , NULL, GL_DYNAMIC_DRAW_ARB); cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone); // do work with the memory dst being on the GPU, gotten via mapping
cudaGraphicsMapResources(, &resource, NULL);
uchar4* devPtr;
size_t size;
cudaGraphicsResourceGetMappedPointer((void**)&devPtr,&size,resource); dim3 grids(DIM / , DIM / );
dim3 threads(, );
kernel << <grids, threads >> >(devPtr);
cudaGraphicsUnmapResources(, &resource, NULL); // set up GLUT and kick off main loop
glutKeyboardFunc(key_func);
glutDisplayFunc(draw_func);
glutMainLoop(); getchar();
return;
}
● ripple
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h" #define DIM 1024 __global__ void kernel(uchar4 *ptr, int ticks)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; // now calculate the value at that position
float fx = x - DIM / ;
float fy = y - DIM / ;
float d = sqrtf(fx * fx + fy * fy);
unsigned char grey = (unsigned char)(128.0f + 127.0f *cos(d / 10.0f - ticks / 7.0f) / (d / 10.0f + 1.0f));
ptr[offset].x = grey;
ptr[offset].y = grey;
ptr[offset].z = grey;
ptr[offset].w = ;
} void generate_frame(uchar4 *pixels, void*, int ticks)
{
dim3 grids(DIM / , DIM / );
dim3 threads(, );
kernel << <grids, threads >> >(pixels, ticks);
} int main(void)
{
GPUAnimBitmap bitmap(DIM, DIM, NULL); bitmap.anim_and_exit((void(*)(uchar4*, void*, int))generate_frame, NULL); getchar();
return;
}
● heat
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h" #define DIM 1024
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f // these exist on the GPU side
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut; // this kernel takes in a 2-d array of floats
// it updates the value-of-interest by a scaled value based
// on itself and its nearest neighbors
__global__ void blend_kernel(float *dst,bool dstOut)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; int left = offset - ;
int right = offset + ;
if (x == ) left++;
if (x == DIM - ) right--; int top = offset - DIM;
int bottom = offset + DIM;
if (y == ) top += DIM;
if (y == DIM - ) bottom -= DIM; float t, l, c, r, b;
if (dstOut) {
t = tex1Dfetch(texIn, top);
l = tex1Dfetch(texIn, left);
c = tex1Dfetch(texIn, offset);
r = tex1Dfetch(texIn, right);
b = tex1Dfetch(texIn, bottom); }
else {
t = tex1Dfetch(texOut, top);
l = tex1Dfetch(texOut, left);
c = tex1Dfetch(texOut, offset);
r = tex1Dfetch(texOut, right);
b = tex1Dfetch(texOut, bottom);
}
dst[offset] = c + SPEED * (t + b + r + l - * c);
} // NOTE - texOffsetConstSrc could either be passed as a
// parameter to this function, or passed in __constant__ memory
// if we declared it as a global above, it would be
// a parameter here:
// __global__ void copy_const_kernel( float *iptr,
// size_t texOffset )
__global__ void copy_const_kernel(float *iptr)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; float c = tex1Dfetch(texConstSrc, offset);
if (c != )
iptr[offset] = c;
} // globals needed by the update routine
struct DataBlock
{
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
cudaEvent_t start, stop;
float totalTime;
float frames;
}; void anim_gpu(uchar4* outputBitmap, DataBlock *d, int ticks)
{
cudaEventRecord(d->start, );
dim3 blocks(DIM / , DIM / );
dim3 threads(, ); // since tex is global and bound, we have to use a flag to
// select which is in/out per iteration
volatile bool dstOut = true;
for (int i = ; i<; i++)
{
float *in, *out;
if (dstOut)
{
in = d->dev_inSrc;
out = d->dev_outSrc;
}
else
{
out = d->dev_inSrc;
in = d->dev_outSrc;
}
copy_const_kernel << <blocks, threads >> >(in);
blend_kernel << <blocks, threads >> >(out, dstOut);
dstOut = !dstOut;
}
float_to_color << <blocks, threads >> >(outputBitmap,d->dev_inSrc); cudaEventRecord(d->stop, );
cudaEventSynchronize(d->stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,d->start, d->stop);
d->totalTime += elapsedTime;
++d->frames;
printf("Average Time per frame: %3.1f ms\n",d->totalTime / d->frames);
} // clean up memory allocated on the GPU
void anim_exit(DataBlock *d)
{
cudaUnbindTexture(texIn);
cudaUnbindTexture(texOut);
cudaUnbindTexture(texConstSrc);
cudaFree(d->dev_inSrc);
cudaFree(d->dev_outSrc);
cudaFree(d->dev_constSrc);
cudaEventDestroy(d->start);
cudaEventDestroy(d->stop);
} int main(void)
{
DataBlock data;
GPUAnimBitmap bitmap(DIM, DIM, &data);
data.totalTime = ;
data.frames = ;
cudaEventCreate(&data.start);
cudaEventCreate(&data.stop); int imageSize = bitmap.image_size(); cudaMalloc((void**)&data.dev_inSrc, imageSize);
cudaMalloc((void**)&data.dev_outSrc, imageSize);
cudaMalloc((void**)&data.dev_constSrc, imageSize);
cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize);
cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize);
cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize); float *temp = (float*)malloc(imageSize);
for (int i = ; i < DIM*DIM; i++)// 恒温格点数据
{
temp[i] = ;
int x = i % DIM;
int y = i / DIM;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MAX_TEMP;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MIN_TEMP;
}
cudaMemcpy(data.dev_constSrc, temp,imageSize,cudaMemcpyHostToDevice); for (int i = ; i < DIM*DIM; i++)// 初始温度场数据
{
temp[i] = 0.5;
int x = i % DIM;
int y = i / DIM;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MAX_TEMP;
}
cudaMemcpy(data.dev_inSrc, temp,imageSize,cudaMemcpyHostToDevice);
free(temp); bitmap.anim_and_exit((void(*)(uchar4*, void*, int))anim_gpu,(void(*)(void*))anim_exit);
getchar();
return;
}
《GPU高性能编程CUDA实战》第八章 图形互操作性的更多相关文章
- [问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应,并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
- 《GPU高性能编程CUDA实战》第四章 简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
- 《GPU高性能编程CUDA实战》第十一章 多GPU系统的CUDA C
▶ 本章介绍了多设备胸膛下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...
- 《GPU高性能编程CUDA实战》第六章 常量内存
▶ 本章介绍了常量内存的使用,并给光线追踪的一个例子.介绍了结构cudaEvent_t及其在计时方面的使用. ● 章节代码,大意是有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝 ...
- 《GPU高性能编程CUDA实战》第五章 线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...
- 《GPU高性能编程CUDA实战》附录二 散列表
▶ 使用CPU和GPU分别实现散列表 ● CPU方法 #include <stdio.h> #include <time.h> #include "cuda_runt ...
- 《GPU高性能编程CUDA实战》第七章 纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个个例子.分别使用了一维和二维纹理单元. ● 热传导(使用一维纹理) #include <stdio.h> #include "c ...
- 《GPU高性能编程CUDA实战》第三章 CUDA设备相关
▶ 这章介绍了与CUDA设备相关的参数,并给出了了若干用于查询参数的函数. ● 代码(已合并) #include <stdio.h> #include "cuda_runtime ...
- 《GPU高性能编程CUDA实战》附录四 其他头文件
▶ cpu_bitmap.h #ifndef __CPU_BITMAP_H__ #define __CPU_BITMAP_H__ #include "gl_helper.h" st ...
随机推荐
- page 分页
<!DOCTYPE html><html> <head> <meta charset="UTF-8"> <title>& ...
- test20181016 B君的第二题
题意 分析 考场暴力50分. 考虑bfs序,一个点的儿子节点的bfs序一定连续,所以对bfs序建线段树,努力打一下就行了. 时间复杂度\(O(n \log n + m \log n)\) #inclu ...
- benthos 通过rest api 配置 stream 说明
stream 模式,我们可以通过rest api 进行控制 使用方法 启动 benthos --streams 进行流的配置(rest api) curl http://localhost:4195/ ...
- base64编码的原理及实现
base64编码的原理及实现 我们的图片大部分都是可以转换成base64编码的data:image. 这个在将canvas保存为img的时候尤其有用.虽然除ie外,大部分现代浏览器都已经支持原生的基于 ...
- C# ObjectArx AutoCAD二次开发(转帖)
http://www.cnblogs.com/houlinbo/p/3325898.html 1.开发基本资料准备 用Vs2010进行Autocad 2010开发,首先下载ObjectArx 2010 ...
- jmeter测试TCP服务器/模拟发送TCP请求
jmeter测试TCP服务器,使用TCP采样器模拟发送TCP请求. TCP采样器:打开一个到指定服务器的TCP / IP连接,然后发送指定文本并等待响应. jmeter模拟发送TCP请求的方法: 1. ...
- MySql检测阻塞,锁等待sql
------------ 1分钟内产生binlog大小计算 select @a1:=VARIABLE_VALUE as a1from information_schema.GLOBAL_STATUSw ...
- ASP.NET 实现伪静态网页方法
方法一:利用Httphandler实现URL重写(伪URL及伪静态) 我们有时候会见到这样的地址:“http://www.huoho.com/show-12-34.html”,你或许认为在站点服务器根 ...
- QWidget背景(透明)问题
一. 背景設定為成黑色,前景色設為白色. 方法一.paltette方式,經測試,該方法不會影響到其他控制元件,建議使用 QPalette bgpal = palette();bgpal.setColo ...
- 安装Linux软件时没有图形界面的问题
Q: 现在Linux下的软件有很多也采用图形界面安装了:但有时候我们发现启动安装程序时本该出现图形界面时却出现了文本界面. A: 检查常用的图形函数库(最主要的自然是gtk, qt)是否已安装.尤其是 ...