▶ OpenGL与DirectX,等待填坑。

● basic_interop

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_bitmap.h" PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL; #define DIM 512 GLuint bufferObj;
cudaGraphicsResource *resource; // based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png
__global__ void kernel(uchar4 *ptr)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; // now calculate the value at that position
float fx = x / (float)DIM - 0.5f;
float fy = y / (float)DIM - 0.5f;
unsigned char green = + * sin(abs(fx * ) - abs(fy * ); // accessing uchar4 vs unsigned char*
ptr[offset].x = ;
ptr[offset].y = green;
ptr[offset].z = ;
ptr[offset].w = ;
} static void key_func(unsigned char key, int x, int y)
{
switch (key)
{
case :
// clean up OpenGL and CUDA
cudaGraphicsUnregisterResource(resource);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, );
glDeleteBuffers(, &bufferObj);
exit();
}
} static void draw_func(void)
{
// we pass zero as the last parameter, because out bufferObj is now
// the source, and the field switches from being a pointer to a
// bitmap to now mean an offset into a bitmap object
glDrawPixels(DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, );
glutSwapBuffers();
} int main(int argc, char **argv)
{
cudaDeviceProp prop;
int dev; memset(&prop, , sizeof(cudaDeviceProp));
prop.major = ;
prop.minor = ;
cudaChooseDevice(&dev, &prop); // tell CUDA which dev we will be using for graphic interop
// from the programming guide: Interoperability with OpenGL
// requires that the CUDA device be specified by
// cudaGLSetGLDevice() before any other runtime calls. cudaGLSetGLDevice(dev); // these GLUT calls need to be made before the other OpenGL
// calls, else we get a seg fault
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
glutInitWindowSize(DIM, DIM);
glutCreateWindow("bitmap");//初始化并创建一个窗口 //创建缓冲区
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData"); // the first three are standard OpenGL, the 4th is the CUDA reg
// of the bitmap these calls exist starting in OpenGL 1.5
glGenBuffers(, &bufferObj);// 将bufferObj注册为图形资源
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * , NULL, GL_DYNAMIC_DRAW_ARB); cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone); // do work with the memory dst being on the GPU, gotten via mapping
cudaGraphicsMapResources(, &resource, NULL);
uchar4* devPtr;
size_t size;
cudaGraphicsResourceGetMappedPointer((void**)&devPtr,&size,resource); dim3 grids(DIM / , DIM / );
dim3 threads(, );
kernel << <grids, threads >> >(devPtr);
cudaGraphicsUnmapResources(, &resource, NULL); // set up GLUT and kick off main loop
glutKeyboardFunc(key_func);
glutDisplayFunc(draw_func);
glutMainLoop(); getchar();
return;
}

● ripple

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h" #define DIM 1024 __global__ void kernel(uchar4 *ptr, int ticks)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; // now calculate the value at that position
float fx = x - DIM / ;
float fy = y - DIM / ;
float d = sqrtf(fx * fx + fy * fy);
unsigned char grey = (unsigned char)(128.0f + 127.0f *cos(d / 10.0f - ticks / 7.0f) / (d / 10.0f + 1.0f));
ptr[offset].x = grey;
ptr[offset].y = grey;
ptr[offset].z = grey;
ptr[offset].w = ;
} void generate_frame(uchar4 *pixels, void*, int ticks)
{
dim3 grids(DIM / , DIM / );
dim3 threads(, );
kernel << <grids, threads >> >(pixels, ticks);
} int main(void)
{
GPUAnimBitmap bitmap(DIM, DIM, NULL); bitmap.anim_and_exit((void(*)(uchar4*, void*, int))generate_frame, NULL); getchar();
return;
}

● heat

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h" #define DIM 1024
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f // these exist on the GPU side
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut; // this kernel takes in a 2-d array of floats
// it updates the value-of-interest by a scaled value based
// on itself and its nearest neighbors
__global__ void blend_kernel(float *dst,bool dstOut)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; int left = offset - ;
int right = offset + ;
if (x == ) left++;
if (x == DIM - ) right--; int top = offset - DIM;
int bottom = offset + DIM;
if (y == ) top += DIM;
if (y == DIM - ) bottom -= DIM; float t, l, c, r, b;
if (dstOut) {
t = tex1Dfetch(texIn, top);
l = tex1Dfetch(texIn, left);
c = tex1Dfetch(texIn, offset);
r = tex1Dfetch(texIn, right);
b = tex1Dfetch(texIn, bottom); }
else {
t = tex1Dfetch(texOut, top);
l = tex1Dfetch(texOut, left);
c = tex1Dfetch(texOut, offset);
r = tex1Dfetch(texOut, right);
b = tex1Dfetch(texOut, bottom);
}
dst[offset] = c + SPEED * (t + b + r + l - * c);
} // NOTE - texOffsetConstSrc could either be passed as a
// parameter to this function, or passed in __constant__ memory
// if we declared it as a global above, it would be
// a parameter here:
// __global__ void copy_const_kernel( float *iptr,
// size_t texOffset )
__global__ void copy_const_kernel(float *iptr)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; float c = tex1Dfetch(texConstSrc, offset);
if (c != )
iptr[offset] = c;
} // globals needed by the update routine
struct DataBlock
{
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
cudaEvent_t start, stop;
float totalTime;
float frames;
}; void anim_gpu(uchar4* outputBitmap, DataBlock *d, int ticks)
{
cudaEventRecord(d->start, );
dim3 blocks(DIM / , DIM / );
dim3 threads(, ); // since tex is global and bound, we have to use a flag to
// select which is in/out per iteration
volatile bool dstOut = true;
for (int i = ; i<; i++)
{
float *in, *out;
if (dstOut)
{
in = d->dev_inSrc;
out = d->dev_outSrc;
}
else
{
out = d->dev_inSrc;
in = d->dev_outSrc;
}
copy_const_kernel << <blocks, threads >> >(in);
blend_kernel << <blocks, threads >> >(out, dstOut);
dstOut = !dstOut;
}
float_to_color << <blocks, threads >> >(outputBitmap,d->dev_inSrc); cudaEventRecord(d->stop, );
cudaEventSynchronize(d->stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,d->start, d->stop);
d->totalTime += elapsedTime;
++d->frames;
printf("Average Time per frame: %3.1f ms\n",d->totalTime / d->frames);
} // clean up memory allocated on the GPU
void anim_exit(DataBlock *d)
{
cudaUnbindTexture(texIn);
cudaUnbindTexture(texOut);
cudaUnbindTexture(texConstSrc);
cudaFree(d->dev_inSrc);
cudaFree(d->dev_outSrc);
cudaFree(d->dev_constSrc);
cudaEventDestroy(d->start);
cudaEventDestroy(d->stop);
} int main(void)
{
DataBlock data;
GPUAnimBitmap bitmap(DIM, DIM, &data);
data.totalTime = ;
data.frames = ;
cudaEventCreate(&data.start);
cudaEventCreate(&data.stop); int imageSize = bitmap.image_size(); cudaMalloc((void**)&data.dev_inSrc, imageSize);
cudaMalloc((void**)&data.dev_outSrc, imageSize);
cudaMalloc((void**)&data.dev_constSrc, imageSize);
cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize);
cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize);
cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize); float *temp = (float*)malloc(imageSize);
for (int i = ; i < DIM*DIM; i++)// 恒温格点数据
{
temp[i] = ;
int x = i % DIM;
int y = i / DIM;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MAX_TEMP;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MIN_TEMP;
}
cudaMemcpy(data.dev_constSrc, temp,imageSize,cudaMemcpyHostToDevice); for (int i = ; i < DIM*DIM; i++)// 初始温度场数据
{
temp[i] = 0.5;
int x = i % DIM;
int y = i / DIM;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MAX_TEMP;
}
cudaMemcpy(data.dev_inSrc, temp,imageSize,cudaMemcpyHostToDevice);
free(temp); bitmap.anim_and_exit((void(*)(uchar4*, void*, int))anim_gpu,(void(*)(void*))anim_exit);
getchar();
return;
}

《GPU高性能编程CUDA实战》第八章 图形互操作性的更多相关文章

  1. [问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应,并且已恢复”问题的解决方法

    以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...

  2. 《GPU高性能编程CUDA实战》第四章 简单的线程块并行

    ▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...

  3. 《GPU高性能编程CUDA实战》第十一章 多GPU系统的CUDA C

    ▶ 本章介绍了多设备胸膛下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...

  4. 《GPU高性能编程CUDA实战》第六章 常量内存

    ▶ 本章介绍了常量内存的使用,并给光线追踪的一个例子.介绍了结构cudaEvent_t及其在计时方面的使用. ● 章节代码,大意是有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝 ...

  5. 《GPU高性能编程CUDA实战》第五章 线程并行

    ▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...

  6. 《GPU高性能编程CUDA实战》附录二 散列表

    ▶ 使用CPU和GPU分别实现散列表 ● CPU方法 #include <stdio.h> #include <time.h> #include "cuda_runt ...

  7. 《GPU高性能编程CUDA实战》第七章 纹理内存

    ▶ 本章介绍了纹理内存的使用,并给出了热传导的两个个例子.分别使用了一维和二维纹理单元. ● 热传导(使用一维纹理) #include <stdio.h> #include "c ...

  8. 《GPU高性能编程CUDA实战》第三章 CUDA设备相关

    ▶ 这章介绍了与CUDA设备相关的参数,并给出了了若干用于查询参数的函数. ● 代码(已合并) #include <stdio.h> #include "cuda_runtime ...

  9. 《GPU高性能编程CUDA实战》附录四 其他头文件

    ▶ cpu_bitmap.h #ifndef __CPU_BITMAP_H__ #define __CPU_BITMAP_H__ #include "gl_helper.h" st ...

随机推荐

  1. 斐波那契数列的5种python实现写法

    斐波那契数列的5种python写法       斐波那契数列(Fibonacci sequence),又称黄金分割数列.因数学家列昂纳多·斐波那契(Leonardoda Fibonacci)以兔子繁殖 ...

  2. (4)logging(日志模块)

    日志分成几个常用的级别 debug 10 代表程序调试过程中的信息 info 20 代表普通日志信息,用户的访问等等 warning 30 警告日志,有可能出错,但是目前还没出错的 error 40 ...

  3. Hadoop storm大数据分析 知识体系结构

    最近工作工作有用到hadoop 和storm,最近看到一个网站上例句的hadoop 和storm的知识体系.所以列出来供大家了解和学习.来自哪个网站就不写了以免以为我做广告额. 目录结构知识点还是挺全 ...

  4. cin和cout详解

    无论输入数字还是字符串,一个回车键是把输入的这个东西送到变量中,可以一次性送到 一个(或者多个)空格键是分隔这些值的 cout <<N; for(int i=0;i<5;i++) { ...

  5. Understanding Safari Reader

    Interesting enough to find out the Reader function in Safari is actually Javascript and there are ma ...

  6. oracle12c之 控制pdb中sga 与 pga 内存使用

    Memory Management using Resource Manager Oracle数据库资源管理器(资源管理器)现在可以在多租户容器数据库(CDB)中管理可插入数据库(PDBs)之间的内存 ...

  7. JUC锁之 框架

    根据锁的添加到Java中的时间,Java中的锁,可以分为"同步锁"和"JUC包中的锁". 同步锁 即通过synchronized关键字来进行同步,实现对竞争资源 ...

  8. JUC线程池之 线程池拒绝策略

    拒绝策略介绍 线程池的拒绝策略,是指当任务添加到线程池中被拒绝,而采取的处理措施. 当任务添加到线程池中之所以被拒绝,可能是由于:第一,线程池异常关闭.第二,任务数量超过线程池的最大限制. 线程池共包 ...

  9. 无人驾驶之激光雷达&摄像头(主要from 速腾CEO 邱纯鑫分享)

    无人驾驶之激光雷达&摄像头 (from 速腾CEO 邱纯鑫公开课分享) 根据听的一些讲座和看的书籍,个人感觉:目前现在的自动驾驶,根本问题还是在于感知(路况,周边物体,交通标识等等),控制的方 ...

  10. XFire构建web service客户端的五种方式

    这里并未涉及到JSR 181 Annotations 的相关应用,具体的三种方式如下 ① 通过WSDL地址来创建动态客户端 ② 通过服务端提供的接口来创建客户端 ③ 使用Ant通过WSDL文件来生成客 ...