The inefficient version

In this listing, all of the work for each chunk is enqueued depth-first: both host-to-device copies, the kernel launch, and the device-to-host copy go into stream0 before anything is enqueued into stream1. Because the copy engine and the kernel engine each drain their queues in the order the work was issued, this ordering leaves little room for one stream's copies to overlap with the other stream's kernel, so the two streams gain little over a single stream. (An interleaved ordering is sketched after the listing.)

/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */

#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaStream_t stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    // start the timers
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // initialize the streams
    HANDLE_ERROR(cudaStreamCreate(&stream0));
    HANDLE_ERROR(cudaStreamCreate(&stream1));

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_a1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c1, N * sizeof(int)));

    // allocate host locked memory, used to stream
    HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // now loop over full data, in bite-sized chunks
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // copy the locked memory to the device, async
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream0));

        kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);

        // copy the data from device to locked memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int),
                                     cudaMemcpyDeviceToHost, stream0));

        // copy the locked memory to the device, async
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream1));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream1));

        kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);

        // copy the data from device to locked memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int),
                                     cudaMemcpyDeviceToHost, stream1));
    }
    HANDLE_ERROR(cudaStreamSynchronize(stream0));
    HANDLE_ERROR(cudaStreamSynchronize(stream1));

    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time taken: %3.1f ms\n", elapsedTime);

    // cleanup the streams and memory
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a0));
    HANDLE_ERROR(cudaFree(dev_b0));
    HANDLE_ERROR(cudaFree(dev_c0));
    HANDLE_ERROR(cudaFree(dev_a1));
    HANDLE_ERROR(cudaFree(dev_b1));
    HANDLE_ERROR(cudaFree(dev_c1));
    HANDLE_ERROR(cudaStreamDestroy(stream0));
    HANDLE_ERROR(cudaStreamDestroy(stream1));

    return 0;
}
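
For comparison, below is a minimal sketch of the interleaved, breadth-first enqueue order that the corrected version of this example uses. It reuses the variables, macros, and kernel from the listing above and is only the inner loop, not a complete program; the point is that each engine's queue now alternates between the two streams, so a copy in one stream can overlap a kernel in the other.

    // Sketch only: the same loop body, with work enqueued breadth-first
    // across stream0 and stream1 instead of finishing one stream's chunk
    // before starting the other's.
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // enqueue the copies of a into both streams
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream1));
        // enqueue the copies of b into both streams
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream1));
        // enqueue the kernels into both streams
        kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
        // enqueue the copies of c back to pinned host memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int),
                                     cudaMemcpyDeviceToHost, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int),
                                     cudaMemcpyDeviceToHost, stream1));
    }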

Code download
