basic_double_stream_incorrect
不合理的代码
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/ #include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N (1024*1024)
#define FULL_DATA_SIZE (N*20) __global__ void kernel(int *a, int *b, int *c) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
int idx1 = (idx + ) % ;
int idx2 = (idx + ) % ;
float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
c[idx] = (as + bs) / ;
}
} int main(void) {
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR(cudaGetDevice(&whichDevice));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
if (!prop.deviceOverlap) {
printf("Device will not handle overlaps, so no speed up from streams\n");
return ;
} cudaEvent_t start, stop;
float elapsedTime; cudaStream_t stream0, stream1;
int *host_a, *host_b, *host_c;
int *dev_a0, *dev_b0, *dev_c0;
int *dev_a1, *dev_b1, *dev_c1; // start the timers
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop)); // initialize the streams
HANDLE_ERROR(cudaStreamCreate(&stream0));
HANDLE_ERROR(cudaStreamCreate(&stream1)); // allocate the memory on the GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_a0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_a1,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b1,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c1,
N * sizeof(int))); // allocate host locked memory, used to stream
HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault)); for (int i = ; i<FULL_DATA_SIZE; i++) {
host_a[i] = rand();
host_b[i] = rand();
} HANDLE_ERROR(cudaEventRecord(start, ));
// now loop over full data, in bite-sized chunks
for (int i = ; i<FULL_DATA_SIZE; i += N * ) {
// copy the locked memory to the device, async
HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream0)); kernel << <N / , , , stream0 >> >(dev_a0, dev_b0, dev_c0); // copy the data from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream0)); // copy the locked memory to the device, async
HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1));
HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1)); kernel << <N / , , , stream1 >> >(dev_a1, dev_b1, dev_c1); // copy the data from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream1));
}
HANDLE_ERROR(cudaStreamSynchronize(stream0));
HANDLE_ERROR(cudaStreamSynchronize(stream1)); HANDLE_ERROR(cudaEventRecord(stop, )); HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
start, stop));
printf("Time taken: %3.1f ms\n", elapsedTime); // cleanup the streams and memory
HANDLE_ERROR(cudaFreeHost(host_a));
HANDLE_ERROR(cudaFreeHost(host_b));
HANDLE_ERROR(cudaFreeHost(host_c));
HANDLE_ERROR(cudaFree(dev_a0));
HANDLE_ERROR(cudaFree(dev_b0));
HANDLE_ERROR(cudaFree(dev_c0));
HANDLE_ERROR(cudaFree(dev_a1));
HANDLE_ERROR(cudaFree(dev_b1));
HANDLE_ERROR(cudaFree(dev_c1));
HANDLE_ERROR(cudaStreamDestroy(stream0));
HANDLE_ERROR(cudaStreamDestroy(stream1)); return ;
}
代码下载
basic_double_stream_incorrect的更多相关文章
随机推荐
- Java 引用类型
若内存中一个对象没有任何引用的话,则可以认为该对象已经不再使用了,可以成为GC的候选.不过由于垃圾回收器的运行时间不确定,可被垃圾回收的对象的实际被回收时间是不确定的.对于一个对象来说,只要有引用的存 ...
- h5自定义播放器得实现原理
<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8&quo ...
- 消息中间件的研究 (四)RabbitMQ、Kafka、RocketMQ消息中间件的对比及分析
RabbitMQ: RabbitMQ是使用Erlang语言开发的开源消息队列系统,基于AMQP协议来实现.AMQP的主要特征是面向消息.队列.路由(包括点对点和发布/订阅).可靠性.安全.AM ...
- STP-17-对抗单向链路问题
单向链路问题是指链路上的两条传输路径中,有一条出现了问题,但并不是两条同时出现问题.这可能是因为线缆错误.切断了一条光纤线缆.拔掉了一根管线.GBIC问题,或其他问题.因为STP会监控入向BPDU,以 ...
- Eclipse进行Java web开发时,可能会出现这样的错误:The superclass javax.servlet.http.HttpServlet was not found on the Java Build Path
我们遇到的错误显示如下: 我们右击有错误提示的文件夹,如下: 我们点击”配置构建路径“,如下: 我们再点击”添加库“,如下: 我们选中上图中标出的选项,再点击下一步,如下: 我们再 ...
- POJ1845 Sumdiv 数学?逆元?
当初写过一篇分治的 题意:求A^B的所有因子之和,并对其取模 9901再输出 对于数A=p1^c1+p2^c2+...+pn*cn,它的所有约数之和为(1+p1+p1^2+p1^3+...+p1^(c ...
- scala数据类型
# Scala数据类型 ## 1.数值类型 ### 1.1 与Java一样Scala也有8种数值类型 * Byte * Char * Short * Int * Long * Float * Doub ...
- sql数据库发布、订阅同步方式操作
Sql数据库发布订阅分为两个步骤:1.发布.2.订阅.首先在数据源数据库服务器上对需要同步的数据进行发布,然后在目标数据库服务器上对上述发布进行订阅. 一.发布. 发布需要用实际的服务器名称,不能使用 ...
- 如何创建width与height比例固定的元素
面试题,刚在github上看到的,说说这里面的知识点吧~~ padding-bottom的值,其百分比是根据元素自身的width来算的. padding,在标准盒模型中,width+padding+b ...
- 1160: sundari && Shortest path HDU - 4479
http://gdutcode.sinaapp.com/problem.php?id=1160 http://acm.hdu.edu.cn/showproblem.php?pid=4479 35 51 ...