basic_double_stream_incorrect
不合理的代码
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/ #include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N (1024*1024)
#define FULL_DATA_SIZE (N*20) __global__ void kernel(int *a, int *b, int *c) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
int idx1 = (idx + ) % ;
int idx2 = (idx + ) % ;
float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
c[idx] = (as + bs) / ;
}
} int main(void) {
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR(cudaGetDevice(&whichDevice));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
if (!prop.deviceOverlap) {
printf("Device will not handle overlaps, so no speed up from streams\n");
return ;
} cudaEvent_t start, stop;
float elapsedTime; cudaStream_t stream0, stream1;
int *host_a, *host_b, *host_c;
int *dev_a0, *dev_b0, *dev_c0;
int *dev_a1, *dev_b1, *dev_c1; // start the timers
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop)); // initialize the streams
HANDLE_ERROR(cudaStreamCreate(&stream0));
HANDLE_ERROR(cudaStreamCreate(&stream1)); // allocate the memory on the GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_a0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c0,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_a1,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b1,
N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c1,
N * sizeof(int))); // allocate host locked memory, used to stream
HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
FULL_DATA_SIZE * sizeof(int),
cudaHostAllocDefault)); for (int i = ; i<FULL_DATA_SIZE; i++) {
host_a[i] = rand();
host_b[i] = rand();
} HANDLE_ERROR(cudaEventRecord(start, ));
// now loop over full data, in bite-sized chunks
for (int i = ; i<FULL_DATA_SIZE; i += N * ) {
// copy the locked memory to the device, async
HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream0)); kernel << <N / , , , stream0 >> >(dev_a0, dev_b0, dev_c0); // copy the data from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream0)); // copy the locked memory to the device, async
HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1));
HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1)); kernel << <N / , , , stream1 >> >(dev_a1, dev_b1, dev_c1); // copy the data from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream1));
}
HANDLE_ERROR(cudaStreamSynchronize(stream0));
HANDLE_ERROR(cudaStreamSynchronize(stream1)); HANDLE_ERROR(cudaEventRecord(stop, )); HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
start, stop));
printf("Time taken: %3.1f ms\n", elapsedTime); // cleanup the streams and memory
HANDLE_ERROR(cudaFreeHost(host_a));
HANDLE_ERROR(cudaFreeHost(host_b));
HANDLE_ERROR(cudaFreeHost(host_c));
HANDLE_ERROR(cudaFree(dev_a0));
HANDLE_ERROR(cudaFree(dev_b0));
HANDLE_ERROR(cudaFree(dev_c0));
HANDLE_ERROR(cudaFree(dev_a1));
HANDLE_ERROR(cudaFree(dev_b1));
HANDLE_ERROR(cudaFree(dev_c1));
HANDLE_ERROR(cudaStreamDestroy(stream0));
HANDLE_ERROR(cudaStreamDestroy(stream1)); return ;
}
代码下载
basic_double_stream_incorrect的更多相关文章
随机推荐
- Node.js 内置模块crypto使用事件方法(onreadable)加密的一些问题
javaScript代码如下: 'use strict'; const crypto = require('crypto'); //实例化一个AES加密对象 const aesEncrept = cr ...
- 洛谷P2911 [USACO08OCT]牛骨头Bovine Bones【水题】
题目大意:输入S1,S2,S3,随机生成三个数x,y,z,求x+y+z出现次数最多的数(如果有多个答案输出最小的),其中1<=x<=S1,1<=y<=S2,1<=z< ...
- MySQL的复制:MySQL系列之十三
一.MySQL复制相关概念 主从复制:主节点将数据同步到多个从节点 级联复制:主节点将数据同步到一个从节点,其他的从节点在向从节点复制数据 同步复制:将数据从主节点全部同步到从节点时才返回给用户的复制 ...
- 洛谷P4114 Qtree1
题目描述 给定一棵\(n\)个节点的树,有两个操作: \(CHANGE\) \(i\) \(t_i\) 把第\(i\)条边的边权变成\(t_i\) \(QUERY\) \(a\) \(b\) 输出从\ ...
- POJ1040 Transportation
题目来源:http://poj.org/problem?id=1040 题目大意: 某运输公司要做一个测试.从A城市到B城市的一条运输线路中有若干个站,将所有站包括A和B在内按顺序编号为0到m.该路线 ...
- Leetcode题解
前言 Leetcode现在弄了一个Weekly Contest,然后题目又会作为新题目:感觉如果现在还不及时刷题的话可能真的赶不上它题目增长的速度了.......题目会在博客和Github上同步更新的 ...
- 关于强大的requests
存到文件: with open(filename, 'wb') as fd: for chunk in r.iter_content(chunk_size): fd.write(chunk) 使用 R ...
- 四则运算 calc()
它的出现还真的蛮令人惊喜的,很适用于百分比宽度.之前我们有box-sizng,而今又多了一个它,并且,calc的实用性更高.我们可以在border.margin.pading.font-size和wi ...
- js dairy
//留言js逻辑 $(document).ready( function() { $("#post_btn").click( function() { var comment = ...
- Spark Mllib里的分层抽样(使用map作为分层抽样的数据标记)
不多说,直接上干货! 具体,见 Spark Mllib机器学习实战的第4章 Mllib基本数据类型和Mllib数理统计