cudaThreadSynchronize()

// 调用CUDA kernel 是非阻塞的，调用kernel语句后面的语句不等待kernel执行完，立即执行。所以在 call_kernel（see kernel.cu）中执行 m5op.dump 是错误的！！！

// REF: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Measuring_kernel_runtime

// cudaThreadSynchronize() 暂停调用者的执行，直到前面的 stream operation 执行完毕。（注意：该函数已被弃用，应改用语义相同的 cudaDeviceSynchronize()。）

// REF: https://stackoverflow.com/questions/13485018/cudastreamsynchronize-vs-cudadevicesynchronize-vs-cudathreadsynchronize

// C++ thread join 问题，在 kernel.cpp 中也有 join，那么是在 kernel.cpp 中 dump 还是在main.cpp中join后面dump?

// REF: http://en.cppreference.com/w/cpp/algorithm/for_each

// 若 GPU 先执行完毕，在 main.cpp 中join后 dump 似乎合理；若 CPU 先执行完毕，岂不是要阻塞在 cudaThreadSynchronize 处？

// 暂且在 kernel.cpp 中 dump！

kernel.cpp

// CPU threads--------------------------------------------------------------------------------------

// Runs the CPU share of the work on `n_threads` host threads, waits for all of
// them to finish, and then dumps statistics via m5_dump_stats.
//
// Each task `my_s` handles REGS * ldim consecutive elements, walking backwards
// from the end of the (aligned) output. Elements are read from `matrix`, whose
// rows are `n` elements wide, and written to `matrix_out`, whose rows are
// `n + pad` elements wide; positions that fall in the pad columns (or outside
// the matrix) are filled with zero.
//
// Parameters:
//   matrix_out - destination buffer, indexed with row width (n + pad).
//   matrix     - source buffer, indexed with row width n.
//   flags      - per-task flags used to order tasks: task my_s spins until
//                flags[my_s] is non-zero, then increments flags[my_s + 1].
//                NOTE(review): presumably the caller pre-sets the first flag
//                and sizes the array for n_tasks + 1 entries - confirm.
//   n, m, pad  - source row width, number of rows, and extra columns per row.
//   n_threads  - number of CPU worker threads to spawn.
//   ldim       - multiplied by REGS to give the per-task element count.
//   n_tasks, alpha, worklist - forwarded unchanged to partitioner_create.
//
// NOTE: the numeric literals below (0 / 1) were missing from the published
// snippet and have been restored.
void run_cpu_threads(T *matrix_out, T *matrix, std::atomic_int *flags, int n, int m, int pad, int n_threads, int ldim, int n_tasks, float alpha
#ifdef CUDA_8_0
    , std::atomic_int *worklist
#endif
    ) {
    std::cout<<"run_cpu_threads start."<<std::endl;

    const int                REGS_CPU = REGS * ldim;
    std::vector<std::thread> cpu_threads;

    for(int i = 0; i < n_threads; i++) {
        cpu_threads.push_back(std::thread([=]() {

#ifdef CUDA_8_0
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads, worklist);
#else
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads);
#endif

            const int matrix_size       = m * (n + pad);
            // Round matrix_size up to a multiple of the per-task element count.
            const int matrix_size_align = (matrix_size + ldim * REGS - 1) / (ldim * REGS) * (ldim * REGS);

            for(int my_s = cpu_first(&p); cpu_more(&p); my_s = cpu_next(&p)) {

                // Per-task staging buffer. REGS_CPU depends on the runtime
                // value of ldim, so this is a variable-length array
                // (a compiler extension in C++).
                T   reg[REGS_CPU];
                // Last output position owned by this task.
                int pos      = matrix_size_align - 1 - (my_s * REGS_CPU);
                int my_s_row = pos / (n + pad);
                int my_x     = pos % (n + pad);
                int pos2     = my_s_row * n + my_x; // matching index in the unpadded source

// Load this task's elements into the staging buffer.
#pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos2 >= 0 && my_x < n && pos2 < matrix_size)
                        reg[j] = matrix[pos2];
                    else
                        reg[j] = 0; // pad column or out-of-range position
                    pos--;
                    my_s_row = pos / (n + pad);
                    my_x     = pos % (n + pad);
                    pos2     = my_s_row * n + my_x;
                }

                // Global synchronization: busy-wait until this task's flag is
                // set, then signal the next task.
                while(flags[my_s].load() == 0) {
                }
                flags[my_s + 1].fetch_add(1);

                // Store the staged elements to the output buffer.
                pos = matrix_size_align - 1 - (my_s * REGS_CPU);
#pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos >= 0 && pos < matrix_size)
                        matrix_out[pos] = reg[j];
                    pos--;
                }
            }
        }));
    }

    // Block until every worker thread has finished.
    std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); });

    std::cout<<"dump.. after run_cpu_threads end."<<std::endl;
    // Dump statistics once, now that all CPU workers have joined.
    m5_dump_stats(0, 0);
}

kernel.cu

// Host wrapper that launches Padding_kernel on a 1-D grid of `blocks` blocks
// with `threads` threads each and returns the launch status.
//
// Parameters:
//   blocks, threads - launch configuration (grid and block size).
//   n, m, pad, n_tasks, alpha, matrix_out, matrix, flags, worklist
//                   - forwarded unchanged to Padding_kernel.
//   l_mem_size      - dynamic shared-memory size in bytes for the launch
//                     (only when CUDA_8_0 is defined).
//
// Returns the result of cudaGetLastError() taken right after the launch. This
// reports launch errors only; faults raised while the kernel executes surface
// at a later synchronizing call.
//
// NOTE: a kernel launch is asynchronous, so the m5_dump_stats call below runs
// as soon as the launch has been issued, not when the kernel has finished.
// The statistics dumped here therefore do not cover the kernel's execution.
// No device synchronization is done here; the caller is expected to
// synchronize after it has started its CPU threads.
//
// NOTE: the arguments of m5_dump_stats were missing from the published snippet
// and have been restored as (0, 0).
cudaError_t call_Padding_kernel(int blocks, int threads, int n, int m, int pad, int n_tasks, float alpha,
    T *matrix_out, T *matrix, int *flags
#ifdef CUDA_8_0
    , int l_mem_size, int *worklist
#endif
    ){
    std::cout<<"call_pad start."<<std::endl;

    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    Padding_kernel<<<dimGrid, dimBlock
#ifdef CUDA_8_0
        , l_mem_size
#endif
        >>>(n, m, pad, n_tasks, alpha,
        matrix_out, matrix, flags
#ifdef CUDA_8_0
        , worklist
#endif
        );
    // Picks up launch-configuration errors; does not wait for the kernel.
    cudaError_t err = cudaGetLastError();

    std::cout<<"dump.. after call_pad end."<<std::endl;
    m5_dump_stats(0, 0);

    return err;
}

main.cpp

// Repetition loop: p.n_warmup warm-up iterations followed by p.n_reps
// iterations. Each iteration resets the histogram, launches the GPU kernel
// (asynchronously), starts the CPU threads, and then waits for both sides.
//
// NOTE: the numeric literals (0) were missing from the published snippet and
// have been restored.
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

    // Reset the histogram bins to zero.
#ifdef CUDA_8_0
    for(int i = 0; i < p.n_bins; i++) {
        h_histo[i].store(0);
    }
#else
    memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
    cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // documented replacement.
    cudaThreadSynchronize();
    CUDA_ERR();
#endif

    std::cout<<"m5 work begin."<<std::endl;

    // Launch GPU threads. The kernel launch does not block, so execution
    // continues to the CPU launch below while the kernel runs.
    if(p.n_gpu_blocks > 0) {
        std::cout<<"launch gpu."<<std::endl;
        cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins, n_cpu_bins,
            d_in, (unsigned int*)d_histo, p.n_bins * sizeof(unsigned int));
        CUDA_ERR();
    }

    // Launch CPU threads on a separate host thread so that this thread is
    // free to wait for the GPU.
    std::cout<<"launch cpu."<<std::endl;
    std::thread main_thread(run_cpu_threads, (unsigned int *)h_histo, h_in, p.in_size, p.n_bins, p.n_threads,
        p.n_gpu_threads, n_cpu_bins);

    // Wait for the GPU work first, then for the CPU threads. If the CPU side
    // finishes first, it simply stays joinable until the join below.
    std::cout<<"cuda sync."<<std::endl;
    cudaThreadSynchronize();

    std::cout<<"cpu join after cuda sync."<<std::endl;
    main_thread.join();

    //m5_work_end(0, 0);
    std::cout<<"m5 work end."<<std::endl;
}

cudaThreadSynchronize()的更多相关文章

cuda多线程间通信
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <std ...
cuda并行计算的几种模式
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <std ...
【OpenCV & CUDA】OpenCV和Cuda结合编程
一.利用OpenCV中提供的GPU模块目前,OpenCV中已提供了许多GPU函数,直接使用OpenCV提供的GPU模块,可以完成大部分图像处理的加速操作. 基本使用方法,请参考:http://www ...
CUDA入门1
1GPUs can handle thousands of concurrent threads. 2The pieces of code running on the gpu are calle ...
win7(X64)系统下cuda7.5和VS2013的配置
&1 安装 cuda7.5文件:链接:http://pan.baidu.com/s/1bU2zIQ 密码:nvyw &2 环境变量注意:CUDA_PATH是安装好cuda7.5之后会 ...
使用 CUBLAS 库给矩阵运算提速
前言编写 CUDA 程序真心不是个简单的事儿,调试也不方便,很费时.那么有没有一些现成的 CUDA 库来调用呢? 答案是有的,如 CUBLAS 就是 CUDA 专门用来解决线性代数运算的库. 本文将 ...
CUDA编程
目录: 1.什么是CUDA 2.为什么要用到CUDA 3.CUDA环境搭建 4.第一个CUDA程序 5. CUDA编程 5.1. 基本概念 5.2. 线程层次结构 5.3. 存储器层次结构 5.4. ...
CUDA从入门到精通
http://blog.csdn.net/augusdi/article/details/12833235 CUDA从入门到精通(零):写在前面在老板的要求下.本博主从2012年上高性能计算课程開始 ...
CUDA编程－（2）其实写个矩阵相乘并不是那么难
程序代码及图解析: #include <iostream> #include "book.h" __global__ void add( int a, int b, i ...

随机推荐

多年珍藏的55w御剑字典
御剑珍藏55w目录字典,很给力,放在以前直接数据库都能给跑出来. 用法:直接把放入配置文件的目录链接:https://pan.baidu.com/s/1MGxdd9hH006Y7AO7CpkO8g ...
Codeforces 598E：Chocolate Bar
E. Chocolate Bar time limit per test 2 seconds memory limit per test 256 megabytes input standard in ...
struts2--验证器
1.输入验证: --struts2提供了一些基于Xwork Validation Framework的内建验证程序,使用这些验证程序不需要变编程,只要在一个XML文件里进行声明,声明的内容如下: &g ...
Python—处理Excel表格
一.使用xlrd和xlwt这两个库来处理excel,即xlrd是读excel的库,xlwt是写excel的库 1.使用 xlrd 读取Excel数据 # -*- coding:utf-8 -*- im ...
IOS pin约束问题存在间隙
今天在为自己的view添加约束对比以前添加的约束时,发现有有两层淡红色线框一条实线和一条虚线,而以前一个demo中添加的则只有一个蓝色实线框. 今天添加的约束如图1所示: 图1 而以前添加约束如图2 ...
在ListView头和尾添加东西
直接上代码 import android.support.v7.app.AppCompatActivity; import android.os.Bundle; import android.view ...
fuseki远程访问方法
./fuseki-server启动服务后,我们的服务只能是localhost访问,无法被其他人访问,那么要怎么修改呢.很简单,把apche-jena-fuseki-3.10.0/run 下面的shi ...
sftp 多用户安装与配置
sftp 是 Secure File Transfer Protocol 的缩写,安全文件传送协议.可以为传输文件提供一种安全的加密方法.SFTP 为 SSH 的一部分,由于这种传输方式使用了加密/解 ...
springboot官网->application.properties文件
springboot application.properties 2.1.6.RELEASE
Linux远程上传文件
#对拷文件夹 (包括文件夹本身) scp -r /home/slk root@192.168.1.5:/home # 对拷文件并重命名 scp /home/a.txt root@192.168.1.5 ...

cudaThreadSynchronize()

cudaThreadSynchronize()的更多相关文章

随机推荐

热门专题