bsxfun.h multiple threads backup

https://code.google.com/p/deep-learning-faces/source/browse/trunk/cuda_ut/include/bsxfun.h?r=7&spec=svn7

/*

Copyright (C) 2013 Yichuan Tang.

contact: tang at cs.toronto.edu

http://www.cs.toronto.edu/~tang

This program is free software: you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation, either version 3 of the License, or

(at your option) any later version.

This program is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef _BSXFUN_H_

#define _BSXFUN_H_

#include "cu_util.h"

#include "cu_clmatrix.h"

/***********************************************************************************************************
 * @brief:      element-wise operation between a matrix and a column vector that is
 *              broadcast across every column:  pOut[i] = op(pA[i], pVec[i % nI])
 * @param[in]:  pA    - nI by nJ input matrix
 *              pVec  - column vector, nI by 1
 *              nI    - number of rows of pA and pOut
 *              nJ    - number of columns of pA and pOut (not read by the kernel)
 *              nInJ  - total number of elements of pA (nI*nJ)
 *              op    - functor invoked as op(matrix_element, vector_element)
 * @param[out]: pOut  - nI by nJ result. Every element is read from pA and written to pOut at
 *                      the same index by the same thread, so pOut may be the same buffer as pA.
 * @topology:   1D blocks in a 1D grid (x direction). The grid does not have to cover the whole
 *              matrix: each thread advances by the total number of launched threads until it
 *              runs past nInJ.
 * @note:       assume column-major
 ***********************************************************************************************************
 */
template<class O, typename T>
__global__ void bsxfun_colvec_1dkernel( const T* pA,  const T* pVec, T* pOut,
                                        int nI, int nJ, int nInJ, O op)
{
        // 64-bit index and stride: with a 32-bit signed index, "i += stride" can wrap to a
        // negative value once nInJ gets close to INT_MAX, which still satisfies i < nInJ and
        // then reads/writes out of bounds.
        const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
        long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

        for (; i < nInJ; i += stride) {
                // 0 <= i < nInJ <= INT_MAX here, so the row index can be computed with the
                // cheaper 32-bit modulo.
                const int row = static_cast<int>(i) % nI;
                pOut[i] = op(pA[i], pVec[row]);
        }
}

/***********************************************************************************************************
 * @brief:      element-wise operation between a matrix and a row vector that is
 *              broadcast across every row:  pOut[i] = op(pA[i], pVec[i / nI])
 * @param[in]:  pA    - nI by nJ input matrix
 *              pVec  - row vector, 1 by nJ
 *              nI    - number of rows of pA and pOut
 *              nJ    - number of columns of pA and pOut (not read by the kernel)
 *              nInJ  - total number of elements of pA (nI*nJ)
 *              op    - functor invoked as op(matrix_element, vector_element)
 * @param[out]: pOut  - nI by nJ result. Every element is read from pA and written to pOut at
 *                      the same index by the same thread, so pOut may be the same buffer as pA.
 * @topology:   1D blocks in a 1D grid (x direction). The grid does not have to cover the whole
 *              matrix: each thread advances by the total number of launched threads until it
 *              runs past nInJ.
 * @note:       assume column-major
 ***********************************************************************************************************
 */
template<class O, typename T>
__global__ void bsxfun_rowvec_1dkernel(  const T* pA,  const T* pVec, T* pOut,
                                         int nI, int nJ, int nInJ, O op)
{
        // 64-bit index and stride: with a 32-bit signed index, "i += stride" can wrap to a
        // negative value once nInJ gets close to INT_MAX, which still satisfies i < nInJ and
        // then reads/writes out of bounds.
        const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
        long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

        for (; i < nInJ; i += stride) {
                // 0 <= i < nInJ <= INT_MAX here, so the column index can be computed with the
                // cheaper 32-bit division.
                const int col = static_cast<int>(i) / nI;
                pOut[i] = op(pA[i], pVec[col]);
        }
}

// alpha beta version
//
// Matrix / column-vector broadcast where the functor also receives two scalars:
//     pOut[i] = op(pA[i], alpha, pVec[i % nI], beta)
// How alpha and beta are combined with the two elements is decided entirely by op.
//
//   alpha, beta - scalars forwarded unchanged to op
//   pA          - nI by nJ input matrix (column-major)
//   pVec        - column vector, nI by 1
//   pOut        - nI by nJ result; may be the same buffer as pA, because each element is
//                 read and written at the same index by the same thread
//   nI, nJ      - rows / columns of pA (nJ is not read by the kernel)
//   nInJ        - total number of elements of pA (nI*nJ)
//
// Launch with 1D blocks in a 1D grid; any grid size works because each thread advances by
// the total number of launched threads.
template<class O, typename T>
__global__ void bsxfun_colvec_1dkernel( T alpha, const T* pA,  T beta, const T* pVec, T* pOut,
                                        int nI, int nJ, int nInJ, O op)
{
        // 64-bit index and stride: with a 32-bit signed index, "i += stride" can wrap to a
        // negative value once nInJ gets close to INT_MAX, which still satisfies i < nInJ and
        // then reads/writes out of bounds.
        const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
        long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

        for (; i < nInJ; i += stride) {
                // 0 <= i < nInJ <= INT_MAX here, so a 32-bit modulo is sufficient.
                const int row = static_cast<int>(i) % nI;
                pOut[i] = op(pA[i], alpha, pVec[row], beta);
        }
}

// Matrix / row-vector broadcast where the functor also receives two scalars:
//     pOut[i] = op(pA[i], alpha, pVec[i / nI], beta)
// How alpha and beta are combined with the two elements is decided entirely by op.
//
//   alpha, beta - scalars forwarded unchanged to op
//   pA          - nI by nJ input matrix (column-major)
//   pVec        - row vector, 1 by nJ
//   pOut        - nI by nJ result; may be the same buffer as pA, because each element is
//                 read and written at the same index by the same thread
//   nI, nJ      - rows / columns of pA (nJ is not read by the kernel)
//   nInJ        - total number of elements of pA (nI*nJ)
//
// Launch with 1D blocks in a 1D grid; any grid size works because each thread advances by
// the total number of launched threads.
template<class O, typename T>
__global__ void bsxfun_rowvec_1dkernel(  T alpha, const T * pA,  T beta, const T* pVec, T* pOut,
                                         int nI, int nJ, int nInJ, O op)
{
        // 64-bit index and stride: with a 32-bit signed index, "i += stride" can wrap to a
        // negative value once nInJ gets close to INT_MAX, which still satisfies i < nInJ and
        // then reads/writes out of bounds.
        const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
        long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

        for (; i < nInJ; i += stride) {
                // 0 <= i < nInJ <= INT_MAX here, so a 32-bit division is sufficient.
                const int col = static_cast<int>(i) / nI;
                pOut[i] = op(pA[i], alpha, pVec[col], beta);
        }
}

/***********************************************************************************************************
* @brief: function similar to bsxfun of matlab
* A op B ---> Out
* @param[in]:   A   - first matrix
*               op  - type of operation, invoked per element as op(a, b)
*               B   - col/row vector, one dimension must be 1.
*                     A column vector must have A.nI rows, a row vector must have A.nJ columns.
*                     A 1x1 B is accepted only when A itself has a single row or a single column.
* @param[out]:  Out - same size as A.
*                     if Out is set to A, the operation is inplace, overwrites A
* @return:      0   on success
*               -1  B is not a vector (neither dimension is 1)
*               -2  the length of B does not match the corresponding dimension of A
*               -3  Out does not have the same size as A
*               -4  B is 1x1 but A is neither a single row nor a single column
* @topology:    1D blocks of MEDIUM_NUM_THREADS threads, at most MAX_GRIDS blocks
* @note:        NOTE(review): the numeric literals of this function were missing from the file
*               as received. The comparisons against 1, the ceil-division "- 1" and the
*               "return 0" follow from the surrounding logic; the distinct negative error
*               codes are a reconstruction - confirm them against existing callers.
* @note:        the kernel launch status is not checked here.
* @to_do:       switch to shared memory operators to see if we can achieve speedup?!
***********************************************************************************************************
*/
template<class O, typename T>
int Bsxfun( const clMatrix<T>& A, O op, const clMatrix<T>& B, clMatrix<T>& Out){

        // B has to be a vector: at least one of its dimensions is 1.
        if (! (B.nI == 1 || B.nJ == 1) )
                return -1;

        // The non-unit dimension of B has to match A; a 1x1 B is exempt and handled below.
        if ( ( B.nI == 1 && B.nJ != A.nJ) || ( B.nJ == 1 && B.nI != A.nI) ){
                if (!(B.nI == 1 && B.nJ == 1))  //special case
                        return -2;
        }

        if ( A.nI != Out.nI || A.nJ != Out.nJ)
                return -3;

        // Pick the kernel before touching the device so every validation error is reported
        // regardless of the matrix size.
        bool bColumnKernel;
        if (B.nJ == 1 && B.nI != 1){
                bColumnKernel = true;           // B is nI x 1
        }else if (B.nJ != 1 && B.nI == 1){
                bColumnKernel = false;          // B is 1 x nJ
        }else{ // when B is 1x1
                if (A.nI == 1){
                        bColumnKernel = true;   // i % 1 == 0: every element uses B[0]
                }else if (A.nJ == 1){
                        bColumnKernel = false;  // i / nI == 0 for all i < nI: every element uses B[0]
                }else{
                        return -4;  //invalid case
                }
        }

        const unsigned int datadim = A.nJ*A.nI;

        // Nothing to compute for an empty matrix; a grid of zero blocks would be an invalid
        // launch configuration.
        if (datadim == 0)
                return 0;

        dim3 dim_block( MEDIUM_NUM_THREADS );
        // ceil(datadim / threads per block), capped at MAX_GRIDS; the kernels loop over any
        // elements the capped grid does not cover directly.
        dim3 dim_grid( MIN( MAX_GRIDS, (datadim + dim_block.x-1)/dim_block.x) );

        if (bColumnKernel){
                bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData,
                                                                 A.nI, A.nJ, datadim, op);
        }else{
                bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData,
                                                                 A.nI, A.nJ, datadim, op );
        }

        return 0;
}

//alpha beta version
//
// Same broadcast as the four-argument Bsxfun, but the functor additionally receives the two
// scalars and is invoked per element as op(a, alpha, b, beta).
//
//   alpha, beta - scalars forwarded unchanged to op
//   A           - first matrix
//   op          - type of operation
//   B           - col/row vector, one dimension must be 1. A column vector must have A.nI rows,
//                 a row vector must have A.nJ columns. A 1x1 B is accepted only when A itself
//                 has a single row or a single column.
//   Out         - same size as A; if Out is set to A the operation is inplace
//
// Returns 0 on success, -1 if B is not a vector, -2 if the length of B does not match A,
// -3 if Out does not have the size of A, -4 if B is 1x1 but A is not a single row/column.
//
// NOTE(review): the numeric literals of this function were missing from the file as received.
// The comparisons against 1, the ceil-division "- 1" and the "return 0" follow from the
// surrounding logic; the distinct negative error codes are a reconstruction - confirm them
// against existing callers.
// NOTE(review): datadim is 64-bit here but the kernels take the element count as int, so it is
// narrowed at the launch - confirm matrices never exceed INT_MAX elements.
// The kernel launch status is not checked here.
template<class O, typename T>
int Bsxfun(T alpha, const clMatrix<T>& A, O op, T beta, const clMatrix<T>& B, clMatrix<T>& Out){

        // B has to be a vector: at least one of its dimensions is 1.
        if (! (B.nI == 1 || B.nJ == 1) )
                return -1;

        // The non-unit dimension of B has to match A; a 1x1 B is exempt and handled below.
        if ( ( B.nI == 1 && B.nJ != A.nJ) || ( B.nJ == 1 && B.nI != A.nI) ){
                if (!(B.nI == 1 && B.nJ == 1))  //special case
                        return -2;
        }

        if ( A.nI != Out.nI || A.nJ != Out.nJ)
                return -3;

        // Pick the kernel before touching the device so every validation error is reported
        // regardless of the matrix size.
        bool bColumnKernel;
        if (B.nJ == 1 && B.nI != 1){
                bColumnKernel = true;           // B is nI x 1
        }else if (B.nJ != 1 && B.nI == 1){
                bColumnKernel = false;          // B is 1 x nJ
        }else{ // when B is 1x1
                if (A.nI == 1){
                        bColumnKernel = true;   // i % 1 == 0: every element uses B[0]
                }else if (A.nJ == 1){
                        bColumnKernel = false;  // i / nI == 0 for all i < nI: every element uses B[0]
                }else{
                        return -4;  //invalid case
                }
        }

        const uint64_t datadim = A.nJ*A.nI;

        // Nothing to compute for an empty matrix; a grid of zero blocks would be an invalid
        // launch configuration.
        if (datadim == 0)
                return 0;

        dim3 dim_block( MEDIUM_NUM_THREADS );
        // ceil(datadim / threads per block), capped at MAX_GRIDS; the kernels loop over any
        // elements the capped grid does not cover directly.
        dim3 dim_grid( MIN( MAX_GRIDS, (datadim + dim_block.x-1)/dim_block.x) );

        if (bColumnKernel){
                bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>( alpha, A.pData, beta, B.pData, Out.pData,
                                                                 A.nI, A.nJ, datadim, op);
        }else{
                bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>( alpha, A.pData, beta, B.pData, Out.pData,
                                                                 A.nI, A.nJ, datadim, op );
        }

        return 0;
}

#endif

bsxfun.h multiple threads backup的更多相关文章

caffe网络在多线程中无法使用GPU的解决方案 | cpp caffe net run in multiple threads
本文首发于个人博客https://kezunlin.me/post/8d877e63/,欢迎阅读! cpp caffe net run in multiple threads Guide set_mo ...
Multiple Threads reading from the same file（转载）
问 I have a xml file that needs to be read from many many times. I am trying to use the Parallel.ForE ...
Android 性能优化（16）线程优化：Creating a Manager for Multiple Threads 如何创建一个线程池管理类
Creating a Manager for Multiple Threads 1.You should also read Processes and Threads The previous le ...
临界区代码 critical section Locks and critical sections in multiple threads
临界区在同步的程序设计中,临界区段(Critical section)指的是一个访问共享资源(例如:共享设备或是共享存储器)的程序片段,而这些共享资源有无法同时被多个线程访问的特性. 当有线程进入临 ...
SQLite multiple threads
const int loops = 1000; public void DatabaseThreadSafetyTest() { var backgroundThread = new Thread(n ...
Hashtable insert failed. Load factor too high. The most common cause is multiple threads writing to the Hashtable simultaneously
暂时也没准确定位到问题 https://support.microsoft.com/zh-cn/help/2803754/hotfix-rollup-2803754-is-available-for- ...
PatentTips - Controlling TSC offsets for multiple cores and threads
BACKGROUND Many processors include a time stamp count (TSC) counter which is typically implemented a ...
Libevent源码学习笔记一：event2/event.h
一.libevent标准使用方法: 每个程序使用Libevent必须include <event2/event.h> 头文件,并传给 -levent 链接器.如果只是想使用主要的eve ...
OpenMPI源码剖析4：rte.h 头文件的说明信息
上一篇文章中说道,我们在 rte.h 中发现了有价值的说明: 我们一块一块来分析,首先看到第一块,关于 Process name Object: * (a) Process name objects ...

随机推荐

GDB查看堆栈局部变量
GDB查看堆栈局部变量 “参数从右到左入栈”,“局部变量在栈上分配空间”,听的耳朵都起茧子了.最近做项目涉及C和汇编互相调用,写代码的时候才发现没真正弄明白.自己写了个最简单的函数,用gdb跟踪了调用 ...
[js]es6语法: 字符串和数组的方法
s的方法根据index取value: 取首尾项,arr[0], arr[arr.length-1] 根据value取index(判断是否包含子字符串): s.indexOf 栗子: 'maotai' ...
python基础(十三) cmd命令调用
python cmd命令调用关于python调用cmd命令: 主要介绍两种方式: 1.python的OS模块. OS模块调用CMD命令有两种方式:os.popen(),os.system(). 都是 ...
[Android] websocket客户端开发
为了能够在H5和APP都保持同一套长连接接口,因为采用websocket协议作为开发使用的第三方库是:https://github.com/TakahikoKawasaki/nv-websocket ...
CFRunLoop 源码学习笔记(CF-1151.16)
1.CFRunLoopModeRef 什么时候创建的? 在调用__CFRunLoopFindMode(rl, modeName, create) 1.1)首先通过modeName 在RunLoop 中 ...
WinSDK(菜单笔记)
3D打印机如何添加自动调平功能
原理说明 Kossel/Rostock等Delta(并联/三角洲)类型的机器,可以参考:http://learn.makerlab.me/guides/11 3d打印打印时最重要的是第一层的效果,如果 ...
AES,BigInteger,MD5加密
http://tool.oschina.net/apidocs/apidoc?api=jdk-zh package cn.com.gome.cashier.web; import java.lang. ...
sublime text 入门
sublime text3入门教程 2017年07月19日 09:15:51 阅读数:13736 作者:sam976 转载需征得作者本人同意,谢谢. 1.介绍所谓工欲善其事必先利其器,编码过程合理熟 ...
ELK学习笔记之基于kakfa (confluent)搭建ELK
0x00 概述测试搭建一个使用kafka作为消息队列的ELK环境,数据采集转换实现结构如下: F5 HSL–>logstash(流处理)–> kafka –>elasticsear ...

bsxfun.h multiple threads backup

bsxfun.h multiple threads backup的更多相关文章

随机推荐

热门专题