bsxfun.h multiple threads backup
https://code.google.com/p/deep-learning-faces/source/browse/trunk/cuda_ut/include/bsxfun.h?r=7&spec=svn7
/*
Copyright (C) 2013 Yichuan Tang.
contact: tang at cs.toronto.edu
http://www.cs.toronto.edu/~tang This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _BSXFUN_H_
#define _BSXFUN_H_

#include <climits>
#include <cstddef>
#include <stdint.h>

#include "cu_util.h"
#include "cu_clmatrix.h"

/***********************************************************************************************************
* @brief: this kernel applies a binary operation op between a matrix and a column vector,
*         pairing every matrix element with the vector entry of the same row
* @param[in]: pA and pOut: nI by nJ matrix
* pVec is a column vector nI by 1
* nInJ is the total dimensionality of the matrix pA
*
* @param[out]:
* @topology: assumes a 1D block layout in x direction and covers the entire matrix pA
* @note: assume column-major
* @change:
* @tested:
* @to_do:
***********************************************************************************************************
*/
// Column-vector broadcast kernel: pOut[i] = op(pA[i], pVec[i % nI]) for every
// linear index i in [0, nInJ).
//
//   pA, pOut : matrix buffers of nInJ elements. They are deliberately not marked
//              __restrict__, because the host wrapper documents in-place use
//              (Out set to A).
//   pVec     : vector indexed by (i % nI), i.e. it must hold at least nI entries.
//   nI       : divisor used to map a linear index to a vector entry.
//   nJ       : not read by this kernel; kept so the signature stays unchanged.
//   nInJ     : number of elements to process.
//   op       : callable invoked as op(T, T) whose result is stored into pOut.
//
// Works with any 1D launch configuration: each thread starts at its global x
// index and advances by the total number of launched threads.
template<class O, typename T>
__global__ void bsxfun_colvec_1dkernel( const T* pA, const T* pVec, T* pOut,
										int nI, int nJ, int nInJ, O op)
{
	// Nothing to do for an empty matrix; this also keeps "% nI" from dividing by zero.
	if (nInJ <= 0 || nI <= 0)
		return;

	const unsigned int rowCount = static_cast<unsigned int>(nI);
	const size_t elemCount = static_cast<size_t>(nInJ);
	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	// The loop counter is wider than int so that "i += stride" cannot wrap around
	// when nInJ is close to INT_MAX (a wrapped counter would index out of bounds).
	for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
		 i < elemCount; i += stride)
	{
		// i < nInJ <= INT_MAX here, so narrowing is lossless and the modulo
		// stays a 32-bit operation.
		const unsigned int lin = static_cast<unsigned int>(i);
		pOut[i] = op(pA[i], pVec[lin % rowCount]);
	}
}

/***********************************************************************************************************
* @brief: this kernel applies a binary operation op between a matrix and a row vector,
*         pairing every matrix element with the vector entry of the same column
* @param[in]: pA and pOut: nI by nJ matrix
* pVec is a row vector 1 by nJ
* nInJ is the total dimensionality of the matrix pA
*
* @param[out]:
* @topology: assumes a 1D block layout in x direction and covers the entire matrix pA
* @note: assume column-major
* @change:
* @tested:
* @to_do:
***********************************************************************************************************
*/
// Row-vector broadcast kernel: pOut[i] = op(pA[i], pVec[i / nI]) for every
// linear index i in [0, nInJ).
//
//   pA, pOut : matrix buffers of nInJ elements. They are deliberately not marked
//              __restrict__, because the host wrapper documents in-place use
//              (Out set to A).
//   pVec     : vector indexed by (i / nI).
//   nI       : divisor used to map a linear index to a vector entry.
//   nJ       : not read by this kernel; kept so the signature stays unchanged.
//   nInJ     : number of elements to process.
//   op       : callable invoked as op(T, T) whose result is stored into pOut.
//
// Works with any 1D launch configuration: each thread starts at its global x
// index and advances by the total number of launched threads.
template<class O, typename T>
__global__ void bsxfun_rowvec_1dkernel( const T* pA, const T* pVec, T* pOut,
										int nI, int nJ, int nInJ, O op)
{
	// Nothing to do for an empty matrix; this also keeps "/ nI" from dividing by zero.
	if (nInJ <= 0 || nI <= 0)
		return;

	const unsigned int rowCount = static_cast<unsigned int>(nI);
	const size_t elemCount = static_cast<size_t>(nInJ);
	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	// The loop counter is wider than int so that "i += stride" cannot wrap around
	// when nInJ is close to INT_MAX (a wrapped counter would index out of bounds).
	for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
		 i < elemCount; i += stride)
	{
		// i < nInJ <= INT_MAX here, so narrowing is lossless and the division
		// stays a 32-bit operation.
		const unsigned int lin = static_cast<unsigned int>(i);
		pOut[i] = op(pA[i], pVec[lin / rowCount]);
	}
}

//alpha beta version
// Alpha/beta column-vector broadcast kernel:
//   pOut[i] = op(pA[i], alpha, pVec[i % nI], beta)   for every i in [0, nInJ).
//
//   alpha, beta : scalars forwarded unchanged to op as its 2nd and 4th argument.
//   pA, pOut    : matrix buffers of nInJ elements. They are deliberately not marked
//                 __restrict__, because the host wrapper documents in-place use.
//   pVec        : vector indexed by (i % nI).
//   nJ          : not read by this kernel; kept so the signature stays unchanged.
//   op          : callable invoked as op(T, T, T, T).
//
// Works with any 1D launch configuration (each thread strides by the total
// number of launched threads).
template<class O, typename T>
__global__ void bsxfun_colvec_1dkernel( T alpha, const T* pA, T beta, const T* pVec, T* pOut,
										int nI, int nJ, int nInJ, O op)
{
	// Nothing to do for an empty matrix; this also keeps "% nI" from dividing by zero.
	if (nInJ <= 0 || nI <= 0)
		return;

	const unsigned int rowCount = static_cast<unsigned int>(nI);
	const size_t elemCount = static_cast<size_t>(nInJ);
	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	// Wide loop counter: "i += stride" cannot wrap around when nInJ is close to INT_MAX.
	for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
		 i < elemCount; i += stride)
	{
		// i < nInJ <= INT_MAX, so narrowing is lossless and the modulo stays 32-bit.
		const unsigned int lin = static_cast<unsigned int>(i);
		pOut[i] = op(pA[i], alpha, pVec[lin % rowCount], beta);
	}
}

// Alpha/beta row-vector broadcast kernel:
//   pOut[i] = op(pA[i], alpha, pVec[i / nI], beta)   for every i in [0, nInJ).
//
// Parameters are the same as for the column-vector overload directly above in
// this block, except that pVec is indexed by (i / nI).
template<class O, typename T>
__global__ void bsxfun_rowvec_1dkernel( T alpha, const T * pA, T beta, const T* pVec, T* pOut,
										int nI, int nJ, int nInJ, O op)
{
	// Nothing to do for an empty matrix; this also keeps "/ nI" from dividing by zero.
	if (nInJ <= 0 || nI <= 0)
		return;

	const unsigned int rowCount = static_cast<unsigned int>(nI);
	const size_t elemCount = static_cast<size_t>(nInJ);
	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	// Wide loop counter: "i += stride" cannot wrap around when nInJ is close to INT_MAX.
	for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
		 i < elemCount; i += stride)
	{
		// i < nInJ <= INT_MAX, so narrowing is lossless and the division stays 32-bit.
		const unsigned int lin = static_cast<unsigned int>(i);
		pOut[i] = op(pA[i], alpha, pVec[lin / rowCount], beta);
	}
}

/***********************************************************************************************************
* @brief: function similar to bsxfun of matlab
* A op B ---> Out
* @param[in]: op - type of operation
* A - first matrix
* B - col/row vector, one dimension must be 1
* @param[out]:
if Out is set to A, the operation is inplace, overwrites A
*
* @topology:
* @note:
* @change:
* @tested:
* @to_do: switch to shared memory operators to see if we can achieve speedup?!
***********************************************************************************************************
*/
// Host wrapper: validates shapes, then launches the column- or row-vector
// broadcast kernel so that Out = A op B (Out may be A for in-place use).
//
// NOTE(review): every numeric literal of this function was missing from the
// recovered source text. The comparisons against 1, the "- 1" in the ceil
// division and the distinct return codes below are a reconstruction - confirm
// them against the callers before relying on specific values.
//
// Returns:
//    0  success (including an empty A, for which no kernel is launched)
//   -1  B is not a vector (neither dimension is 1)
//   -2  B's length does not match the corresponding dimension of A (and B is not 1x1)
//   -3  Out does not have the same shape as A
//   -4  B is 1x1 but A is neither a single row nor a single column
//   -5  A has more elements than the kernels' int element count can represent
//
// Kernel launch errors are not checked here; that is left to the caller.
template<class O, typename T>
int Bsxfun( const clMatrix<T>& A, O op, const clMatrix<T>& B, clMatrix<T>& Out){

	if (B.nI != 1 && B.nJ != 1)
		return -1;

	const bool bIsScalar = (B.nI == 1 && B.nJ == 1);
	if (!bIsScalar) { // the 1x1 special case is resolved further down
		if ( (B.nI == 1 && B.nJ != A.nJ) || (B.nJ == 1 && B.nI != A.nI) )
			return -2;
	}

	if (A.nI != Out.nI || A.nJ != Out.nJ)
		return -3;

	// Decide which kernel indexes B correctly.
	bool useColKernel;
	if (B.nJ == 1 && B.nI != 1) {
		useColKernel = true;
	} else if (B.nJ != 1 && B.nI == 1) {
		useColKernel = false;
	} else if (A.nI == 1) {	// B is 1x1: "i % 1" always selects B's only entry
		useColKernel = true;
	} else if (A.nJ == 1) {	// B is 1x1: "i / nI" is 0 for every i < nI
		useColKernel = false;
	} else {
		return -4; //invalid case
	}

	// Compute the element count in 64 bits so the product cannot overflow, then
	// make sure it fits the kernels' "int nInJ" parameter instead of truncating.
	const uint64_t datadim = static_cast<uint64_t>(A.nI) * static_cast<uint64_t>(A.nJ);
	if (datadim > static_cast<uint64_t>(INT_MAX))
		return -5;
	if (datadim == 0)
		return 0; // a grid of zero blocks is an invalid launch configuration

	dim3 dim_block( MEDIUM_NUM_THREADS );
	const uint64_t blocksNeeded = (datadim + dim_block.x - 1) / dim_block.x;
	dim3 dim_grid( static_cast<unsigned int>( MIN( static_cast<uint64_t>(MAX_GRIDS), blocksNeeded ) ) );

	const int nElems = static_cast<int>(datadim);
	if (useColKernel) {
		bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData,
														 A.nI, A.nJ, nElems, op );
	} else {
		bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>( A.pData, B.pData, Out.pData,
														 A.nI, A.nJ, nElems, op );
	}
	return 0;
}

//alpha beta version
// Alpha/beta host wrapper: validates shapes, then launches the column- or
// row-vector broadcast kernel that evaluates op(a, alpha, b, beta) per element
// and stores the result in Out (Out may be A for in-place use).
//
// NOTE(review): every numeric literal of this function was missing from the
// recovered source text. The comparisons against 1, the "- 1" in the ceil
// division and the distinct return codes below are a reconstruction - confirm
// them against the callers before relying on specific values.
//
// Returns:
//    0  success (including an empty A, for which no kernel is launched)
//   -1  B is not a vector (neither dimension is 1)
//   -2  B's length does not match the corresponding dimension of A (and B is not 1x1)
//   -3  Out does not have the same shape as A
//   -4  B is 1x1 but A is neither a single row nor a single column
//   -5  A has more elements than the kernels' int element count can represent
//
// Kernel launch errors are not checked here; that is left to the caller.
template<class O, typename T>
int Bsxfun(T alpha, const clMatrix<T>& A, O op, T beta, const clMatrix<T>& B, clMatrix<T>& Out){

	if (B.nI != 1 && B.nJ != 1)
		return -1;

	const bool bIsScalar = (B.nI == 1 && B.nJ == 1);
	if (!bIsScalar) { // the 1x1 special case is resolved further down
		if ( (B.nI == 1 && B.nJ != A.nJ) || (B.nJ == 1 && B.nI != A.nI) )
			return -2;
	}

	if (A.nI != Out.nI || A.nJ != Out.nJ)
		return -3;

	// Decide which kernel indexes B correctly.
	bool useColKernel;
	if (B.nJ == 1 && B.nI != 1) {
		useColKernel = true;
	} else if (B.nJ != 1 && B.nI == 1) {
		useColKernel = false;
	} else if (A.nI == 1) {	// B is 1x1: "i % 1" always selects B's only entry
		useColKernel = true;
	} else if (A.nJ == 1) {	// B is 1x1: "i / nI" is 0 for every i < nI
		useColKernel = false;
	} else {
		return -4; //invalid case
	}

	// Compute the element count in 64 bits so the product cannot overflow, then
	// make sure it fits the kernels' "int nInJ" parameter instead of truncating.
	const uint64_t datadim = static_cast<uint64_t>(A.nI) * static_cast<uint64_t>(A.nJ);
	if (datadim > static_cast<uint64_t>(INT_MAX))
		return -5;
	if (datadim == 0)
		return 0; // a grid of zero blocks is an invalid launch configuration

	dim3 dim_block( MEDIUM_NUM_THREADS );
	const uint64_t blocksNeeded = (datadim + dim_block.x - 1) / dim_block.x;
	dim3 dim_grid( static_cast<unsigned int>( MIN( static_cast<uint64_t>(MAX_GRIDS), blocksNeeded ) ) );

	const int nElems = static_cast<int>(datadim);
	if (useColKernel) {
		bsxfun_colvec_1dkernel<<<dim_grid, dim_block>>>( alpha, A.pData, beta, B.pData, Out.pData,
														 A.nI, A.nJ, nElems, op );
	} else {
		bsxfun_rowvec_1dkernel<<<dim_grid, dim_block>>>( alpha, A.pData, beta, B.pData, Out.pData,
														 A.nI, A.nJ, nElems, op );
	}
	return 0;
}

#endif
bsxfun.h multiple threads backup的更多相关文章
- caffe网络在多线程中无法使用GPU的解决方案 | cpp caffe net run in multiple threads
		
本文首发于个人博客https://kezunlin.me/post/8d877e63/,欢迎阅读! cpp caffe net run in multiple threads Guide set_mo ...
 - Multiple Threads reading from the same file(转载)
		
问 I have a xml file that needs to be read from many many times. I am trying to use the Parallel.ForE ...
 - Android 性能优化(16)线程优化:Creating a Manager for Multiple Threads 如何创建一个线程池管理类
		
Creating a Manager for Multiple Threads 1.You should also read Processes and Threads The previous le ...
 - 临界区代码  critical  section  Locks and critical sections in multiple threads
		
临界区 在同步的程序设计中,临界区段(Critical section)指的是一个访问共享资源(例如:共享设备或是共享存储器)的程序片段,而这些共享资源有无法同时被多个线程访问的特性. 当有线程进入临 ...
 - SQLite multiple threads
		
const int loops = 1000; public void DatabaseThreadSafetyTest() { var backgroundThread = new Thread(n ...
 - Hashtable insert failed. Load factor too high. The most common cause is multiple threads writing to the Hashtable simultaneously
		
暂时也没准确定位到问题 https://support.microsoft.com/zh-cn/help/2803754/hotfix-rollup-2803754-is-available-for- ...
 - PatentTips - Controlling TSC offsets for multiple cores and threads
		
BACKGROUND Many processors include a time stamp count (TSC) counter which is typically implemented a ...
 - Libevent源码学习笔记一:event2/event.h
		
一.libevent标准使用方法: 每个程序使用Libevent必须include <event2/event.h> 头文件,并 传给 -levent 链接器.如果只是想使用主要的eve ...
 - OpenMPI源码剖析4:rte.h 头文件的说明信息
		
上一篇文章中说道,我们在 rte.h 中发现了有价值的说明: 我们一块一块来分析,首先看到第一块,关于 Process name Object: * (a) Process name objects ...
 
随机推荐
- Python subprocess.Popen() error (No such file or directory)
			
这个错误很容易引起误解,一般人都会认为是命令执行了,但是命令找不到作为参数对应的文件或者目录.其实还有一层含义,就是这个命令找不到,命令找不到,也会报没有这个文件或者目录的错误. 为什么找不到这个命令 ...
 - AngularJS2 环境搭建:
			
AngularJS2 基础学习: 参考 mybase 3-26 文件 angular 环境的构建:( 由于 Angular 编写的代码不是 浏览器可以直接运行的,需要经过编译,所以需要构建一个环境) ...
 - fcrackzip (zip密码破解工具)
			
现在做一个例子,首先生成一个带有密码的zip的包 zip -P hujhh test.zip test1.txt test2,txt 可以看到密码是5位的纯字母 现在就用我们的这个软件开始破解 fcr ...
 - NOIP2017感想
			
说实话,这次刚刚看到题目的时候真的有点懵.尤其是第一天的第一题,浪费了太多的时间,一开始天真的以为10的9次方,会爆long long.然后就特别傻的写一个高精度,总觉得自己有哪些细节方面处理的不到位 ...
 - 使用 Docker 搭建 Java Web 运行环境(转)
			
原文 http://www.importnew.com/21798.html Docker 是 2014 年最为火爆的技术之一,几乎所有的程序员都听说过它.Docker 是一种“轻量级”容器技术,它几 ...
 - openresty lua 文件上传与删除
			
[1]openresty 上传upload源码库 Github:https://github.com/openresty/lua-resty-upload 源码文件upload.lua文件 [2]上传 ...
 - photoshop cc 安装失败 2%
			
photoshop cc 安装失败 2% C盘--Program Files---Common Files--Adobe--caps ,把这个文件夹中的文件全部删除,然后再安装 C:\Pr ...
 - [autocomplete]如果条目末尾有空格,MustMatch不起作用
			
如果mustMatch被激活,我们发现,当条目最后包含一个空格时,一旦我们从列表中选择值,它将被拒绝.我们已经发现了这个问题,它在搜索事件中:在第184行,您修剪了输入的值: $.each(trimW ...
 - 20190402 管道符,分号,单引号,双引号,&&的使用
			
:分号叫顺序执行 格式:命令:命令 && :前一条命令执行成功,后面命令继续执行:前面命令执行失败,后者不执行 格式:命令+空格&&空格+命令 || :两管道逻辑执行, ...
 - (转)Awesome Human Pose Estimation
			
Awesome Human Pose Estimation 2018-10-08 11:02:35 Copied from: https://github.com/cbsudux/awesome-hu ...