Comparing Common Caffe Operators on CPU and GPU

The operators below are the ones that appear most often across LeNet, AlexNet, VGG16, GoogLeNet, ResNet, and a plain MLP (ReLU excluded). For each operator, the CPU and GPU implementations are shown side by side.
PReLU

CPU version:

for (int i = 0; i < count; ++i) {
  int c = (i / dim) % channels / div_factor;
  top_data[i] = std::max(bottom_data[i], Dtype(0))
      + slope_data[c] * std::min(bottom_data[i], Dtype(0));
}

GPU version (kernel code):

for (int index = blockIdx.x * blockDim.x + threadIdx.x;
     index < n; index += blockDim.x * gridDim.x) {
  int c = (index / dim) % channels / div_factor;
  out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
}
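For reference, the host side is a single grid-stride launch over the whole blob. This is reconstructed from memory of Caffe's prelu_layer.cu, so the exact parameter list should be treated as an assumption:

PReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, channels, dim, bottom_data, top_data, slope_data, div_factor);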
Im2col

CPU version:

const int channel_size = height * width;
for (int channel = channels; channel--; data_im += channel_size) {
  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
    for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
      int input_row = -pad_h + kernel_row * dilation_h;
      for (int output_rows = output_h; output_rows; output_rows--) {
        if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
          for (int output_cols = output_w; output_cols; output_cols--) {
            *(data_col++) = 0;
          }
        } else {
          int input_col = -pad_w + kernel_col * dilation_w;
          for (int output_col = output_w; output_col; output_col--) {
            if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
              *(data_col++) = data_im[input_row * width + input_col];
            } else {
              *(data_col++) = 0;
            }
            input_col += stride_w;
          }
        }
        input_row += stride_h;
      }
    }
  }
}

GPU version (kernel):

for (int index = blockIdx.x * blockDim.x + threadIdx.x;
     index < n; index += blockDim.x * gridDim.x) {
  const int h_index = index / width_col;
  const int h_col = h_index % height_col;
  const int w_col = index % width_col;
  const int c_im = h_index / height_col;
  const int c_col = c_im * kernel_h * kernel_w;
  const int h_offset = h_col * stride_h - pad_h;
  const int w_offset = w_col * stride_w - pad_w;
  Dtype* data_col_ptr = data_col;
  data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
  const Dtype* data_im_ptr = data_im;
  data_im_ptr += (c_im * height + h_offset) * width + w_offset;
  for (int i = 0; i < kernel_h; ++i) {
    for (int j = 0; j < kernel_w; ++j) {
      int h_im = h_offset + i * dilation_h;
      int w_im = w_offset + j * dilation_w;
      *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
          ? data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
      data_col_ptr += height_col * width_col;
    }
  }
}

Host code:

int num_kernels = channels * height_col * width_col;
im2col_gpu_kernel<Dtype><<<(num_kernels + 511) / 512, 512>>>(/* ... */);
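Im2col exists so that convolution can be expressed as one dense matrix multiply. A sketch of how the column buffer is then consumed, simplified from Caffe's base convolution layer with grouping ignored (the variable names are as recalled and should be treated as assumptions):

// weights: [conv_out_channels_ x kernel_dim_]
// col_buff: [kernel_dim_ x conv_out_spatial_dim_]
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_,
    conv_out_spatial_dim_, kernel_dim_, (Dtype)1., weights, col_buff,
    (Dtype)0., output);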
Pooling

CPU version (the AVE variant; MAX is analogous; the Stochastic variant is not covered here):

for (int n = 0; n < bottom[0]->num(); ++n) {
  for (int c = 0; c < channels_; ++c) {
    for (int ph = 0; ph < pooled_height_; ++ph) {
      for (int pw = 0; pw < pooled_width_; ++pw) {
        int hstart = ph * stride_h_ - pad_h_;
        int wstart = pw * stride_w_ - pad_w_;
        int hend = min(hstart + kernel_h_, height_);
        int wend = min(wstart + kernel_w_, width_);
        hstart = max(hstart, 0);
        wstart = max(wstart, 0);
        const int pool_index = ph * pooled_width_ + pw;
        for (int h = hstart; h < hend; ++h) {
          for (int w = wstart; w < wend; ++w) {
            // AVE: accumulate; the full code then divides
            // top_data[pool_index] by the pooling window size
            top_data[pool_index] += bottom_data[h * width_ + w];
          }
        }
      }
    }
    // compute offset
    bottom_data += bottom[0]->offset(0, 1);
    top_data += top[0]->offset(0, 1);
  }
}

GPU version (the excerpt below is the MAX-pooling backward kernel, which uses the pooling mask to route gradients back to the argmax locations):

const int w = index % width;
const int h = (index / width) % height;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart =
    (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
const int pwstart =
    (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
Dtype gradient = 0;
const int offset = (n * channels + c) * pooled_height * pooled_width;
const Dtype* const top_diff_slice = top_diff + offset;
if (mask) {
  const int* const mask_slice = mask + offset;
  for (int ph = phstart; ph < phend; ++ph) {
    for (int pw = pwstart; pw < pwend; ++pw) {
      if (mask_slice[ph * pooled_width + pw] == h * width + w) {
        gradient += top_diff_slice[ph * pooled_width + pw];
      }
    }
  }
}
bottom_diff[index] = gradient;
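The excerpt above covers only the backward direction. For a direct GPU counterpart to the CPU AVE code, here is a sketch of the AvePoolForward kernel body, reconstructed from memory of pooling_layer.cu (details such as the pad handling in hend/wend should be treated as assumptions). Note the inversion versus the CPU code: one thread owns one output element and loops over its window, rather than nested loops over all outputs:

const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
Dtype aveval = 0;
const Dtype* const bottom_slice =
    bottom_data + (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
  for (int w = wstart; w < wend; ++w) {
    aveval += bottom_slice[h * width + w];
  }
}
top_data[index] = aveval / pool_size;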
FC (InnerProduct)

CPU version:

caffe_cpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
    M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data);
if (bias_term_) {
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
      bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), (Dtype)1.,
      top_data);
}

GPU version: the same logic, with caffe_gpu_gemm in place of caffe_cpu_gemm.
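Concretely, that means the same two gemm calls against the GPU BLAS wrapper. This is a sketch mirroring the CPU code above; minor details may differ from inner_product_layer.cu:

caffe_gpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
    M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data);
if (bias_term_) {
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
      bias_multiplier_.gpu_data(), this->blobs_[1]->gpu_data(), (Dtype)1.,
      top_data);
}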
Dropout

CPU version:

// Create random numbers
caffe_rng_bernoulli(count, 1. - threshold_, mask);
for (int i = 0; i < count; ++i) {
  top_data[i] = bottom_data[i] * mask[i] * scale_;
}

GPU version:
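A sketch reconstructed from memory of Caffe's dropout_layer.cu (names such as rand_vec_ and uint_thres_ are as recalled and should be treated as assumptions). Unlike the CPU path, which draws Bernoulli samples, the GPU path fills the mask with uniform random unsigned ints and compares them against an integer threshold:

// Host side: generate the random mask, then launch one thread per element
unsigned int* mask =
    static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
caffe_gpu_rng_uniform(count, mask);
DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom_data, mask, uint_thres_, scale_, top_data);

// Kernel body: keep the element iff its random draw clears the threshold
out[index] = in[index] * (mask[index] > threshold) * scale;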
Softmax

CPU version:

for (int i = 0; i < outer_num_; ++i) {
  // initialize scale_data to the first plane
  caffe_copy(inner_num_, bottom_data + i * dim, scale_data);
  for (int j = 0; j < channels; j++) {
    for (int k = 0; k < inner_num_; k++) {
      scale_data[k] = std::max(scale_data[k],
          bottom_data[i * dim + j * inner_num_ + k]);
    }
  }
  // subtraction
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, inner_num_,
      1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
  // exponentiation
  caffe_exp<Dtype>(dim, top_data, top_data);
  // sum after exp
  caffe_cpu_gemv<Dtype>(CblasTrans, channels, inner_num_, 1., top_data,
      sum_multiplier_.cpu_data(), 0., scale_data);
  // division
  for (int j = 0; j < channels; j++) {
    caffe_div(inner_num_, top_data, scale_data, top_data);
    top_data += inner_num_;
  }
}

GPU version (kernel body, with kernel_channel_sum as the example):

int n = index / spatial_dim;
int s = index % spatial_dim;
Dtype sum = 0;
for (int c = 0; c < channels; ++c) {
  sum += data[(n * channels + c) * spatial_dim + s];
}
channel_sum[index] = sum;

Host code:

kernel_channel_max<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
    CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_,
    top_data, scale_data);
kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
    scale_data, top_data);
kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, top_data, top_data);
kernel_channel_sum<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
    CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_,
    top_data, scale_data);
kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
    scale_data, top_data);
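The other channel kernels follow the same indexing pattern. For instance, the body of kernel_channel_max is the same per-(n, s) loop with a running max in place of the sum (a sketch):

int n = index / spatial_dim;
int s = index % spatial_dim;
Dtype maxval = -FLT_MAX;
for (int c = 0; c < channels; ++c) {
  maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
}
out[index] = maxval;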
Sigmoid

CPU version:

for (int i = 0; i < count; ++i) {
  top_data[i] = sigmoid(bottom_data[i]);
}

inline Dtype sigmoid(Dtype x) {
  return 0.5 * tanh(0.5 * x) + 0.5;
}

GPU version (kernel body). Both versions use the identity sigmoid(x) = 0.5 * tanh(0.5x) + 0.5, which avoids computing exp(-x) directly:

out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5;
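The host side is a single element-wise launch (as recalled from sigmoid_layer.cu; treat the exact signature as an assumption):

SigmoidForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom_data, top_data);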
LRN

CPU version (this excerpt is the within-channel branch, which is built by composing existing layers):

split_layer_->Forward(bottom, split_top_vec_);
square_layer_->Forward(square_bottom_vec_, square_top_vec_);  // PowerLayer
pool_layer_->Forward(square_top_vec_, pool_top_vec_);
power_layer_->Forward(pool_top_vec_, power_top_vec_);
product_layer_->Forward(product_bottom_vec_, top);  // EltwiseLayer

GPU version: same as the CPU version.
Split

CPU version:

for (int i = 0; i < top.size(); ++i) {
  top[i]->ShareData(*bottom[0]);
}

ShareData is a zero-copy pointer share:

template <typename Dtype>
void Blob<Dtype>::ShareData(const Blob& other) {
  CHECK_EQ(count_, other.count());
  data_ = other.data();
}

GPU version: same as the CPU version.
Eltwise

CPU version (the MAX variant):

// Initialize
mask = max_idx_.mutable_cpu_data();
caffe_set(count, -1, mask);
caffe_set(count, Dtype(-FLT_MAX), top_data);
// bottom 0 & 1
bottom_data_a = bottom[0]->cpu_data();
bottom_data_b = bottom[1]->cpu_data();
for (int idx = 0; idx < count; ++idx) {
  if (bottom_data_a[idx] > bottom_data_b[idx]) {
    top_data[idx] = bottom_data_a[idx];  // maxval
    mask[idx] = 0;  // maxid
  } else {
    top_data[idx] = bottom_data_b[idx];  // maxval
    mask[idx] = 1;  // maxid
  }
}
// bottom 2 and up
for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {
  bottom_data_b = bottom[blob_idx]->cpu_data();
  for (int idx = 0; idx < count; ++idx) {
    if (bottom_data_b[idx] > top_data[idx]) {
      top_data[idx] = bottom_data_b[idx];  // maxval
      mask[idx] = blob_idx;  // maxid
    }
  }
}

GPU version (kernel body):

Dtype maxval = -FLT_MAX;
int maxidx = -1;
if (bottom_data_a[index] > bottom_data_b[index]) {
  // only update for the very first bottom_data blob (blob_idx == 0)
  if (blob_idx == 0) {
    maxval = bottom_data_a[index];
    top_data[index] = maxval;
    maxidx = blob_idx;
    mask[index] = maxidx;
  }
} else {
  maxval = bottom_data_b[index];
  top_data[index] = maxval;
  maxidx = blob_idx + 1;
  mask[index] = maxidx;
}

Host code:

mask = max_idx_.mutable_gpu_data();
MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
for (int i = 2; i < bottom.size(); ++i) {
  MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask);
}
SigmoidCrossEntropyLoss

CPU version:

for (int i = 0; i < bottom[0]->count(); ++i) {
  const int target_value = static_cast<int>(target[i]);
  if (has_ignore_label_ && target_value == ignore_label_) {
    continue;
  }
  loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
      log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
  ++valid_count;
}
// get_normalizer clamps its result to at least 1.0
normalizer_ = get_normalizer(normalization_, valid_count);
top[0]->mutable_cpu_data()[0] = loss / normalizer_;

GPU version (kernel body):

const int target_value = static_cast<int>(target[i]);
if (has_ignore_label_ && target_value == ignore_label_) {
  loss[i] = 0;
  counts[i] = 0;
} else {
  loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) -
      log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
  counts[i] = 1;
}

Host code:

SigmoidCrossEntropyLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data,
    has_ignore_label_, ignore_label_, count_data);
// Only launch another CUDA kernel if we actually need the valid count.
if (normalization_ == LossParameter_NormalizationMode_VALID &&
    has_ignore_label_) {
  caffe_gpu_asum(count, count_data, &valid_count);
} else {
  valid_count = count;
}
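Both the CPU loop and the kernel use the same numerically stable rewrite of the per-element log-likelihood t*x - log(1 + exp(x)): by folding in the sign of x, exp() never receives a large positive argument. An illustrative standalone helper (not Caffe code) demonstrating the identity:

#include <cmath>

// Equivalent to t * x - log(1 + exp(x)) for t in {0, 1}, but exp() only
// ever sees a non-positive argument, so it cannot overflow.
double stable_bce_ll(double x, double t) {
  return x * (t - (x >= 0)) - std::log1p(std::exp(x - 2 * x * (x >= 0)));
}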
Scale

CPU version:

for (int n = 0; n < outer_dim_; ++n) {
  for (int d = 0; d < scale_dim_; ++d) {
    const Dtype factor = scale_data[d];
    caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);
    bottom_data += inner_dim_;
    top_data += inner_dim_;
  }
}

caffe_cpu_scale copies X into Y and scales it, i.e.:

for (int i = 0; i < N; ++i) {
  Y[i] = alpha * X[i];
}

GPU version (kernel body):

const int scale_index = (index / inner_dim) % scale_dim;
out[index] = in[index] * scale[scale_index];
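The host side launches one thread per element of the output blob (as recalled from scale_layer.cu; the parameter list is an assumption):

ScaleForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom_data, scale_data, scale_dim_, inner_dim_, top_data);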
BatchNorm

CPU version: the code is long; a detailed analysis is still pending.

GPU version: the logic matches the CPU version; the only difference is that every math call becomes its GPU counterpart, e.g. caffe_gpu_gemm in place of caffe_cpu_gemm.
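Pending that analysis, here is a condensed sketch of what the inference-mode forward computes. This is the math only, not the actual Caffe code, which implements the same thing with gemm/gemv calls over per-channel buffers:

// y = (x - mean_c) / sqrt(var_c + eps), applied per channel c
for (int n = 0; n < num; ++n) {
  for (int c = 0; c < channels; ++c) {
    const Dtype m = mean[c];
    const Dtype inv_std = Dtype(1) / std::sqrt(var[c] + eps);
    for (int s = 0; s < spatial_dim; ++s) {
      const int idx = (n * channels + c) * spatial_dim + s;
      top_data[idx] = (bottom_data[idx] - m) * inv_std;
    }
  }
}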