Comparing Common Caffe Operators on CPU and GPU

The operators below are the ones that appear most often across LeNet, AlexNet, VGG16, GoogLeNet, ResNet, and a plain MLP (ReLU excluded). For each operator, the CPU and GPU implementations are shown side by side.
PReLU

CPU version:

for (int i = 0; i < count; ++i) {
  int c = (i / dim) % channels / div_factor;
  top_data[i] = std::max(bottom_data[i], Dtype(0))
      + slope_data[c] * std::min(bottom_data[i], Dtype(0));
}

GPU version (kernel code):

for (int index = blockIdx.x * blockDim.x + threadIdx.x;
     index < n; index += blockDim.x * gridDim.x) {
  int c = (index / dim) % channels / div_factor;
  out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
}
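For reference, the host side is a single grid-stride launch over the whole blob. This is reconstructed from memory of Caffe's prelu_layer.cu, so the exact parameter list should be treated as an assumption:

PReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, channels, dim, bottom_data, top_data, slope_data, div_factor);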
Im2col

CPU version:

const int channel_size = height * width;
for (int channel = channels; channel--; data_im += channel_size) {
  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
    for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
      int input_row = -pad_h + kernel_row * dilation_h;
      for (int output_rows = output_h; output_rows; output_rows--) {
        if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
          for (int output_cols = output_w; output_cols; output_cols--) {
            *(data_col++) = 0;
          }
        } else {
          int input_col = -pad_w + kernel_col * dilation_w;
          for (int output_col = output_w; output_col; output_col--) {
            if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
              *(data_col++) = data_im[input_row * width + input_col];
            } else {
              *(data_col++) = 0;
            }
            input_col += stride_w;
          }
        }
        input_row += stride_h;
      }
    }
  }
}

GPU version (kernel):

for (int index = blockIdx.x * blockDim.x + threadIdx.x;
     index < n; index += blockDim.x * gridDim.x) {
  const int h_index = index / width_col;
  const int h_col = h_index % height_col;
  const int w_col = index % width_col;
  const int c_im = h_index / height_col;
  const int c_col = c_im * kernel_h * kernel_w;
  const int h_offset = h_col * stride_h - pad_h;
  const int w_offset = w_col * stride_w - pad_w;
  Dtype* data_col_ptr = data_col;
  data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
  const Dtype* data_im_ptr = data_im;
  data_im_ptr += (c_im * height + h_offset) * width + w_offset;
  for (int i = 0; i < kernel_h; ++i) {
    for (int j = 0; j < kernel_w; ++j) {
      int h_im = h_offset + i * dilation_h;
      int w_im = w_offset + j * dilation_w;
      *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
          ? data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
      data_col_ptr += height_col * width_col;
    }
  }
}

Host code:

int num_kernels = channels * height_col * width_col;
im2col_gpu_kernel<Dtype><<<(num_kernels + 511) / 512, 512>>>(/* ... */);
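Im2col exists so that convolution can be expressed as one dense matrix multiply. A sketch of how the column buffer is then consumed, simplified from Caffe's base convolution layer with grouping ignored (the variable names are as recalled and should be treated as assumptions):

// weights: [conv_out_channels_ x kernel_dim_]
// col_buff: [kernel_dim_ x conv_out_spatial_dim_]
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_,
    conv_out_spatial_dim_, kernel_dim_, (Dtype)1., weights, col_buff,
    (Dtype)0., output);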
Pooling

CPU version (the AVE variant; MAX is analogous; the Stochastic variant is not covered here):

for (int n = 0; n < bottom[0]->num(); ++n) {
  for (int c = 0; c < channels_; ++c) {
    for (int ph = 0; ph < pooled_height_; ++ph) {
      for (int pw = 0; pw < pooled_width_; ++pw) {
        int hstart = ph * stride_h_ - pad_h_;
        int wstart = pw * stride_w_ - pad_w_;
        int hend = min(hstart + kernel_h_, height_);
        int wend = min(wstart + kernel_w_, width_);
        hstart = max(hstart, 0);
        wstart = max(wstart, 0);
        const int pool_index = ph * pooled_width_ + pw;
        for (int h = hstart; h < hend; ++h) {
          for (int w = wstart; w < wend; ++w) {
            // AVE: accumulate; the full code then divides
            // top_data[pool_index] by the pooling window size
            top_data[pool_index] += bottom_data[h * width_ + w];
          }
        }
      }
    }
    // compute offset
    bottom_data += bottom[0]->offset(0, 1);
    top_data += top[0]->offset(0, 1);
  }
}

GPU version (the excerpt below is the MAX-pooling backward kernel, which uses the pooling mask to route gradients back to the argmax locations):

const int w = index % width;
const int h = (index / width) % height;
const int c = (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart =
    (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
const int pwstart =
    (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
Dtype gradient = 0;
const int offset = (n * channels + c) * pooled_height * pooled_width;
const Dtype* const top_diff_slice = top_diff + offset;
if (mask) {
  const int* const mask_slice = mask + offset;
  for (int ph = phstart; ph < phend; ++ph) {
    for (int pw = pwstart; pw < pwend; ++pw) {
      if (mask_slice[ph * pooled_width + pw] == h * width + w) {
        gradient += top_diff_slice[ph * pooled_width + pw];
      }
    }
  }
}
bottom_diff[index] = gradient;
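The excerpt above covers only the backward direction. For a direct GPU counterpart to the CPU AVE code, here is a sketch of the AvePoolForward kernel body, reconstructed from memory of pooling_layer.cu (details such as the pad handling in hend/wend should be treated as assumptions). Note the inversion versus the CPU code: one thread owns one output element and loops over its window, rather than nested loops over all outputs:

const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
Dtype aveval = 0;
const Dtype* const bottom_slice =
    bottom_data + (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
  for (int w = wstart; w < wend; ++w) {
    aveval += bottom_slice[h * width + w];
  }
}
top_data[index] = aveval / pool_size;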
FC (InnerProduct)

CPU version:

caffe_cpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
    M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data);
if (bias_term_) {
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
      bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), (Dtype)1.,
      top_data);
}

GPU version: the same logic, with caffe_gpu_gemm in place of caffe_cpu_gemm.
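Concretely, that means the same two gemm calls against the GPU BLAS wrapper. This is a sketch mirroring the CPU code above; minor details may differ from inner_product_layer.cu:

caffe_gpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
    M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data);
if (bias_term_) {
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
      bias_multiplier_.gpu_data(), this->blobs_[1]->gpu_data(), (Dtype)1.,
      top_data);
}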
Dropout

CPU version:

// Create random numbers
caffe_rng_bernoulli(count, 1. - threshold_, mask);
for (int i = 0; i < count; ++i) {
  top_data[i] = bottom_data[i] * mask[i] * scale_;
}

GPU version:
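A sketch reconstructed from memory of Caffe's dropout_layer.cu (names such as rand_vec_ and uint_thres_ are as recalled and should be treated as assumptions). Unlike the CPU path, which draws Bernoulli samples, the GPU path fills the mask with uniform random unsigned ints and compares them against an integer threshold:

// Host side: generate the random mask, then launch one thread per element
unsigned int* mask =
    static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
caffe_gpu_rng_uniform(count, mask);
DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom_data, mask, uint_thres_, scale_, top_data);

// Kernel body: keep the element iff its random draw clears the threshold
out[index] = in[index] * (mask[index] > threshold) * scale;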
Softmax

CPU version:

for (int i = 0; i < outer_num_; ++i) {
  // initialize scale_data to the first plane
  caffe_copy(inner_num_, bottom_data + i * dim, scale_data);
  for (int j = 0; j < channels; j++) {
    for (int k = 0; k < inner_num_; k++) {
      scale_data[k] = std::max(scale_data[k],
          bottom_data[i * dim + j * inner_num_ + k]);
    }
  }
  // subtraction
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, inner_num_,
      1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
  // exponentiation
  caffe_exp<Dtype>(dim, top_data, top_data);
  // sum after exp
  caffe_cpu_gemv<Dtype>(CblasTrans, channels, inner_num_, 1., top_data,
      sum_multiplier_.cpu_data(), 0., scale_data);
  // division
  for (int j = 0; j < channels; j++) {
    caffe_div(inner_num_, top_data, scale_data, top_data);
    top_data += inner_num_;
  }
}

GPU version (kernel body, with kernel_channel_sum as the example):

int n = index / spatial_dim;
int s = index % spatial_dim;
Dtype sum = 0;
for (int c = 0; c < channels; ++c) {
  sum += data[(n * channels + c) * spatial_dim + s];
}
channel_sum[index] = sum;

Host code:

kernel_channel_max<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
    CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_,
    top_data, scale_data);
kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
    scale_data, top_data);
kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, top_data, top_data);
kernel_channel_sum<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
    CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_,
    top_data, scale_data);
kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
    scale_data, top_data);
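The other channel kernels follow the same indexing pattern. For instance, the body of kernel_channel_max is the same per-(n, s) loop with a running max in place of the sum (a sketch):

int n = index / spatial_dim;
int s = index % spatial_dim;
Dtype maxval = -FLT_MAX;
for (int c = 0; c < channels; ++c) {
  maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
}
out[index] = maxval;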
Sigmoid

CPU version:

for (int i = 0; i < count; ++i) {
  top_data[i] = sigmoid(bottom_data[i]);
}

inline Dtype sigmoid(Dtype x) {
  return 0.5 * tanh(0.5 * x) + 0.5;
}

GPU version (kernel body). Both versions use the identity sigmoid(x) = 0.5 * tanh(0.5x) + 0.5, which avoids computing exp(-x) directly:

out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5;
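The host side is a single element-wise launch (as recalled from sigmoid_layer.cu; treat the exact signature as an assumption):

SigmoidForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom_data, top_data);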
LRN

CPU version (this excerpt is the within-channel branch, which is built by composing existing layers):

split_layer_->Forward(bottom, split_top_vec_);
square_layer_->Forward(square_bottom_vec_, square_top_vec_);  // PowerLayer
pool_layer_->Forward(square_top_vec_, pool_top_vec_);
power_layer_->Forward(pool_top_vec_, power_top_vec_);
product_layer_->Forward(product_bottom_vec_, top);  // EltwiseLayer

GPU version: same as the CPU version.
Split

CPU version:

for (int i = 0; i < top.size(); ++i) {
  top[i]->ShareData(*bottom[0]);
}

ShareData is a zero-copy pointer share:

template <typename Dtype>
void Blob<Dtype>::ShareData(const Blob& other) {
  CHECK_EQ(count_, other.count());
  data_ = other.data();
}

GPU version: same as the CPU version.
Eltwise

CPU version (the MAX variant):

// Initialize
mask = max_idx_.mutable_cpu_data();
caffe_set(count, -1, mask);
caffe_set(count, Dtype(-FLT_MAX), top_data);
// bottom 0 & 1
bottom_data_a = bottom[0]->cpu_data();
bottom_data_b = bottom[1]->cpu_data();
for (int idx = 0; idx < count; ++idx) {
  if (bottom_data_a[idx] > bottom_data_b[idx]) {
    top_data[idx] = bottom_data_a[idx];  // maxval
    mask[idx] = 0;  // maxid
  } else {
    top_data[idx] = bottom_data_b[idx];  // maxval
    mask[idx] = 1;  // maxid
  }
}
// bottom 2 and up
for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {
  bottom_data_b = bottom[blob_idx]->cpu_data();
  for (int idx = 0; idx < count; ++idx) {
    if (bottom_data_b[idx] > top_data[idx]) {
      top_data[idx] = bottom_data_b[idx];  // maxval
      mask[idx] = blob_idx;  // maxid
    }
  }
}

GPU version (kernel body):

Dtype maxval = -FLT_MAX;
int maxidx = -1;
if (bottom_data_a[index] > bottom_data_b[index]) {
  // only update for the very first bottom_data blob (blob_idx == 0)
  if (blob_idx == 0) {
    maxval = bottom_data_a[index];
    top_data[index] = maxval;
    maxidx = blob_idx;
    mask[index] = maxidx;
  }
} else {
  maxval = bottom_data_b[index];
  top_data[index] = maxval;
  maxidx = blob_idx + 1;
  mask[index] = maxidx;
}

Host code:

mask = max_idx_.mutable_gpu_data();
MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
for (int i = 2; i < bottom.size(); ++i) {
  MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask);
}
SigmoidCrossEntropyLoss

CPU version:

for (int i = 0; i < bottom[0]->count(); ++i) {
  const int target_value = static_cast<int>(target[i]);
  if (has_ignore_label_ && target_value == ignore_label_) {
    continue;
  }
  loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
      log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
  ++valid_count;
}
// get_normalizer clamps its result to at least 1.0
normalizer_ = get_normalizer(normalization_, valid_count);
top[0]->mutable_cpu_data()[0] = loss / normalizer_;

GPU version (kernel body):

const int target_value = static_cast<int>(target[i]);
if (has_ignore_label_ && target_value == ignore_label_) {
  loss[i] = 0;
  counts[i] = 0;
} else {
  loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) -
      log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
  counts[i] = 1;
}

Host code:

SigmoidCrossEntropyLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
    CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data,
    has_ignore_label_, ignore_label_, count_data);
// Only launch another CUDA kernel if we actually need the valid count.
if (normalization_ == LossParameter_NormalizationMode_VALID &&
    has_ignore_label_) {
  caffe_gpu_asum(count, count_data, &valid_count);
} else {
  valid_count = count;
}
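Both the CPU loop and the kernel use the same numerically stable rewrite of the per-element log-likelihood t*x - log(1 + exp(x)): by folding in the sign of x, exp() never receives a large positive argument. An illustrative standalone helper (not Caffe code) demonstrating the identity:

#include <cmath>

// Equivalent to t * x - log(1 + exp(x)) for t in {0, 1}, but exp() only
// ever sees a non-positive argument, so it cannot overflow.
double stable_bce_ll(double x, double t) {
  return x * (t - (x >= 0)) - std::log1p(std::exp(x - 2 * x * (x >= 0)));
}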
Scale

CPU version:

for (int n = 0; n < outer_dim_; ++n) {
  for (int d = 0; d < scale_dim_; ++d) {
    const Dtype factor = scale_data[d];
    caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);
    bottom_data += inner_dim_;
    top_data += inner_dim_;
  }
}

caffe_cpu_scale copies X into Y and scales it, i.e.:

for (int i = 0; i < N; ++i) {
  Y[i] = alpha * X[i];
}

GPU version (kernel body):

const int scale_index = (index / inner_dim) % scale_dim;
out[index] = in[index] * scale[scale_index];
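The host side launches one thread per element of the output blob (as recalled from scale_layer.cu; the parameter list is an assumption):

ScaleForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
    count, bottom_data, scale_data, scale_dim_, inner_dim_, top_data);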
BatchNorm

CPU version: the code is long; a detailed analysis is still pending.

GPU version: the logic matches the CPU version; the only difference is that every math call becomes its GPU counterpart, e.g. caffe_gpu_gemm in place of caffe_cpu_gemm.
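Pending that analysis, here is a condensed sketch of what the inference-mode forward computes. This is the math only, not the actual Caffe code, which implements the same thing with gemm/gemv calls over per-channel buffers:

// y = (x - mean_c) / sqrt(var_c + eps), applied per channel c
for (int n = 0; n < num; ++n) {
  for (int c = 0; c < channels; ++c) {
    const Dtype m = mean[c];
    const Dtype inv_std = Dtype(1) / std::sqrt(var[c] + eps);
    for (int s = 0; s < spatial_dim; ++s) {
      const int idx = (n * channels + c) * spatial_dim + s;
      top_data[idx] = (bottom_data[idx] - m) * inv_std;
    }
  }
}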