cuda yv12_to_rgb24

前言

　　项目需要将yv12转rgb24，由于基于x86平台，开始就没多想，直接用ipp加速实现了，后来在评估项目瓶颈的时候发现，1080p的视频每一帧转换居然要花8ms，刚好项目里有用到nvidia gtx960，因此就产生了直接用cuda实现一个yv12转rgb24的想法。

具体实施

　　我一向不喜欢造轮子，因此，第一步就是搜索有没有现成的代码。搜索了很久，包括opencv里都没找到yv12 to rgb24的，还好网上找到了一篇yv12 to argb的，我拿过来照着改改就ok了（包括代码风格及bug修复）。下面直接贴出代码，有任何疑问，可以留言讨论

#include "cuda.h"

#include "cuda_runtime.h"

#include "cuda_runtime_api.h"

#include <stdio.h>

// Width, in bits, of one colour component at the working precision used by the
// conversion kernel in this file (8-bit samples are shifted left by 2 to reach it).
#define COLOR_COMPONENT_BIT_SIZE 10

// Mask selecting the low 10 bits, i.e. one component of the width above.
#define COLOR_COMPONENT_MASK     0x3FF

// 3x3 colour matrix read by yuv2rgb(), stored row by row:
//   elements [0..2] produce red, [3..5] green, [6..8] blue;
//   within a row the coefficients multiply luma, Cb and Cr in that order.
// NOTE(review): the values look like the usual BT.601 studio-range
// coefficients (1.1644 = 255/219) — confirm against the video source.
__constant__ float const_hue_colorspace_mat[9]={1.1644f,0.0f,1.596f,1.1644f,-0.3918f,-0.813f,1.1644f,2.0172f,0.0f};

/**
 * Converts one YCbCr sample, expressed at 10-bit scale, to floating-point
 * red, green and blue using const_hue_colorspace_mat.
 *
 * yuvi   three consecutive ints: luma, Cb, Cr (each expected in 0..1023).
 * red, green, blue
 *        receive the results at the same 10-bit scale. The values are NOT
 *        clamped here and may fall outside 0..1023; the caller clamps.
 *
 * The luma gain stored in the matrix (1.1644 = 255/219) is only correct once
 * the studio-range black level has been removed, so the offset 16 (64 at
 * 10-bit scale) is subtracted from luma before the multiply. Without that
 * step black is rendered as dark grey and bright areas clip early.
 */
__device__ static void yuv2rgb(const int *yuvi, float *red, float *green,float *blue)
{
    const float luma_black_level = 64.0f;    // 16 << 2
    const float chroma_midpoint  = 512.0f;   // 128 << 2

    // Centre the components: luma relative to black, chroma relative to neutral.
    const float luma     = (float)yuvi[0] - luma_black_level;
    const float chromacb = (float)yuvi[1] - chroma_midpoint;
    const float chromacr = (float)yuvi[2] - chroma_midpoint;

    // One matrix row per output channel; columns are luma, Cb, Cr.
    const float *m = const_hue_colorspace_mat;

    *red   = (luma * m[0]) + (chromacb * m[1]) + (chromacr * m[2]);
    *green = (luma * m[3]) + (chromacb * m[4]) + (chromacr * m[5]);
    *blue  = (luma * m[6]) + (chromacb * m[7]) + (chromacr * m[8]);
}

/**
 * Converts a planar YV12 frame to packed 24-bit pixels; every thread handles
 * one 2x2 block of pixels that shares a single chroma sample pair.
 *
 * src        input frame: the luma plane (width * height bytes) first, then
 *            the V plane starting at width * height, then the U plane starting
 *            at width * height * 5 / 4. Chroma rows are width / 2 bytes apart.
 * dst        output image; each pixel is stored as three bytes in
 *            blue, green, red order.
 * width      frame width in pixels (also the luma row pitch).
 * height     frame height in pixels.
 * dst_pitch  distance in bytes between consecutive rows of dst.
 *
 * Expected launch: a 2D grid where thread (gx, gy) covers the pixels
 * (2*gx .. 2*gx+1, 2*gy .. 2*gy+1). A block of pixels whose right or bottom
 * pixel would lie outside the frame is skipped entirely, so a trailing odd
 * column or row is never written.
 */
__global__ void yv12torgb24_fourpixel(const unsigned char *src, unsigned char *dst, int width, int height, int dst_pitch)
{
    // Top-left pixel of the 2x2 block owned by this thread.
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
    const int y = (blockIdx.y * blockDim.y + threadIdx.y) << 1;

    // Nothing to do when the block would reach past the right or bottom edge.
    if (x + 1 >= width || y + 1 >= height)
        return;

    // Fetch the four luma samples (row-major within the block), widened to 10 bits.
    int luma10[4];
    for (int i = 0; i < 4; ++i)
    {
        const int px = x + (i & 1);
        const int py = y + (i >> 1);
        luma10[i] = src[py * width + px] << 2;
    }

    // Locate the shared chroma sample: V plane follows luma, U plane follows V.
    const unsigned int v_plane      = width * height;
    const unsigned int u_plane      = v_plane + (v_plane >> 2);
    const unsigned int chroma_pitch = width >> 1;
    const unsigned int chroma_index = (y >> 1) * chroma_pitch + (x >> 1);

    // yuvi = { luma, Cb, Cr } at 10-bit scale; chroma is common to all four pixels.
    int yuvi[3];
    yuvi[1] = src[u_plane + chroma_index] << 2;   // U
    yuvi[2] = src[v_plane + chroma_index] << 2;   // V

    for (int i = 0; i < 4; ++i)
    {
        yuvi[0] = luma10[i];

        float rgb[3];
        yuv2rgb(yuvi, &rgb[0], &rgb[1], &rgb[2]);

        // Destination of this pixel: row y or y+1, column x or x+1, 3 bytes each.
        unsigned char *out = dst + (y + (i >> 1)) * dst_pitch + (x + (i & 1)) * 3;

        // Store blue, green, red: clamp to the 10-bit range, then drop to 8 bits.
        for (int c = 0; c < 3; ++c)
        {
            const float clamped = ::fmin(::fmax(rgb[2 - c], 0.0f), 1023.f);
            out[c] = (((unsigned int)clamped) & 0x3ff) >> 2;
        }
    }
}

/**
 * Uploads a YV12 frame from host memory and converts it on the GPU with
 * yv12torgb24_fourpixel.
 *
 * src         host pointer to the YV12 frame; src_width * src_height * 3 / 2
 *             bytes are copied to a temporary device buffer.
 * dst         output buffer. It is handed to the kernel unchanged, so it must
 *             be memory the device can write (it is NOT copied back to the
 *             host here) and must hold at least src_height rows of dst_pitch
 *             bytes.
 * src_width   frame width in pixels.
 * src_height  frame height in pixels.
 * dst_pitch   bytes between consecutive output rows; must be at least
 *             3 * src_width.
 *
 * Returns true when the upload, the kernel launch and the kernel execution
 * all succeeded; false on invalid arguments or on any CUDA error (the error
 * text is printed to stderr). The function returns only after the kernel
 * has finished.
 */
bool yv12_to_rgb24(unsigned char *src, unsigned char *dst,int src_width,int src_height, int dst_pitch)
{
    // Reject arguments that would make the kernel read or write out of bounds.
    if (src == NULL || dst == NULL || src_width <= 0 || src_height <= 0)
        return false;
    if ((long long)dst_pitch < 3LL * src_width)
        return false;

    const size_t src_mem_size = sizeof(unsigned char) * src_width * src_height * 3 / 2;

    // Each thread converts a 2x2 pixel block, so one thread block spans
    // (2 * block.x) x (2 * block.y) pixels; round the grid up to cover the frame.
    const dim3 block(32, 8);
    const dim3 grid((src_width  + 2 * block.x - 1) / (2 * block.x),
                    (src_height + 2 * block.y - 1) / (2 * block.y));

    unsigned char *d_src = NULL;
    cudaError_t err = cudaMalloc((void**)&d_src, src_mem_size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "yv12_to_rgb24: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return false;
    }

    err = cudaMemcpy(d_src, src, src_mem_size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        yv12torgb24_fourpixel<<<grid, block>>>(d_src, dst, src_width, src_height, dst_pitch);
        err = cudaGetLastError();            // launch-configuration errors
    }

    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();       // errors raised while the kernel ran

    // Always release the staging buffer, but report the first failure seen.
    const cudaError_t free_err = cudaFree(d_src);
    if (err == cudaSuccess)
        err = free_err;

    if (err != cudaSuccess)
    {
        fprintf(stderr, "yv12_to_rgb24: %s\n", cudaGetErrorString(err));
        return false;
    }

    return true;
}

总结

经过cuda加速后的转换能够在1ms左右完成，还是比较理想的^_^

完！

2016年8月

cuda yv12_to_rgb24的更多相关文章

CUDA[2] Hello,World
Section 0:Hello,World 这次我们亲自尝试一下如何用粗(CU)大(DA)写程序 CUDA最新版本是7.5,然而即使是最新版本也不兼容VS2015 ...推荐使用VS2012 进入VS ...
CUDA[1] Introductory
Section 0 :Induction of CUDA CUDA是啥?CUDA®: A General-Purpose Parallel Computing Platform and Program ...
Couldn't open CUDA library cublas64_80.dll etc. tensorflow-gpu on windows
I c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\dso_load ...
ubuntu 16.04 + N驱动安装＋CUDA+Qt5 + opencv
Nvidia driver installation(after download XX.run installation file) 1. ctrl+Alt+F1 //go to virtual ...
手把手教你搭建深度学习平台——避坑安装theano+CUDA
python有多混乱我就不多说了.这个混论不仅是指整个python市场混乱,更混乱的还有python的各种附加依赖包.为了一劳永逸解决python的各种依赖包对深度学习造成的影响,本文中采用pytho ...
[CUDA] CUDA to DL
又是一枚祖国的骚年,阅览做做笔记:http://www.cnblogs.com/neopenx/p/4643705.html 这里只是一些基础知识.帮助理解DL tool的实现. “这也是深度学习带来 ...
基于Ubuntu14.04系统的nvidia tesla K40驱动和cuda 7.5安装笔记
基于Ubuntu14.04系统的nvidia tesla K40驱动和cuda 7.5安装笔记飞翔的蜘蛛人注1:本人新手,文章中不准确的地方,欢迎批评指正注2:知识储备应达到Linux入门级水平 ...
CUDA程序设计(一)
为什么需要GPU 几年前我启动并主导了一个项目,当时还在谷歌,这个项目叫谷歌大脑.该项目利用谷歌的计算基础设施来构建神经网络. 规模大概比之前的神经网络扩大了一百倍,我们的方法是用约一千台电脑.这确实 ...
使用 CUDA范例精解通用GPU编程配套程序的方法
用vs新建一个cuda的项目,然后将系统自动生成的那个.cu里头的内容,除了头文件引用外,全部替代成先有代码的内容. 然后程序就能跑了. 因为新建的是cuda的项目,所以所有的头文件和库的引用系统都会 ...

随机推荐

剑指offer-跳台阶08
题目描述一只青蛙一次可以跳上1级台阶,也可以跳上2级.求该青蛙跳上一个n级的台阶总共有多少种跳法(先后次序不同算不同的结果). class Solution: def jumpFloor(self, ...
关于iframe的使用以及自适应页面高度
1. <a href="port" target="frame_view">港口资料</a> <iframe id="e ...
linux备忘录-shell脚本
知识 shell执行方式 shell执行方式有通过source或. 在现在的bash环境中执行脚本变量等会保持通过bash shell.sh或sh shell.sh 使用一个新的bash环境执行 ...
URAL 1741 Communication Fiend（最短路径）
Description Kolya has returned from a summer camp and now he's a real communication fiend. He spends ...
Could not resolve com.android.support.constraint:constraint-layout:1.1.3.
原文地址: http://fanjiajia.cn/2018/09/25/Android%20Studio%20Could%20not%20resolve%20com.android.support. ...
C#中的Stack的Peek操作，曝出异常
我们在遍历一个栈的时候,有时候需要判断栈顶元素,用到了Peek元素,然后再用Pop元素,但是这个时候会出现一个逻辑错误, 当用Pop删除全部栈的元素时,再用Peek就会报错, InvalidOpera ...
Mybatis学习系列（七）缓存机制
Mybatis缓存介绍 MyBatis提供一级缓存和二级缓存机制. 一级缓存是Sqlsession级别的缓存,Sqlsession类的实例对象中有一个hashmap用于缓存数据.不同的Sqlsessi ...
JMS实战——ActiveMQ实现Pub-Sub
前言上篇博客<JMS实战--ActiveMQ>介绍了ActiveMQ的安装,并实现了简单的PTP模型.这篇博客我们来看一下Pub-Sub模型,之后来总结一下JMS. 实现项目结构其中 ...
【SSH】——Hibernate三种状态之间的转化
Hibernate的三种状态为:transient.persistent和detached.对这三种状态的理解可以结合Session缓存,在Session缓存中的状态为persistent,另外两种不 ...
【工具学习】——Maven的安装与配置
[含义] 什么是构建? 构建,英文build.构建包括编译.运行.生成文档.打包.部署等等工作内容,如果我们每天手工去干这些事情,那会浪费很多的时间.因此,构建管理工具应运而生. maven,作为项目 ...

cuda yv12_to_rgb24

前言

具体实施

总结

cuda yv12_to_rgb24的更多相关文章

随机推荐

热门专题