1. 初始化

import tensorrt as trt

import pycuda.driver as cuda

import pycuda.autoinit    # 此句代码中未使用，但是必须有。this is useful, otherwise stream = cuda.Stream() will cause 'explicit_context_dependent failed: invalid device context - no currently active context?'

如注解所示，import pycuda.autoinit这句话程序中未使用，但是必须包含，否则程序运行会出错。

2. 保存onnx模型

def saveONNX(model, filepath, c, h, w):
    """Export ``model`` to an ONNX file at ``filepath``.

    The model is moved to the GPU and exported using one random input of
    shape ``(1, c, h, w)`` that is created directly on the CUDA device.
    """
    gpu_model = model.cuda()
    sample_shape = (1, c, h, w)
    sample_input = torch.randn(*sample_shape, device='cuda')
    torch.onnx.export(gpu_model, sample_input, filepath, verbose=True)

3. 创建tensorrt引擎

def build_engine(onnx_file_path):
    """Build a TensorRT engine from an ONNX model file.

    Args:
        onnx_file_path: Path of the ONNX model file to parse.

    Returns:
        The engine returned by ``builder.build_cuda_engine``, or ``None`` if
        the ONNX parser rejects the model (the parser errors are printed
        before returning).
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)   # switch to INFO for more output
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network() as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        # Purely informational: report which reduced precisions are fast here.
        if builder.platform_has_fast_fp16:
            print('this card support fp16')
        if builder.platform_has_fast_int8:
            print('this card support int8')
        builder.max_workspace_size = 1 << 30
        with open(onnx_file_path, 'rb') as model:
            parsed = parser.parse(model.read())
        if not parsed:
            # parse() reports failure through its return value; surface the
            # recorded errors instead of building from an incomplete network.
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None
        return builder.build_cuda_engine(network)

def build_engine_int8(onnx_file_path, calib):
    """Build an INT8 TensorRT engine from an ONNX model file.

    Args:
        onnx_file_path: Path of the ONNX model file to parse.
        calib: Calibrator object assigned to ``builder.int8_calibrator``.

    Returns:
        The engine returned by ``builder.build_cuda_engine``, or ``None`` if
        the ONNX parser rejects the model (the parser errors are printed
        before returning).
    """
    TRT_LOGGER = trt.Logger()
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network() as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        # The builder batch size is fixed to 1 here rather than taken from
        # calib.get_batch_size(). Note that inference batch size is
        # independent of calibration batch size in general.
        builder.max_batch_size = 1
        builder.max_workspace_size = 1 << 30
        builder.int8_mode = True
        builder.int8_calibrator = calib
        with open(onnx_file_path, 'rb') as model:
            parsed = parser.parse(model.read())
        if not parsed:
            # parse() reports failure through its return value; surface the
            # recorded errors instead of building from an incomplete network.
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None
        return builder.build_cuda_engine(network)

4. 保存及载入引擎

def save_engine(engine, engine_dest_path):
    """Serialize ``engine`` and write the result to ``engine_dest_path``.

    The destination is opened in binary write mode, so an existing file at
    that path is overwritten.
    """
    serialized = engine.serialize()
    with open(engine_dest_path, 'wb') as sink:
        sink.write(serialized)

def load_engine(engine_path):
    """Read the file at ``engine_path`` and deserialize it into an engine.

    Returns whatever ``trt.Runtime.deserialize_cuda_engine`` returns for the
    file's bytes.
    """
    logger = trt.Logger(trt.Logger.WARNING)  # switch to INFO for more output
    with open(engine_path, 'rb') as engine_file:
        with trt.Runtime(logger) as rt:
            blob = engine_file.read()
            return rt.deserialize_cuda_engine(blob)

5. 分配缓冲区

class HostDeviceMem(object):
    """Pair a host-side buffer with its device-side counterpart."""

    def __init__(self, host_mem, device_mem):
        # Stored as given; no copy or validation is performed.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        """Return a multi-line description showing both buffers."""
        parts = ("Host:", str(self.host), "Device:", str(self.device))
        return "\n".join(parts)

    def __repr__(self):
        """Use the same text as ``__str__``."""
        return self.__str__()

def allocate_buffers(engine):
    """Allocate host and device buffers for every binding of ``engine``.

    For each binding a page-locked host array and a device allocation of the
    same byte size are created. The element count is the volume of the
    binding shape multiplied by ``engine.max_batch_size``.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
        device allocations converted to ``int`` in binding order, and
        ``stream`` is a new ``cuda.Stream``.
    """
    input_bufs, output_bufs, device_ptrs = [], [], []
    stream = cuda.Stream()
    for name in engine:
        np_dtype = trt.nptype(engine.get_binding_dtype(name))
        n_elems = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
        # Page-locked host memory paired with a device buffer of equal size.
        pinned = cuda.pagelocked_empty(n_elems, np_dtype)
        gpu_buf = cuda.mem_alloc(pinned.nbytes)
        device_ptrs.append(int(gpu_buf))
        # Route the pair to the input or output list.
        target = input_bufs if engine.binding_is_input(name) else output_bufs
        target.append(HostDeviceMem(pinned, gpu_buf))
    return input_bufs, output_bufs, device_ptrs, stream

6. 前向推断

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host outputs.

    Args:
        context: Execution context whose ``execute_async`` is called.
        bindings: Binding list passed to ``execute_async``.
        inputs: Objects with ``host`` and ``device`` attributes; each host
            buffer is copied to its device buffer before execution.
        outputs: Objects with ``host`` and ``device`` attributes; each device
            buffer is copied back to its host buffer after execution.
        stream: CUDA stream on which the copies and the execution are queued.
        batch_size: Batch size passed to ``execute_async``.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Everything above was only queued on the stream; wait for it to finish
    # before the host buffers are read.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

7. 校准（Calibrator）

使用tensorrt的int8时，需要进行校准。具体可参见test_onnx_int8及calibrator.py。

8. 具体的推断代码

# Flatten the image and convert it to float32 before handing it to the engine.
img_numpy = img.ravel().astype(np.float32)

# Copy the flattened image into the host buffer of the first input.
np.copyto(inputs[0].host, img_numpy)

# Run inference with the default batch_size; returns the host output buffers.
output = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

# `(10)` is the plain integer 10, so each output becomes a 1-D array of 10
# elements (presumably one score per class -- confirm against the model).
output = [np.reshape(stage_i, (10)) for stage_i in output]  # iterate when there are multiple outputs

9. 代码分析

程序中主要包括下面6个函数。

test_pytorch()            # 测试pytorch模型的代码

export_onnx()             # 导出pytorch模型到onnx模型

test_onnx_fp32()          # 测试tensorrt的fp32模型（有保存引擎的代码）

test_onnx_fp32_engine()   # 测试tensorrt的fp32引擎的代码

test_onnx_int8()          # 测试tensorrt的int8模型（有保存引擎的代码）

test_onnx_int8_engine()   # 测试tensorrt的int8引擎的代码

10. 说明

9的部分函数中，最开始有一句：

torch.load('mnist_cnn_3.pth') # 如果结果不对，加上这句话

因为有时候会碰到，不使用这句话，直接运行代码时，结果完全不正确;加上这句话之后，结果正确了。

具体原因未找到。。。也就先记在这里吧。

（原）pytorch中使用TensorRT的更多相关文章

（原）CNN中的卷积、1x1卷积及在pytorch中的验证
转载请注明出处: http://www.cnblogs.com/darkknightzh/p/9017854.html 参考网址: https://pytorch.org/docs/stable/nn ...
pytorch中tensorboardX的用法
在代码中改好存储Log的路径命令行中输入 tensorboard --logdir /home/huihua/NewDisk1/PycharmProjects/pytorch-deeplab-xce ...
Pytorch中RoI pooling layer的几种实现
Faster-RCNN论文中在RoI-Head网络中,将128个RoI区域对应的feature map进行截取,而后利用RoI pooling层输出7*7大小的feature map.在pytorch ...
[转载]PyTorch中permute的用法
[转载]PyTorch中permute的用法来源:https://blog.csdn.net/york1996/article/details/81876886 permute(dims) 将ten ...
Pytorch中的自编码(autoencoder)
Pytorch中的自编码(autoencoder) 本文资料来源:https://www.bilibili.com/video/av15997678/?p=25 什么是自编码先压缩原数据.提取出最有 ...
pytorch中网络特征图(feture map)、卷积核权重、卷积核最匹配样本、类别激活图(Class Activation Map/CAM)、网络结构的可视化方法
目录 0,可视化的重要性: 1,特征图(feture map) 2,卷积核权重 3,卷积核最匹配样本 4,类别激活图(Class Activation Map/CAM) 5,网络结构的可视化 0,可视 ...
C++primer原书中的一个错误（派生类using声明对基类权限的影响）
在C++primer 第4版的 15章 15.2.5中有以下这样一段提示: "注解:派生类能够恢复继承成员的訪问级别,但不能使訪问级别比基类中原来指定的更严格或者更宽松." 在vs ...
PyTorch官方中文文档：PyTorch中文文档
PyTorch中文文档 PyTorch是使用GPU和CPU优化的深度学习张量库. 说明自动求导机制 CUDA语义扩展PyTorch 多进程最佳实践序列化语义 Package参考 torch to ...
Java原子类中CAS的底层实现
Java原子类中CAS的底层实现从Java到c++到汇编, 深入讲解cas的底层原理. 介绍原理前, 先来一个Demo 以AtomicBoolean类为例.先来一个调用cas的demo. 主线程在f ...

随机推荐

django之分页器、多对多关系、form校验组件
批量插入数据 bulk_create # 1.往书籍表中插入数据 1000 # for i in range(1000): # 这种插入方式效率极低 # models.Book.objects.cr ...
scp、rsync、xsync
scp. 拷贝完全相同 scp -r etc/hadoop/dfs.hosts root@192.168.121.134:/usr/local/hadoop/hadoop-2.7.6/etc/hado ...
了解html标签
<title></title> 1.网页标题 2.当我们收藏网页时,默认标题就是网页标题 3.seo(搜索引擎优化) <h1></h1>~<h6& ...
GDB 调试C++
原来比较熟悉用gdb调试C程序,没有用过gdb调试C++程序,原理上没有什么区别.在形式上有一些区别,因为C++支持名字空间和class等机制,把函数的可见域做了隔离. 拿envoy的代码作个例子: ...
[POI2014]RAJ（最短路，拓扑排序）
对于一个点 \(x\) 如何求答案? 由于这个图是个有向无环图,可以先拓扑排序一遍,求出每个点的拓扑序,从起点到它的最长路 \(d2\),从它到终点的最长路 \(d1\).(我写代码是这么写的,注意顺 ...
[LeetCode] 903. Valid Permutations for DI Sequence DI序列的有效排列
We are given S, a length n string of characters from the set {'D', 'I'}. (These letters stand for &q ...
vue中子组件的methods中获取到props中的值
这个官网很清楚,也很简单,父组件中使用v-bind绑定传送,子组件使用props接收即可例如: 父组件中 <template> <div> <head-top>& ...
推荐一款移动端天气App即刻天气
推荐一款移动端天气App即刻天气一应用描述即刻天气预报是一个提供全国各城市15日天气预报和空气质量的APP,包含全国3000个城市天气预报,3万个乡镇天气,15日及48小时空气质量预报,是万千用 ...
Laravel框架下路由的使用（源码解析）
本篇文章给大家带来的内容是关于Laravel框架下路由的使用(源码解析),有一定的参考价值,有需要的朋友可以参考一下,希望对你有所帮助. 前言我的解析文章并非深层次多领域的解析攻略.但是参考着开发文 ...
【MySQL】Mariadb字符集
Mariadb字符集如果不设置字符集,可以查看mariadb的字符集的默认设置是latin1. 如下命令,查看Mariadb的默认配置: [root@oradb ~]# /usr/local/mys ...

（原）pytorch中使用TensorRT