1. Decide on the input and output of the inference service and on the exporter signature. (This example uses a classification signature: the input is a byte array and the output is a float array.)
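For reference, the exporter side has to attach this classification signature to the graph. Below is a minimal sketch using the legacy session_bundle Exporter API from the TensorFlow Serving tutorials of that era; the toy graph, tensor names, export path, and version number are illustrative assumptions, not part of the original setup.

# Minimal export sketch (assumed legacy session_bundle Exporter API; toy graph for illustration).
import tensorflow as tf
from tensorflow_serving.session_bundle import exporter

IMAGE_DATA_SIZE = 107 * 107 * 3  # must match kImageDataSize in the server
NUM_LABELS = 2                   # must match kNumLabels in the server

# Toy graph: a float input (filled by the server from the request bytes) and float scores.
x = tf.placeholder(tf.float32, [None, IMAGE_DATA_SIZE], name="input")
w = tf.Variable(tf.zeros([IMAGE_DATA_SIZE, NUM_LABELS]))
b = tf.Variable(tf.zeros([NUM_LABELS]))
y = tf.nn.softmax(tf.matmul(x, w) + b, name="scores")

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    # ... training would happen here ...
    saver = tf.train.Saver(sharded=True)
    model_exporter = exporter.Exporter(saver)
    signature = exporter.classification_signature(input_tensor=x, scores_tensor=y)
    model_exporter.init(sess.graph.as_graph_def(), default_graph_signature=signature)
    model_exporter.export("/tmp/lenovo_model", tf.constant(1), sess)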

2. Write the proto file

syntax = "proto3";

option java_package = "com.lenovo.tensorflow";
option java_outer_classname = "ServingApiProtocol";
option java_generic_services = true;
option java_generate_equals_and_hash = true;

package com.lenovo;

message ClassificationRequest {
    bytes imageData = 1;
}

message ClassificationResponse {
    repeated float score = 1;
}

service ClassificationService {
    rpc classify(ClassificationRequest) returns (ClassificationResponse) {}
}

3. Generate the stub code (here the client is Java and the server is C++)

See the documentation at grpc.io for the detailed steps.
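As a rough guide, the Java client stubs can be generated with protoc and the grpc-java codegen plugin; the plugin path and output directory below are placeholders. The C++ server-side stubs are produced later by the Bazel rule in step 5 (cc_grpc_version = 1), so no manual protoc step is needed for them.

protoc --proto_path=. \
       --java_out=client/src/main/java \
       --plugin=protoc-gen-grpc-java=/path/to/protoc-gen-grpc-java \
       --grpc-java_out=client/src/main/java \
       serving-api.proto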

4. Write the inference C++ code

/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// A gRPC server that classifies images.
// Given a request containing raw image bytes, the server responds with one
// float score per label (two labels in this example).
// The classification is done by running the image data through a network
// trained and exported with the classification signature from step 1.
// The server constantly monitors a file system storage path for models.
// Whenever a new version of the model is available, it eagerly unloads the
// older version before loading the new one. The server also batches multiple
// requests together and does batched inference for efficiency.
// The intention of this example is to demonstrate usage of DynamicManager,
// VersionPolicy and BasicBatchScheduler.
#include <stddef.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "grpc++/completion_queue.h"
#include "grpc++/security/server_credentials.h"
#include "grpc++/server.h"
#include "grpc++/server_builder.h"
#include "grpc++/server_context.h"
#include "grpc++/support/async_unary_call.h"
#include "grpc++/support/status.h"
#include "grpc++/support/status_code_enum.h"
#include "grpc/grpc.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow_serving/batching/basic_batch_scheduler.h"
#include "tensorflow_serving/batching/batch_scheduler.h"
#include "tensorflow_serving/core/manager.h"
#include "tensorflow_serving/core/servable_handle.h"
#include "tensorflow_serving/core/servable_id.h"
#include "tensorflow_serving/lenovo/serving-api.pb.h"
#include "tensorflow_serving/lenovo/serving-api.grpc.pb.h"
#include "tensorflow_serving/servables/tensorflow/simple_servers.h"
#include "tensorflow_serving/session_bundle/manifest.pb.h"
#include "tensorflow_serving/session_bundle/session_bundle.h"
#include "tensorflow_serving/session_bundle/signature.h"
using grpc::InsecureServerCredentials;
using grpc::Server;
using grpc::ServerAsyncResponseWriter;
using grpc::ServerBuilder;
using grpc::ServerContext;
using grpc::ServerCompletionQueue;
using grpc::Status;
using grpc::StatusCode;
using tensorflow::string;
using tensorflow::Tensor;
using tensorflow::serving::ClassificationSignature;
using com::lenovo::ClassificationRequest;
using com::lenovo::ClassificationResponse;
using com::lenovo::ClassificationService;
namespace {
const int kImageSize = 107;
const int kNumChannels = 3;
const int kImageDataSize = kImageSize * kImageSize * kNumChannels;
const int kNumLabels = 2;
class LenovoServiceImpl;
// Class encompassing the state and logic needed to serve a request.
class CallData {
 public:
  CallData(LenovoServiceImpl* service_impl,
           ClassificationService::AsyncService* service,
           ServerCompletionQueue* cq);
  void Proceed();
  void Finish(Status status);
  const ClassificationRequest& request() { return request_; }
  ClassificationResponse* mutable_response() { return &response_; }
 private:
  // Service implementation.
  LenovoServiceImpl* service_impl_;
  // The means of communication with the gRPC runtime for an asynchronous
  // server.
  ClassificationService::AsyncService* service_;
  // The producer-consumer queue for asynchronous server notifications.
  ServerCompletionQueue* cq_;
  // Context for the rpc, allowing to tweak aspects of it such as the use
  // of compression, authentication, as well as to send metadata back to the
  // client.
  ServerContext ctx_;
  // What we get from the client.
  ClassificationRequest request_;
  // What we send back to the client.
  ClassificationResponse response_;
  // The means to get back to the client.
  ServerAsyncResponseWriter<ClassificationResponse> responder_;
  // Let's implement a tiny state machine with the following states.
  enum CallStatus { CREATE, PROCESS, FINISH };
  CallStatus status_;  // The current serving state.
};
// A Task holds all of the information for a single inference request.
struct Task : public tensorflow::serving::BatchTask {
  ~Task() override = default;
  size_t size() const override { return 1; }
  Task(CallData* calldata_arg)
      : calldata(calldata_arg) {}
  CallData* calldata;
};
class LenovoServiceImpl final {
 public:
  LenovoServiceImpl(const string& servable_name,
                   std::unique_ptr<tensorflow::serving::Manager> manager);
  void Classify(CallData* call_data);
  // Produces classifications for a batch of requests and associated responses.
  void DoClassifyInBatch(
      std::unique_ptr<tensorflow::serving::Batch<Task>> batch);
  // Name of the servable to use for inference.
  const string servable_name_;
  // Manager in charge of loading and unloading servables.
  std::unique_ptr<tensorflow::serving::Manager> manager_;
  // A scheduler for batching multiple request calls into single calls to
  // Session->Run().
  std::unique_ptr<tensorflow::serving::BasicBatchScheduler<Task>>
      batch_scheduler_;
};
// Take in the "service" instance (in this case representing an asynchronous
// server) and the completion queue "cq" used for asynchronous communication
// with the gRPC runtime.
CallData::CallData(LenovoServiceImpl* service_impl,
                   ClassificationService::AsyncService* service,
                   ServerCompletionQueue* cq)
    : service_impl_(service_impl),
      service_(service), cq_(cq), responder_(&ctx_), status_(CREATE) {
  // Invoke the serving logic right away.
  Proceed();
}
void CallData::Proceed() {
  if (status_ == CREATE) {
    // As part of the initial CREATE state, we *request* that the system
    // start processing Classify requests. In this request, "this" acts as
    // the tag uniquely identifying the request (so that different CallData
    // instances can serve different requests concurrently), in this case
    // the memory address of this CallData instance.
    service_->Requestclassify(&ctx_, &request_, &responder_, cq_, cq_, this);
    // Make this instance progress to the PROCESS state.
    status_ = PROCESS;
  } else if (status_ == PROCESS) {
    // Spawn a new CallData instance to serve new clients while we process
    // the one for this CallData. The instance will deallocate itself as
    // part of its FINISH state.
    new CallData(service_impl_, service_, cq_);
    // Start processing.
    service_impl_->Classify(this);
  } else {
    GPR_ASSERT(status_ == FINISH);
    // Once in the FINISH state, deallocate ourselves (CallData).
    delete this;
  }
}
void CallData::Finish(Status status) {
  status_ = FINISH;
  responder_.Finish(response_, status, this);
}
LenovoServiceImpl::LenovoServiceImpl(
    const string& servable_name,
    std::unique_ptr<tensorflow::serving::Manager> manager)
    : servable_name_(servable_name), manager_(std::move(manager)) {
  // Setup a batcher used to combine multiple requests (tasks) into a single
  // graph run for efficiency.
  // The batcher queues tasks until,
  //  (a) the next task would cause the batch to exceed the size target;
  //  (b) waiting for more tasks to be added would exceed the timeout.
  // at which point it processes the entire batch.
  //
  // Use the default batch-size, timeout and thread options.  In general
  // the numbers are extremely performance critical and should be tuned based
  // specific graph structure and usage.
  tensorflow::serving::BasicBatchScheduler<Task>::Options scheduler_options;
  scheduler_options.thread_pool_name = "mnist_service_batch_threads";
  // Keep the queue of pending batches small: once this limit is reached, new
  // requests are rejected immediately instead of piling up. (A production
  // server may want to tune this together with its load-balancing setup.)
  scheduler_options.max_enqueued_batches = 1;
  TF_CHECK_OK(tensorflow::serving::BasicBatchScheduler<Task>::Create(
      scheduler_options,
      [this](std::unique_ptr<tensorflow::serving::Batch<Task>> batch) {
        this->DoClassifyInBatch(std::move(batch));
      },
      &batch_scheduler_));
}
// Creates a gRPC Status from a TensorFlow Status.
Status ToGRPCStatus(const tensorflow::Status& status) {
  return Status(static_cast<grpc::StatusCode>(status.code()),
                status.error_message());
}
// WARNING(break-tutorial-inline-code): The following code snippet is
// in-lined in tutorials, please update tutorial documents accordingly
// whenever code changes.
void LenovoServiceImpl::Classify(CallData* calldata) {
  // Verify input.
  /** FIXME
  if (calldata->request().image_data_size() != kImageDataSize) {
    calldata->Finish(
        Status(StatusCode::INVALID_ARGUMENT,
               tensorflow::strings::StrCat(
                   "expected image_data of size ", kImageDataSize,
                   ", got ", calldata->request().image_data_size())));
    return;
  }
  */
  // Create and submit a task to the batch scheduler.
  std::unique_ptr<Task> task(new Task(calldata));
  tensorflow::Status status = batch_scheduler_->Schedule(&task);
  if (!status.ok()) {
    calldata->Finish(ToGRPCStatus(status));
    return;
  }
}
// Produces classifications for a batch of requests and associated responses.
void LenovoServiceImpl::DoClassifyInBatch(
    std::unique_ptr<tensorflow::serving::Batch<Task>> batch) {
  batch->WaitUntilClosed();
  if (batch->empty()) {
    return;
  }
  const int batch_size = batch->num_tasks();
  // Replies to each task with the given error status.
  auto complete_with_error = [&batch](StatusCode code, const string& msg) {
    Status status(code, msg);
    for (int i = 0; i < batch->num_tasks(); i++) {
      Task* task = batch->mutable_task(i);
      task->calldata->Finish(status);
    }
  };
  // Get a handle to the SessionBundle.  The handle ensures the Manager does
  // not reload this while it is in use.
  // WARNING(break-tutorial-inline-code): The following code snippet is
  // in-lined in tutorials, please update tutorial documents accordingly
  // whenever code changes.
  auto handle_request =
      tensorflow::serving::ServableRequest::Latest(servable_name_);
  tensorflow::serving::ServableHandle<tensorflow::serving::SessionBundle>
      bundle;
  const tensorflow::Status lookup_status =
      manager_->GetServableHandle(handle_request, &bundle);
  if (!lookup_status.ok()) {
    complete_with_error(StatusCode::INTERNAL,
                        lookup_status.error_message());
    return;
  }
  // Get the default signature of the graph.  Expected to be a
  // classification signature.
  tensorflow::serving::ClassificationSignature signature;
  const tensorflow::Status signature_status =
      GetClassificationSignature(bundle->meta_graph_def, &signature);
  if (!signature_status.ok()) {
    complete_with_error(StatusCode::INTERNAL,
                        signature_status.error_message());
    return;
  }
  // Transform protobuf input to inference input tensor.
  // The input layout must match the exporter signature from step 1.
  // WARNING(break-tutorial-inline-code): The following code snippet is
  // in-lined in tutorials, please update tutorial documents accordingly
  // whenever code changes.
  Tensor input(tensorflow::DT_FLOAT, {batch_size, kImageDataSize});
  auto dst = input.flat_outer_dims<float>().data();
  for (int i = 0; i < batch_size; ++i) {
    std::copy_n(
        batch->mutable_task(i)->calldata->request().imagedata().begin(),
        kImageDataSize, dst);
    dst += kImageDataSize;
  }
  // Run classification.
  tensorflow::Tensor scores;
  const tensorflow::Status run_status =
      RunClassification(signature, input, bundle->session.get(),
                        nullptr /* classes */, &scores);
  if (!run_status.ok()) {
    complete_with_error(StatusCode::INTERNAL, run_status.error_message());
    return;
  }
  if (scores.dtype() != tensorflow::DT_FLOAT) {
    complete_with_error(
        StatusCode::INTERNAL,
        tensorflow::strings::StrCat(
            "Expected output Tensor of DT_FLOAT.  Got: ",
            tensorflow::DataType_Name(scores.dtype())));
    return;
  }
  if (scores.dim_size(1) != kNumLabels) {
    complete_with_error(
        StatusCode::INTERNAL,
        tensorflow::strings::StrCat(
            "Expected ", kNumLabels, " labels in each output.  Got: ",
            scores.dim_size(1)));
    return;
  }
  // Transform inference output tensor to protobuf output.
  // The score layout matches the exporter signature from step 1.
  const auto& scores_mat = scores.matrix<float>();
  for (int i = 0; i < batch_size; ++i) {
    auto calldata = batch->mutable_task(i)->calldata;
    for (int c = 0; c < scores.dim_size(1); ++c) {
      calldata->mutable_response()->add_score(scores_mat(i, c));
    }
    calldata->Finish(Status::OK);
  }
}
void HandleRpcs(LenovoServiceImpl* service_impl,
                ClassificationService::AsyncService* service,
                ServerCompletionQueue* cq) {
  // Spawn a new CallData instance to serve new clients.
  new CallData(service_impl, service, cq);
  void* tag;  // uniquely identifies a request.
  bool ok;
  while (true) {
    // Block waiting to read the next event from the completion queue. The
    // event is uniquely identified by its tag, which in this case is the
    // memory address of a CallData instance.
    cq->Next(&tag, &ok);
    GPR_ASSERT(ok);
    static_cast<CallData*>(tag)->Proceed();
  }
}
// Runs the classification server until shutdown.
void RunServer(const int port, const string& servable_name,
               std::unique_ptr<tensorflow::serving::Manager> manager) {
  // "0.0.0.0" is the way to listen on localhost in gRPC.
  const string server_address = "0.0.0.0:" + std::to_string(port);
  ClassificationService::AsyncService service;
  ServerBuilder builder;
  std::shared_ptr<grpc::ServerCredentials> creds = InsecureServerCredentials();
  builder.AddListeningPort(server_address, creds);
  builder.RegisterService(&service);
  std::unique_ptr<ServerCompletionQueue> cq = builder.AddCompletionQueue();
  std::unique_ptr<Server> server(builder.BuildAndStart());
  LOG(INFO) << "Running...";
  LenovoServiceImpl service_impl(servable_name, std::move(manager));
  HandleRpcs(&service_impl, &service, cq.get());
}
}  // namespace
int main(int argc, char** argv) {
  // Parse command-line options.
  tensorflow::int32 port = 0;
  const bool parse_result =
      tensorflow::ParseFlags(&argc, argv, {tensorflow::Flag("port", &port)});
  if (!parse_result) {
    LOG(FATAL) << "Error parsing command line flags.";
  }
  if (argc != 2) {
    LOG(FATAL) << "Usage: lenovo_inference --port=9000 /path/to/exports";
  }
  const string export_base_path(argv[1]);
  tensorflow::port::InitMain(argv[0], &argc, &argv);
  // WARNING(break-tutorial-inline-code): The following code snippet is
  // in-lined in tutorials, please update tutorial documents accordingly
  // whenever code changes.
  std::unique_ptr<tensorflow::serving::Manager> manager;
  tensorflow::Status status = tensorflow::serving::simple_servers::
      CreateSingleTFModelManagerFromBasePath(export_base_path, &manager);
  TF_CHECK_OK(status) << "Error creating manager";
  // Wait until at least one model is loaded.
  std::vector<tensorflow::serving::ServableId> ready_ids;
  // TODO(b/25545573): Create a more streamlined startup mechanism than polling.
  do {
    LOG(INFO) << "Waiting for models to be loaded...";
    tensorflow::Env::Default()->SleepForMicroseconds(1 * 1000 * 1000 /*1 sec*/);
    ready_ids = manager->ListAvailableServableIds();
  } while (ready_ids.empty());
  // Run the service.
  RunServer(port, ready_ids[0].name, std::move(manager));
  return 0;
}

5. Write the Bazel BUILD file in the source directory

# Description: TensorFlow Serving examples.

package(
    default_visibility = ["//tensorflow_serving:internal"],
    features = [
        "-parse_headers",
        "no_layering_check",
    ],
)

licenses(["notice"])  # Apache 2.0

exports_files(["LICENSE"])

load("//tensorflow_serving:serving.bzl", "serving_proto_library")

filegroup(
    name = "all_files",
    srcs = glob(
        ["**/*"],
        exclude = [
            "**/METADATA",
            "**/OWNERS",
        ],
    ),
)

serving_proto_library(
    name = "lenovo_inference_proto",
    srcs = ["serving-api.proto"],
    has_services = 1,
    cc_api_version = 2,
    cc_grpc_version = 1,
)

cc_binary(
    name = "lenovo_inference",
    srcs = [
        "lenovo_inference.cc",
    ],
    linkopts = ["-lm"],
    deps = [
        ":lenovo_inference_proto",
        "//tensorflow_serving/batching:basic_batch_scheduler",
        "//tensorflow_serving/batching:batch_scheduler",
        "//tensorflow_serving/core:manager",
        "//tensorflow_serving/core:servable_handle",
        "//tensorflow_serving/core:servable_id",
        "//tensorflow_serving/servables/tensorflow:simple_servers",
        "//tensorflow_serving/session_bundle",
        "//tensorflow_serving/session_bundle:manifest_proto",
        "//tensorflow_serving/session_bundle:signature",
        "@grpc//:grpc++",
        "@org_tensorflow//tensorflow/core:framework",
        "@org_tensorflow//tensorflow/core:lib",
        "@org_tensorflow//tensorflow/core:protos_all_cc",
        "@org_tensorflow//tensorflow/core:tensorflow",
    ],
)

6. Build the code

bazel build tensorflow_serving/lenovo/...

7. Run the server

./bazel-bin/tensorflow_serving/lenovo/lenovo_inference --port=<server port> <export model path>
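The actual client in this setup is Java, but for a quick smoke test the same proto can also be compiled for Python with grpcio-tools. A minimal sketch, assuming the generated modules are named serving_api_pb2 / serving_api_pb2_grpc and that a raw 107x107x3 image file exists at the given path:

import grpc
import serving_api_pb2
import serving_api_pb2_grpc

channel = grpc.insecure_channel("localhost:9000")
stub = serving_api_pb2_grpc.ClassificationServiceStub(channel)
# Raw image bytes, one byte per channel value (107 * 107 * 3 bytes).
with open("image.raw", "rb") as f:
    request = serving_api_pb2.ClassificationRequest(imageData=f.read())
response = stub.classify(request)
print(response.score)  # two floats, one score per label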
