網頁

2023年6月21日 星期三

如何在 python 中使用 tao 產生的 yolov4 模型

參考 Nvidia TAO Computer Vision Sample Workflows 產生 yolov4-tiny 模型
此模型與一般方式產生的模型不一樣
一般產生的模型 參考 tensorrt_demos 即可在 python 中使用

只能將 
tao yolo_v4_tiny export 出的模型(.etlt)用於 DeepStream
而由 
tao  converter 產生的 trt.engine 不能使用於 DeepStream 也不能在 python 中使用

有說明如何使用 tao 產生的模型在 Triton 伺服器上
在 yolov3_postprocessor.py 中發現 tao 產生的 yolo
已經將輸出的 NMS 處理過, 並將內容置於 
BatchNMS(-1,1): 偵測出的數量
BatchNMS_1(-1,200,4): 座標
BatchNMS_2(-1,200): 信心
BatchNMS_3(-1,200): 類別
輸入的方式也有改變
cv2 讀出的圖 不需 cvtColor, 也不用除以 255.0
只需將 HWC 轉成 CHW (單張影像,沒有 batch 維度)
img = img.transpose((2, 0, 1)).astype(np.float32)

tao 的執行是在 docker 中,所以很難除錯
發現下列命令,可以直接進入 docker 中,執行 python, 查詢版本環境等
docker run -it --rm --gpus all \
  -v "/mnt/Data/tao/yolo_v4_tiny_1.4.1":"/workspace/tao-experiments" \
  -v "/mnt/Data/TensorRT/tensorrt_demos":"/workspace/tensorrt_demos" \
  -v "/mnt/CT1000SSD/ImageData/Light":"/workspace/Light" \
  nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5 \
  bash

將模型轉換成 TensorRT 除了使用
!tao converter -k $KEY \
                   -p Input,1x3x416x416,8x3x416x416,16x3x416x416 \
                   -e $USER_EXPERIMENT_DIR/export/trt.engine \
                   -t fp32 \
                   $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.etlt
外, 也可使用
!tao-deploy yolo_v4_tiny gen_trt_engine \
  -m $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.etlt \
  -e $SPECS_DIR/yolo_v4_tiny_retrain_kitti.txt \
  -k $KEY \
  --data_type fp32 \
  --batch_size 1 \
  --engine_file $USER_EXPERIMENT_DIR/export/yolov4_tao_deplay.trt
但若是要在不同平台上轉換
參考 TAO Converter 下載安裝,並執行轉換
./tao-converter_v4.0.0_trt8.5.1.7 \
  -k nvidia_tlt \
  -p Input,1x3x416x416,2x3x416x416,4x3x416x416 \
  -e yolo_v4_tiny_1.4.1/yolo_v4_tiny/export/yolov4_tao_converter_fp32.engine \
  -t fp32 \
  yolo_v4_tiny_1.4.1/yolo_v4_tiny/export/yolov4_cspdarknet_tiny_epoch_080.etlt

參考 tensorrt_demos 修改 utils/yolo_with_plugins.py, 改名成 triton_yolo_with_plugins.py 如下
"""yolo_with_plugins.py
Implementation of TrtYOLO class with the yolo_layer plugins.
"""
from __future__ import print_function
import ctypes
import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda

# Load ./plugins/libyolo_layer.so at import time.  When the shared object
# cannot be loaded, abort the program with a hint to build the plugin first.
# NOTE(review): the engine handled below reports needing the
# BatchedNMSDynamic_TRT plugin, which is registered through
# trt.init_libnvinfer_plugins(); this custom library may be unnecessary for
# TAO-exported engines -- confirm before removing.
try:
    ctypes.cdll.LoadLibrary('./plugins/libyolo_layer.so')
except OSError as e:
    raise SystemExit('ERROR: failed to load ./plugins/libyolo_layer.so.  '
                     'Did you forget to do a "make" in the "./plugins/" '
                     'subdirectory?') from e

def _preprocess_yolo(img, input_shape, letter_box=False):
    """Resize an image to the network input size and convert it to CHW float32.

    Unlike the usual YOLO preprocessing, no BGR->RGB conversion and no
    division by 255 is applied: the pixel values are passed through as-is.

    # Args
        img: numpy array of shape (img_h, img_w, 3)
        input_shape: a tuple of (H, W)
        letter_box: boolean; when true the aspect ratio is kept and the
                    remaining area is padded with the value 127
    # Returns
        float32 numpy array of shape (3, H, W)
    """
    if not letter_box:
        # Plain stretch to the network size (cv2 expects (width, height)).
        img = cv2.resize(img, (input_shape[1], input_shape[0]))
    else:
        src_h, src_w, _ = img.shape
        fit_h, fit_w = input_shape[0], input_shape[1]
        pad_top, pad_left = 0, 0
        if (fit_w / src_w) <= (fit_h / src_h):
            # Width is the limiting side: shrink the height, pad top/bottom.
            fit_h = int(src_h * fit_w / src_w)
            pad_top = (input_shape[0] - fit_h) // 2
        else:
            # Height is the limiting side: shrink the width, pad left/right.
            fit_w = int(src_w * fit_h / src_h)
            pad_left = (input_shape[1] - fit_w) // 2
        scaled = cv2.resize(img, (fit_w, fit_h))
        canvas = np.full((input_shape[0], input_shape[1], 3), 127,
                         dtype=np.uint8)
        canvas[pad_top:(pad_top + fit_h), pad_left:(pad_left + fit_w), :] = scaled
        img = canvas

    # HWC -> CHW; colour order and value range are left untouched.
    return img.transpose((2, 0, 1)).astype(np.float32)

class HostDeviceMem:
    """Pair of a host buffer and its matching device buffer."""

    def __init__(self, host_mem, device_mem):
        # Stored as plain attributes so callers can read or replace them.
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        """Return both buffers rendered under "Host:" / "Device:" headings."""
        return "\n".join(("Host:", str(self.host),
                          "Device:", str(self.device)))

    def __repr__(self):
        """Same text as ``__str__``."""
        return self.__str__()

def get_input_shape(engine):
    """Return the last two dimensions of the engine's first binding.

    The first binding must be an input.  A 4-dim binding yields dims[2:],
    a 3-dim binding yields dims[1:]; any other rank raises ValueError.
    """
    first = engine[0]
    assert engine.binding_is_input(first)
    dims = engine.get_binding_shape(first)
    rank = len(dims)
    if rank not in (3, 4):
        raise ValueError('bad dims of binding %s: %s' % (first, str(dims)))
    # rank 4 -> slice from index 2, rank 3 -> slice from index 1.
    return tuple(dims[rank - 2:])

def allocate_buffers(engine, context):
    """Allocate host/device buffers for every binding of an engine.

    For each binding a page-locked host array and a device allocation of the
    same byte size are created.  An input binding whose first dimension is
    dynamic (-1) has that dimension fixed to 1 on ``context``.

    # Args
        engine: the deserialized TensorRT engine
        context: execution context created from ``engine``
    # Returns
        (inputs, outputs, bindings, stream) where ``inputs`` / ``outputs`` are
        lists of HostDeviceMem, ``bindings`` is the list of device addresses
        in binding order and ``stream`` is a new CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    # ``bindings`` is filled in iteration order and used positionally, so the
    # enumeration index is the binding index of ``binding``.
    for binding_idx, binding in enumerate(engine):
        binding_dims = engine.get_binding_shape(binding)
        binding_dtype = engine.get_tensor_dtype(binding)
        binding_format = engine.get_tensor_format_desc(binding)
        binding_loc = engine.get_tensor_location(binding)
        binding_mode = engine.get_tensor_mode(binding)
        binding_shape = engine.get_tensor_shape(binding)
        binding_shape_inference = engine.is_shape_inference_io(binding)
        print('binding_dims:{} {} {}'.format(binding, binding_dims, binding_dtype))
        print('  {}'.format(binding_format))
        print('  {} {} {} {}'.format(binding_loc, binding_mode, binding_shape, binding_shape_inference))
        # A single dynamic (-1) dimension makes the volume negative; its
        # magnitude is the element count with that dimension set to 1.
        size = abs(trt.volume(binding_dims))
        print('  size:{}'.format(size))
        dtype = trt.nptype(binding_dtype)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            if binding_dims[0] == -1:
                # Fix the dynamic leading dimension to 1 on this input's own
                # binding index (not a hard-coded 0).
                alloc_dims = np.copy(binding_dims)
                alloc_dims[0] = 1
                context.set_binding_shape(binding_idx, alloc_dims)
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """do_inference (for TensorRT 6.x or lower)

    Copies every input to the device, runs ``context.execute_async`` and
    copies every output back, all on ``stream``; then waits for the stream.

    # Args
        context: TensorRT execution context
        bindings: list of device addresses, in binding order
        inputs, outputs: lists of HostDeviceMem objects
        stream: CUDA stream used for the copies and the execution
        batch_size: batch size passed to ``execute_async``
    # Returns
        list with the host array of each output, in ``outputs`` order
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def do_inference_v2(context, bindings, inputs, outputs, stream):
    """do_inference_v2 (for TensorRT 7.0+)

    Copies every input to the device, runs ``context.execute_async_v2`` and
    copies every output back, all on ``stream``; then waits for the stream.

    # Args
        context: TensorRT execution context
        bindings: list of device addresses, in binding order
        inputs, outputs: lists of HostDeviceMem objects
        stream: CUDA stream used for the copies and the execution
    # Returns
        list with the host array of each output, in ``outputs`` order
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

class TrtYOLO(object):
    """TrtYOLO class encapsulates things needed to run TRT YOLO.

    The engine is expected to produce four outputs, in this order: the number
    of kept detections, the boxes, the scores and the class ids.
    """

    def _load_engine(self):
        """Deserialize the TensorRT engine stored in the file ``self.model``.

        # Raises
            RuntimeError: when TensorRT returns no engine for the file.
        """
        with open(self.model, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        if engine is None:
            raise RuntimeError(
                'failed to deserialize TensorRT engine: %s' % self.model)
        return engine

    def __init__(self, model, category_num=80, letter_box=False, cuda_ctx=None):
        """Initialize TensorRT plugins, engine and context.

        # Args
            model: path of the serialized TensorRT engine file
            category_num: number of classes (stored, not used by this class)
            letter_box: default letterbox setting used by detect()
            cuda_ctx: optional CUDA context, pushed while CUDA/TensorRT
                      resources are created and popped afterwards
        # Raises
            RuntimeError: when the engine cannot be deserialized or the
                          execution context / buffers cannot be created.
        """
        self.model = model
        self.category_num = category_num
        self.letter_box = letter_box
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx:
            self.cuda_ctx.push()

        # Everything after the push sits in try/finally so the context is
        # popped even when loading the engine fails.
        try:
            # Compare the numeric major version; comparing the first
            # character would treat TensorRT 10+ as older than 7.
            trt_major = int(trt.__version__.split('.')[0])
            self.inference_fn = do_inference if trt_major < 7 \
                                             else do_inference_v2
            self.trt_logger = trt.Logger(trt.Logger.INFO)
            # Register the standard plugins before deserializing; without
            # this the following errors were observed:
            #   IPluginCreator not found in Plugin Registry
            #   getPluginCreator could not find plugin: BatchedNMSDynamic_TRT version: 1
            #   Serialization assertion plan->header.magicTag == rt::kPLAN_MAGIC_TAG failed
            trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
            self.engine = self._load_engine()

            self.input_shape = get_input_shape(self.engine)

            try:
                self.context = self.engine.create_execution_context()
                self.inputs, self.outputs, self.bindings, self.stream = \
                    allocate_buffers(self.engine, self.context)
            except Exception as e:
                raise RuntimeError('fail to allocate CUDA resources') from e
        finally:
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def __del__(self):
        """Free CUDA memories."""
        # __init__ may have failed before these attributes were assigned, so
        # drop only the ones that exist (same order: outputs, inputs, stream).
        for attr in ('outputs', 'inputs', 'stream'):
            self.__dict__.pop(attr, None)

    @staticmethod
    def _postprocess(trt_outputs, img_w, img_h):
        """Turn the four raw engine outputs into (locs, confs, cids) arrays.

        # Args
            trt_outputs: sequence of four numpy arrays in the order
                         (kept count, boxes, scores, class ids)
            img_w, img_h: size of the original image; box coordinates are
                          multiplied by it, i.e. they are treated as
                          normalized [x_min, y_min, x_max, y_max]
        # Returns
            locs: np.uint array (N, 4), clipped to [0, img_w] / [0, img_h]
            confs: array (N, 1) of scores
            cids: np.uint array (N, 1) of class ids
        """
        keep_k, boxes, scores, cls_id = [
            out.reshape(1, -1) for out in trt_outputs]
        scale = np.array([img_w, img_h, img_w, img_h])
        # The typed empty arrays keep shape and dtype stable when nothing
        # is detected.
        loc_parts = [np.empty((0, 4), dtype=np.uint)]
        cid_parts = [np.empty((0, 1), dtype=np.uint)]
        conf_parts = [np.empty((0, 1), dtype=np.float32)]
        for idx, k in enumerate(keep_k.reshape(-1)):
            count = max(int(k), 0)
            loc = boxes[idx].reshape(-1, 4)[:count] * scale
            # Clip before the unsigned cast: a negative coordinate would
            # otherwise turn into a meaningless unsigned value.
            loc_parts.append(np.clip(loc, 0, scale).astype(np.uint))
            cid_parts.append(
                cls_id[idx].reshape(-1, 1)[:count].astype(np.uint))
            conf_parts.append(scores[idx].reshape(-1, 1)[:count])
        locs = np.concatenate(loc_parts, axis=0)
        confs = np.concatenate(conf_parts, axis=0)
        cids = np.concatenate(cid_parts, axis=0)
        return locs, confs, cids

    def detect(self, img, letter_box=None):
        """Detect objects in the input image.

        # Args
            img: numpy array of shape (img_h, img_w, 3)
            letter_box: overrides the instance default when not None
        # Returns
            (locs, confs, cids) as described in _postprocess(); the number
            of rows is the detection count reported by the engine.
        """
        letter_box = self.letter_box if letter_box is None else letter_box
        img_h, img_w, _ = img.shape
        img_resized = _preprocess_yolo(img, self.input_shape, letter_box)

        # Set host input to the image. The inference function will copy
        # the input to the GPU before executing.
        self.inputs[0].host = np.ascontiguousarray(img_resized)
        if self.cuda_ctx:
            self.cuda_ctx.push()
        try:
            trt_outputs = self.inference_fn(
                context=self.context,
                bindings=self.bindings,
                inputs=self.inputs,
                outputs=self.outputs,
                stream=self.stream)
        finally:
            # Pop even when inference raises, so the context stack stays
            # balanced.
            if self.cuda_ctx:
                self.cuda_ctx.pop()

        # NOTE(review): boxes are scaled by the original image size even when
        # letter_box is true; the letterbox padding is not undone here --
        # confirm whether that is intended.
        return self._postprocess(trt_outputs, img_w, img_h)

下列程式使用上列的程式
import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit # This is needed for initializing CUDA driver
import pycuda.driver as cuda
from utils.triton_yolo_with_plugins import TrtYOLO

#MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/yolov4_tao_convert.engine'
MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/yolov4_tao_deplay.trt'
#MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/trt.engine'
        
def main():
    """Run the engine at MODEL_PATH on 'bb.jpg' and save the result.

    Every detected box is drawn in white on a copy of the image, which is
    written to 'aa.jpg'.  Shapes and raw results are printed along the way.

    # Raises
        SystemExit: when 'bb.jpg' cannot be read.
    """
    # Positional arguments: category_num=5, letter_box=True.
    trt_yolo = TrtYOLO(MODEL_PATH, 5, True)
    img_org = cv2.imread('bb.jpg')
    if img_org is None:
        # cv2.imread signals a missing or unreadable file by returning None.
        raise SystemExit('ERROR: failed to read bb.jpg')
    img = np.copy(img_org)
    print(img.shape, img.dtype)
    # The explicit False overrides the letter_box default given above.
    boxes, confs, clss = trt_yolo.detect(img, False)
    print(boxes.shape, confs.shape, clss.shape)
    print(boxes, confs, clss)
    for box, conf, cls_id in zip(boxes, confs, clss):
        # Plain Python ints: the box holds numpy unsigned scalars.
        x_min, y_min, x_max, y_max = (int(v) for v in box)
        cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (255, 255, 255), 2)
        print(box, conf, cls_id)
    cv2.imwrite('aa.jpg', img)
    print('aaa')


if __name__ == '__main__':
    main()

除錯說明
訊息: Serialization assertion plan->header.magicTag == rt::kPLAN_MAGIC_TAG failed
解決: 產生 engine 與執行時的 TensorRT 版本不一致,請安裝相同的版本,或利用 docker
訊息: IPluginCreator not found in Plugin Registry
訊息: getPluginCreator could not find plugin: BatchedNMSDynamic_TRT version: 1
解決: 需安裝 TensorRT OSS
在呼叫 _load_engine() 之前加上
trt.init_libnvinfer_plugins(self.trt_logger, namespace="")

沒有留言:

張貼留言