
Monday, July 3, 2023

YOLOv8 and TensorRT

Reference: the official YOLOv8 GitHub repository

1. Download DeepStream-Yolo and Ultralytics YOLOv8
git clone https://github.com/marcoslucianops/DeepStream-Yolo.git
git clone https://github.com/ultralytics/ultralytics.git /mnt/Data/DeepStream/DeepStream-Yolo/ultralytics

2. Create the deepstream_yolo Docker container
docker_run.sh
xhost +
docker run --name='deepstream_yolo' --gpus all -it --net=host --privileged \
  -v /tmp/.X11-unix:/tmp/.X11-unix \
  -v /etc/localtime:/etc/localtime \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/DeepStream-Yolo:/home/DeepStream-Yolo \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/ultralytics:/home/ultralytics \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/read_me:/home/read_me \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/datasets:/home/datasets \
  -v /mnt/CT1000SSD/ImageData/Light:/home/Light \
  -e DISPLAY=$DISPLAY \
  -w /home/read_me \
  nvcr.io/nvidia/deepstream:6.2-devel
  
3. Inside the container, install DeepStream-Yolo
apt-get install build-essential
/opt/nvidia/deepstream/deepstream/user_additional_install.sh
cd /home/DeepStream-Yolo
CUDA_VER=11.8 make -C nvdsinfer_custom_impl_Yolo

4. Inside the container, install Ultralytics YOLOv8
#python3 -m pip install --upgrade pip
pip3 install --upgrade pip
pip3 install protobuf numpy
cd /home/ultralytics
#pip install -e .
pip3 install -r requirements.txt
python3 setup.py install
pip3 install onnx onnxsim onnxruntime

5. Inside the container, download, convert, and test the yolov8s.pt and yolov8n.pt models
cd /home/ultralytics
cp /home/DeepStream-Yolo/utils/export_yoloV8.py /home/ultralytics
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8s.pt
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
python3 export_yoloV8.py -w yolov8s.pt --dynamic
python3 export_yoloV8.py -w yolov8n.pt --dynamic
cp yolov8s.onnx labels.txt /home/DeepStream-Yolo
cp yolov8n.onnx labels.txt /home/DeepStream-Yolo
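
As an optional sanity check (a sketch only; the 640x640 input size, the CPU provider, and the dummy batch are assumptions, not part of the original steps), the exported ONNX can be loaded with onnxruntime, which was installed above:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('yolov8s.onnx', providers=['CPUExecutionProvider'])
inp = sess.get_inputs()[0]
print(inp.name, inp.shape)   # the batch dimension is symbolic because of --dynamic
dummy = np.zeros((1, 3, 640, 640), dtype=np.float32)
for out, meta in zip(sess.run(None, {inp.name: dummy}), sess.get_outputs()):
    print(meta.name, out.shape)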

6. Remove the deepstream_yolo container
$ docker container rm deepstream_yolo

7. Re-enter the Docker container
docker_attach.sh
xhost +
docker start deepstream_yolo
docker attach deepstream_yolo

8. Convert the model to ONNX format
yolov8n.py
from ultralytics import YOLO

# Load a model
model = YOLO("yolov8n.yaml")  # build a new model from scratch
model = YOLO("yolov8n.pt")  # load a pretrained model (recommended for training)

# Use the model
model.train(data="coco128.yaml", epochs=3)  # train the model
metrics = model.val()  # evaluate model performance on the validation set
results = model("https://ultralytics.com/images/bus.jpg")  # predict on an image
path = model.export(format="onnx")  # export the model to ONNX format

Running python3 yolov8n.py produces the following error:
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
Fix:
$ sudo systemctl stop docker
Get the container id:
$ docker inspect deepstream_yolo | grep Id
"Id": "???????"
Edit the container's ShmSize:
$ sudo vi /var/lib/docker/containers/your_container_id/hostconfig.json
"ShmSize":8589934592
$ sudo systemctl restart docker
$ ./docker_attach.sh
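
The same edit can also be scripted; a minimal sketch, assuming the docker daemon is stopped, the script runs as root, and the container id has been filled in:

import json

cfg_path = '/var/lib/docker/containers/your_container_id/hostconfig.json'
with open(cfg_path) as f:
    cfg = json.load(f)
cfg['ShmSize'] = 8 * 1024 ** 3   # 8 GiB, the same value as above
with open(cfg_path, 'w') as f:
    json.dump(cfg, f)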

9. Test the ONNX model in DeepStream (each pair of lines below shows alternative values to choose from)
# cd /home/DeepStream-Yolo

# vi config_infer_primary_yoloV8.txt
onnx-file=yolov8s.onnx
onnx-file=yolov8n.onnx

# vi deepstream_app_config.txt
uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4
uri=rtsp://root:A1234567@192.168.0.107:554/live1s1.sdp
live-source=0
live-source=1
config-file=config_infer_primary.txt
config-file=config_infer_primary_yoloV8.txt
file-loop=0
file-loop=1

# deepstream-app -c deepstream_app_config.txt

10. Prepare your own image data: convert PASCAL VOC format (the XML files produced by LabelImg) to YOLO txt labels
prepare_detect.py
import cv2
import os
import random
import re
import xml.etree.ElementTree as ET

import numpy as np

LIGHT_CLASSES_LIST = [
    'forward_right',
    'others',
    'red',
    'red_left',
    'yellow',
    ]
        
def save_false_positives(img_org, iName, xName, tag, classIdx, 
        clip_x0, clip_y0, clip_x1, clip_y1):
    img_new = img_org[clip_y0:clip_y1, clip_x0:clip_x1]
    fPath, fName = os.path.split(iName)
    fName, fExt = os.path.splitext(fName)
    fName = fName + tag + fExt
    rndPaths = ['train', 'val', 'test']
    rndPath = random.choices(rndPaths, weights=(8,1,1))[0]
    iName = os.path.join('/home/datasets/Light/images', rndPath, fName)
    cv2.imwrite(iName, img_new)
        
def convert_box(size, box):
    dw, dh = 1. / size[0], 1. / size[1]
    x, y, w, h = (box[0] + box[1]) / 2.0 - 1, (box[2] + box[3]) / 2.0 - 1, box[1] - box[0], box[3] - box[2]
    return x * dw, y * dh, w * dw, h * dh
          
def save_file(img_org, iName, xName, tag, classIdx, 
        p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
        img_w, img_h, xmin, ymin, xmax, ymax,
        clip_x0, clip_y0, clip_x1, clip_y1):
    img_new = img_org[clip_y0:clip_y1, clip_x0:clip_x1]
    fPath, fName = os.path.split(iName)
    fName, fExt = os.path.splitext(fName)
    fName = fName + tag + fExt
    rndPaths = ['train', 'val', 'test']
    rndPath = random.choices(rndPaths, weights=(8,1,1))[0]
    iName = os.path.join('/home/datasets/Light/images', rndPath, fName)
    cv2.imwrite(iName, img_new)
    
    w = clip_x1 - clip_x0
    h = clip_y1 - clip_y0
    xmin = xmin - clip_x0
    ymin = ymin - clip_y0
    xmax = xmax - clip_x0
    ymax = ymax - clip_y0
    bb = convert_box((w, h), (xmin, xmax, ymin, ymax))
    fPath, fName = os.path.split(xName)
    fName, fExt = os.path.splitext(fName)
    fName = fName + tag + '.txt'
    tName = os.path.join('/home/datasets/Light/labels', rndPath, fName)
    with open(tName, 'w') as f:
        f.write(" ".join([str(a) for a in (classIdx, *bb)]) + '\n')
        
def gen_img_yolo(iName, xName):
    tree = ET.parse(open(xName))
    root = tree.getroot()
    img_w = int(root.find('size').find('width').text)
    img_h = int(root.find('size').find('height').text)
    for idx, object in enumerate(root.findall('object')):
        name = object.find('name').text
        classIdx = LIGHT_CLASSES_LIST.index(name)
        #print(classIdx, name)
        bndbox = object.find('bndbox')
        p0x = int(bndbox.find('p0x').text)
        p0y = int(bndbox.find('p0y').text)
        p1x = int(bndbox.find('p1x').text)
        p1y = int(bndbox.find('p1y').text)
        p2x = int(bndbox.find('p2x').text)
        p2y = int(bndbox.find('p2y').text)
        p3x = int(bndbox.find('p3x').text)
        p3y = int(bndbox.find('p3y').text)
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        if xmin != p0x or xmin != p3x or ymin != p0y or ymin != p1y or \
                xmax != p1x or xmax != p2x or ymax != p2y or ymax != p3y:
            print('error:bndbox', xName)
            exit()
        if idx > 0:
            print('error:object', xName)
            exit()
    img_org = cv2.imread(iName)
    if img_org.shape[0] != img_h or img_org.shape[1] != img_w:
        print(img_org.shape, (img_h, img_w))
        exit()
    img = np.copy(img_org)

    clip_x0 = random.randrange(0, int(xmin*0.5))
    clip_y0 = random.randrange(0, int(ymin*0.5))
    clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.5), img_w+1)
    clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.5), img_h+1)
    save_file(img_org, iName, xName, '', classIdx, 
            p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
            img_w, img_h, xmin, ymin, xmax, ymax,
            clip_x0, clip_y0, clip_x1, clip_y1)
    ratio = (xmax - xmin) / img_w
    if ratio < 0.3:
        clip_x0 = random.randrange(int(xmin*0.3), int(xmin*0.8))
        clip_y0 = random.randrange(int(ymin*0.3), int(ymin*0.8))
        clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), int(xmax + (img_w-xmax)*0.7))
        clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.2), int(ymax + (img_h-ymax)*0.7))
        save_file(img_org, iName, xName, '_a', classIdx, 
                p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
                img_w, img_h, xmin, ymin, xmax, ymax,
                clip_x0, clip_y0, clip_x1, clip_y1)
        clip_x0 = random.randrange(int(xmin*0.5), int(xmin*0.9))
        clip_y0 = random.randrange(int(ymin*0.5), int(ymin*0.9))
        clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.1), int(xmax + (img_w-xmax)*0.5))
        clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.1), int(ymax + (img_h-ymax)*0.5))
        save_file(img_org, iName, xName, '_b', classIdx, 
                p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
                img_w, img_h, xmin, ymin, xmax, ymax,
                clip_x0, clip_y0, clip_x1, clip_y1)
        if xmin > (img_w - xmax):
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmin), int(xmin+(xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f0', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmin), int(xmin + (xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f1', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
        else:
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f2', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f3', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
    elif ratio < 0.7:
        clip_x0 = random.randrange(int(xmin*0.1), int(xmin*0.7))
        clip_y0 = random.randrange(int(ymin*0.1), int(ymin*0.7))
        clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.3), int(xmax + (img_w-xmax)*0.9))
        clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.3), int(ymax + (img_h-ymax)*0.9))
        save_file(img_org, iName, xName, '_c', classIdx, 
                p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
                img_w, img_h, xmin, ymin, xmax, ymax,
                clip_x0, clip_y0, clip_x1, clip_y1)
        if xmin > (img_w - xmax):
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmin), int(xmin+(xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f4', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmin), int(xmin + (xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f5', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
        else:
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f6', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f7', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
    elif ratio < 1.0:
        pass
    return

def recursive_folder(path):
    files = os.listdir(path)
    files.sort()
    for file in files:
        fullName = os.path.join(path, file)
        if os.path.isfile(fullName):
            fPath, fName = os.path.split(fullName)
            fName, fExt = os.path.splitext(fName)
            if fExt == '.jpg':
                xPath = fPath + '.xml'
                xName = fName + '.xml'
                xFName = os.path.join(xPath, xName)
                if os.path.isfile(xFName):
                    gen_img_yolo(fullName, xFName)
                else:
                    print(xFName)
        else:
            recursive_folder(fullName)

def main():
    recursive_folder('/home/Light')

if __name__ == '__main__':
    main()
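
For reference, convert_box() above turns absolute (xmin, xmax, ymin, ymax) pixel coordinates into the normalized YOLO (x_center, y_center, width, height) written to each label file. A small usage example (the numbers are made up):

from prepare_detect import convert_box   # or paste into the script itself

# 1920x1080 image with a box spanning x 100..300 and y 200..400
print(convert_box((1920, 1080), (100, 300, 200, 400)))
# -> approximately (0.1036, 0.2769, 0.1042, 0.1852)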

11. Train your own model
from ultralytics import YOLO

# Load a model
model = YOLO('yolov8n.pt')  # load a pretrained model (recommended for training)

# Train the model
model.train(data='VOC.yaml', epochs=100, imgsz=640)
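
After training finishes, a quick way to re-check the run and produce a plain ONNX file with the ultralytics API (a sketch; runs/detect/train/weights/best.pt is the library's usual default output location, adjust it to your actual run directory):

from ultralytics import YOLO

model = YOLO('runs/detect/train/weights/best.pt')
metrics = model.val()        # re-evaluate mAP on the validation split
model.export(format='onnx')  # plain ONNX; for DeepStream, use export_yoloV8.py as in step 5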

12. Inspect the input and output layers of an ONNX model
import onnx
model = onnx.load('yolov8n.onnx')
g_in = model.graph.input
g_out = model.graph.output
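
A short sketch extending the snippet above to print each tensor's name and dimensions (dim_param shows up for dynamic axes such as the batch dimension):

for t in list(g_in) + list(g_out):
    dims = [d.dim_param or d.dim_value for d in t.type.tensor_type.shape.dim]
    print(t.name, dims)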


Wednesday, June 21, 2023

How to use a TAO-generated YOLOv4 model in Python

Following the Nvidia TAO Computer Vision Sample Workflows, I produced a yolov4-tiny model. This model is not the same as one produced the usual way; a model produced the usual way can be used in Python by following tensorrt_demos.

The model exported by tao yolo_v4_tiny export (.etlt) can only be used with DeepStream, while the trt.engine produced by tao converter can be used neither with DeepStream nor in Python.

There is documentation on how to use TAO-generated models on a Triton server. In yolov3_postprocessor.py you can see that a TAO-generated YOLO model has already applied NMS to its outputs and places the results in:
BatchNMS (-1,1): number of detections
BatchNMS_1 (-1,200,4): coordinates
BatchNMS_2 (-1,200): confidences
BatchNMS_3 (-1,200): classes
The input handling also changes: an image read with cv2 needs no cvtColor and no division by 255.0, only a transpose from BHWC to BCHW:
img = img.transpose((2, 0, 1)).astype(np.float32)
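
As a minimal sketch of how these four outputs can be unpacked for one image (the function and variable names are illustrative only; the TrtYOLO class below does the same thing with extra bookkeeping):

import numpy as np

def parse_batched_nms(num_dets, boxes, scores, classes, img_w, img_h):
    # num_dets: (1,1), boxes: (1,200,4) normalized x1,y1,x2,y2, scores/classes: (1,200)
    k = int(num_dets[0, 0])
    locs = boxes[0, :k] * np.array([img_w, img_h, img_w, img_h])
    return locs.astype(int), scores[0, :k], classes[0, :k].astype(int)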

TAO runs inside Docker, which makes debugging difficult. The following command drops you straight into the Docker container, where you can run Python, check versions, inspect the environment, and so on:
docker run -it --rm --gpus all \
  -v "/mnt/Data/tao/yolo_v4_tiny_1.4.1":"/workspace/tao-experiments" \
  -v "/mnt/Data/TensorRT/tensorrt_demos":"/workspace/tensorrt_demos" \
  -v "/mnt/CT1000SSD/ImageData/Light":"/workspace/Light" \
  nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5 \
  bash

To convert the model to TensorRT, besides using
!tao converter -k $KEY \
                   -p Input,1x3x416x416,8x3x416x416,16x3x416x416 \
                   -e $USER_EXPERIMENT_DIR/export/trt.engine \
                   -t fp32 \
                   $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.etlt
you can also use
!tao-deploy yolo_v4_tiny gen_trt_engine \
  -m $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.etlt \
  -e $SPECS_DIR/yolo_v4_tiny_retrain_kitti.txt \
  -k $KEY \
  --data_type fp32 \
  --batch_size 1 \
  --engine_file $USER_EXPERIMENT_DIR/export/yolov4_tao_deplay.trt
But to convert on a different platform, refer to TAO Converter: download and install it, then run the conversion:
./tao-converter_v4.0.0_trt8.5.1.7 \
  -k nvidia_tlt \
  -p Input,1x3x416x416,2x3x416x416,4x3x416x416 \
  -e yolo_v4_tiny_1.4.1/yolo_v4_tiny/export/yolov4_tao_converter_fp32.engine \
  -t fp32 \
  yolo_v4_tiny_1.4.1/yolo_v4_tiny/export/yolov4_cspdarknet_tiny_epoch_080.etlt

Based on tensorrt_demos, modify utils/yolo_with_plugins.py and rename it triton_yolo_with_plugins.py, as follows:
"""yolo_with_plugins.py
Implementation of TrtYOLO class with the yolo_layer plugins.
"""
from __future__ import print_function
import ctypes
import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda

try:
    ctypes.cdll.LoadLibrary('./plugins/libyolo_layer.so')
except OSError as e:
    raise SystemExit('ERROR: failed to load ./plugins/libyolo_layer.so.  '
                     'Did you forget to do a "make" in the "./plugins/" '
                     'subdirectory?') from e

def _preprocess_yolo(img, input_shape, letter_box=False):
    """Preprocess an image before TRT YOLO inferencing.
    # Args
        img: int8 numpy array of shape (img_h, img_w, 3)
        input_shape: a tuple of (H, W)
        letter_box: boolean, specifies whether to keep aspect ratio and
                    create a "letterboxed" image for inference
    # Returns
        preprocessed img: float32 numpy array of shape (3, H, W)
    """
    if letter_box:
        img_h, img_w, _ = img.shape
        new_h, new_w = input_shape[0], input_shape[1]
        offset_h, offset_w = 0, 0
        if (new_w / img_w) <= (new_h / img_h):
            new_h = int(img_h * new_w / img_w)
            offset_h = (input_shape[0] - new_h) // 2
        else:
            new_w = int(img_w * new_h / img_h)
            offset_w = (input_shape[1] - new_w) // 2
        resized = cv2.resize(img, (new_w, new_h))
        img = np.full((input_shape[0], input_shape[1], 3), 127, dtype=np.uint8)
        img[offset_h:(offset_h + new_h), offset_w:(offset_w + new_w), :] = resized
    else:
        img = cv2.resize(img, (input_shape[1], input_shape[0]))

    #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.transpose((2, 0, 1)).astype(np.float32)
    #img /= 255.0
    return img

class HostDeviceMem(object):
    """Simple helper data class that's a little nicer to use than a 2-tuple."""
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def get_input_shape(engine):
    """Get input shape of the TensorRT YOLO engine."""
    binding = engine[0]
    assert engine.binding_is_input(binding)
    binding_dims = engine.get_binding_shape(binding)
    if len(binding_dims) == 4:
        return tuple(binding_dims[2:])
    elif len(binding_dims) == 3:
        return tuple(binding_dims[1:])
    else:
        raise ValueError('bad dims of binding %s: %s' % (binding, str(binding_dims)))

def allocate_buffers(engine, context):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        binding_dims = engine.get_binding_shape(binding)
        binding_dtype = engine.get_tensor_dtype(binding)
        binding_format = engine.get_tensor_format_desc(binding)
        binding_loc = engine.get_tensor_location(binding)
        binding_mode = engine.get_tensor_mode(binding)
        binding_shape = engine.get_tensor_shape(binding)
        binding_shape_inference = engine.is_shape_inference_io(binding)
        print('binding_dims:{} {} {}'.format(binding, binding_dims, binding_dtype))
        print('  {}'.format(binding_format))
        print('  {} {} {} {}'.format(binding_loc, binding_mode, binding_shape, binding_shape_inference))
        size = trt.volume(binding_dims)
        if size < 0: size *= -1  # dynamic dims (-1) make trt.volume() negative
        print('  size:{}'.format(size))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            #binding_pro_shape = engine.get_profile_shape(0, binding)
            #print('  {}'.format(binding_pro_shape))
            if binding_dims[0] == -1:
                alloc_dims = np.copy(binding_dims)
                alloc_dims[0] = 1
                context.set_binding_shape(0, alloc_dims)
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """do_inference (for TensorRT 6.x or lower)
    This function is generalized for multiple inputs/outputs.
    Inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def do_inference_v2(context, bindings, inputs, outputs, stream):
    """do_inference_v2 (for TensorRT 7.0+)
    This function is generalized for multiple inputs/outputs for full
    dimension networks.
    Inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

class TrtYOLO(object):
    """TrtYOLO class encapsulates things needed to run TRT YOLO."""
    def _load_engine(self):
        TRTbin = 'yolo/%s.trt' % self.model
        TRTbin = self.model
        with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def __init__(self, model, category_num=80, letter_box=False, cuda_ctx=None):
        """Initialize TensorRT plugins, engine and conetxt."""
        self.model = model
        self.category_num = category_num
        self.letter_box = letter_box
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx:
            self.cuda_ctx.push()

        self.inference_fn = do_inference if trt.__version__[0] < '7' \
                                         else do_inference_v2
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        # add for errors
        # IPluginCreator not found in Plugin Registry
        # getPluginCreator could not find plugin: BatchedNMSDynamic_TRT version: 1
        # Serialization assertion plan->header.magicTag == rt::kPLAN_MAGIC_TAG failed
        trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
        self.engine = self._load_engine()

        self.input_shape = get_input_shape(self.engine)

        try:
            self.context = self.engine.create_execution_context()
            self.inputs, self.outputs, self.bindings, self.stream = \
                allocate_buffers(self.engine, self.context)
        except Exception as e:
            raise RuntimeError('fail to allocate CUDA resources') from e
        finally:
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def __del__(self):
        """Free CUDA memories."""
        del self.outputs
        del self.inputs
        del self.stream

    def detect(self, img, letter_box=None):
        """Detect objects in the input image."""
        letter_box = self.letter_box if letter_box is None else letter_box
        img_h, img_w, _ = img.shape
        img_resized = _preprocess_yolo(img, self.input_shape, letter_box)
        #print(img_resized.shape, img_resized.dtype)

        # Set host input to the image. The do_inference() function
        # will copy the input to the GPU before executing.
        self.inputs[0].host = np.ascontiguousarray(img_resized)
        if self.cuda_ctx:
            self.cuda_ctx.push()
        trt_outputs = self.inference_fn(
            context=self.context,
            bindings=self.bindings,
            inputs=self.inputs,
            outputs=self.outputs,
            stream=self.stream)
        if self.cuda_ctx:
            self.cuda_ctx.pop()

        y_pred = [i.reshape(1, -1,)[:1] for i in trt_outputs]
        keep_k, boxes, scores, cls_id = y_pred
        #print(keep_k.shape)
        #print(boxes.shape)
        keep_k[0,0] = 1
        locs = np.empty((0,4), dtype=np.uint)
        cids = np.empty((0,1), dtype=np.uint)
        confs = np.empty((0,1), dtype=np.float32)
        for idx, k in enumerate(keep_k.reshape(-1)):
            mul = np.array([img_w,img_h,img_w,img_h])
            loc = boxes[idx].reshape(-1, 4)[:k] * mul
            loc = loc.astype(np.uint)
            cid = cls_id[idx].reshape(-1, 1)[:k]
            cid = cid.astype(np.uint)
            conf = scores[idx].reshape(-1, 1)[:k]
            locs = np.concatenate((locs, loc), axis=0)
            cids = np.concatenate((cids, cid), axis=0)
            confs = np.concatenate((confs, conf), axis=0)
        #print(locs.shape, cids.shape, confs.shape)
        #print(locs, cids, confs)
        return locs, confs, cids

The following program uses the module above:
import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit # This is needed for initializing CUDA driver
import pycuda.driver as cuda
from utils.triton_yolo_with_plugins import TrtYOLO

#MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/yolov4_tao_convert.engine'
MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/yolov4_tao_deplay.trt'
#MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/trt.engine'
        
def main():
    trt_yolo = TrtYOLO(MODEL_PATH, 5, True)
    img_org = cv2.imread('bb.jpg')
    img = np.copy(img_org)
    print(img.shape, img.dtype)
    boxes, confs, clss = trt_yolo.detect(img, False)
    print(boxes.shape, confs.shape, clss.shape)
    print(boxes, confs, clss)
    for box, conf, clss in zip(boxes, confs, clss):
        x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]
        cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (255, 255, 255), 2)
        print(box, conf, clss)
    cv2.imwrite('aa.jpg', img)
    print('aaa')

if __name__ == '__main__':
    main()

Debugging notes
Message: Serialization assertion plan->header.magicTag == rt::kPLAN_MAGIC_TAG failed
Fix: the TensorRT versions do not match; install the matching version or use Docker.
Message: IPluginCreator not found in Plugin Registry
Message: getPluginCreator could not find plugin: BatchedNMSDynamic_TRT version: 1
Fix: TensorRT OSS must be installed, and the following line must be added before _load_engine():
trt.init_libnvinfer_plugins(self.trt_logger, namespace="")

Wednesday, February 16, 2022

YOLOv4 to TensorRT

Reference: tensorrt_demos

$ cd TensorRT
$ git clone https://github.com/jkjung-avt/tensorrt_demos.git
$ cd tensorrt_demos

Set up the environment
$ cd yolo
$ ./install_pycuda.sh
$ pip3 install onnx==1.4.1
$ cd ../plugins
$ make

Download the YOLO weights and cfg files
$ cd ../yolo
$ ./download_yolo.sh
Convert the weights to a TensorRT engine
$ python3 yolo_to_onnx.py -m yolov4-tiny-416
$ python3 onnx_to_tensorrt.py -m yolov4-tiny-416
Test
$ cd ..
$ python trt_yolo.py --image doc/dog_trt_yolov4_416.jpg -m yolov4-tiny-416

Using INT8
$ cd yolo
$ ln -s yolov4-tiny-416.cfg yolov4-tiny-int8-416.cfg
$ ln -s yolov4-tiny-416.onnx yolov4-tiny-int8-416.onnx
$ mkdir calib_images
# and copy calibration images into calib_images
$ python3 onnx_to_tensorrt.py -v --int8 -m yolov4-tiny-int8-416
This produces the following error:
[03/02/2022-15:22:07] [TRT] [V] 001_convolutional + 001_convolutional_bn Set Tactic Name: sm70_xmma_fprop_implicit_gemm_f16f16_f16f16_f16_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_t1r3s3 Tactic: 46202665595848747
[03/02/2022-15:22:07] [TRT] [V] Deleting timing cache: 2020 entries, 504 hits
[03/02/2022-15:22:07] [TRT] [E] 1: Unexpected exception
ERROR: failed to build the TensorRT engine!
$ vi onnx_to_tensorrt.py
Move the line "from calibrator import YOLOEntropyCalibrator" to the top of the file.

Wednesday, February 9, 2022

YOLOv4-tiny to TensorFlow and TFLite

Reference: tensorflow-yolov4-tflite

Only tensorflow==2.3.0rc0 works; do not use any other version, and do not use the GPU.

Modify core/config.py as needed:
__C.YOLO.CLASSES
__C.YOLO.ANCHORS_TINY

For the TensorFlow SavedModel format, loaded with tf.saved_model.load():
$ python save_model.py --weights /your_path_to/weights/yolov4-tiny-vehicle-r_final.weights \
--output ./checkpoints/yolov4-tiny-416 \
--input_size 416 --model yolov4 --tiny
$ python convert_tflite.py --weights ./checkpoints/yolov4-tiny-416-tflite \
--output ./checkpoints/yolov4-tiny-416.tflite

For the TFLite format, loaded with tf.lite.Interpreter():
$ python save_model.py --weights /your_path_to/weights/yolov4-tiny-vehicle-r_final.weights \
--output ./checkpoints/yolov4-tiny-416-tflite \
--input_size 416 --model yolov4 --tiny --framework tflite
$ python convert_tflite.py --weights ./checkpoints/yolov4-tiny-416-tflite \
--output ./checkpoints/yolov4-tiny-416-fp16.tflite \
--quantize_mode float16
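
A minimal sketch of loading the converted file with tf.lite.Interpreter; the 416x416 input size and the file name follow the commands above, and the zero image is only a placeholder:

import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='./checkpoints/yolov4-tiny-416-fp16.tflite')
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]
img = np.zeros((1, 416, 416, 3), dtype=np.float32)   # replace with a real, normalized image
interpreter.set_tensor(inp['index'], img)
interpreter.invoke()
for out in interpreter.get_output_details():
    print(out['name'], interpreter.get_tensor(out['index']).shape)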

Wednesday, October 28, 2020

Using YOLOv4 on DeepStream

Reference: Using YOLOv4 on NVIDIA DeepStream 5.0
Download yolov4.cfg, yolov4.weights, yolov4-tiny.cfg, and yolov4-tiny.weights from darknet.
You can modify width and height in the cfg files.

$ pip3 install torch
$ pip3 install torchvision
$ git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git
$ cd pytorch-YOLOv4
$ pip3 install onnxruntime
$ python3 demo_darknet2onnx.py yolov4.cfg yolov4.weights ./data/giraffe.jpg 1
This produces yolov4_1_3_416_416_static.onnx
/usr/src/tensorrt/bin/trtexec --onnx=yolov4_1_3_416_416_static.onnx \
--explicitBatch --saveEngine=yolov4_1_3_416_416_fp16.engine \
--workspace=4096 --fp16
This produces yolov4_1_3_416_416_fp16.engine
$ cd ..
$ git clone https://github.com/NVIDIA-AI-IOT/yolov4_deepstream
$ cd yolov4_deepstream
$ sudo cp -r deepstream_yolov4 /opt/nvidia/deepstream/deepstream-5.0/sources
$ cd /opt/nvidia/deepstream/deepstream-5.0/sources
$ sudo chown user.group deepstream_yolov4
$ cd deepstream_yolov4/nvdsinfer_custom_impl_Yolo/
$ make
$ cd ..
Copy yolov4_1_3_416_416_fp16.engine here.
Modify config_infer_primary_yoloV4.txt:
model-engine-file=yolov4-tiny_1_3_416_416_fp16.engine

Modify deepstream_app_config_yoloV4.txt:
# do not save to a file
[sink0]
enable=0
# display on screen
[sink1]
enable=1
type=2
sync=0
display-id=0
offset-x=0
offset-y=0
width=0
height=0
overlay-id=1
source-id=0
# do not set model-engine-file here
[primary-gie]
enable=1
#model-engine-file=yolov4_1_3_320_320_fp16.engine
# enable the tracker
[tracker]
enable=1

Run
$ deepstream-app -c deepstream_app_config_yoloV4.txt 




Friday, September 25, 2020

Understanding accuracy, precision, and recall

Predicted true/false versus actual true/false:
TP (True Positive): actually true, predicted true
FN (False Negative): actually true, predicted false
FP (False Positive): actually false, predicted true
TN (True Negative): actually false, predicted false

Accuracy = (TP+TN) / (TP+TN+FP+FN)
Accuracy: of all cases, the fraction that is predicted correctly.

Precision = TP / (TP+FP)
Precision: of the cases predicted true, how many are actually true.

Recall = TP / (TP+FN)
Recall: of the cases that are actually true, how many are predicted true.

High precision, low recall: most of what is detected is real, but many real cases are missed.
Low precision, high recall: most real cases are caught, but quite a few false detections come with them.
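
The same formulas as a small Python helper, with made-up counts just to illustrate:

def metrics(tp, fn, fp, tn):
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return accuracy, precision, recall

print(metrics(tp=8, fn=2, fp=1, tn=9))   # (0.85, 0.888..., 0.8)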

Tuesday, August 25, 2020

Jetson Nano darknet

Reference: Nvidia Jetson Nano usage notes

$ git clone https://github.com/pjreddie/darknet.git
The following error appears:
error: 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' undeclared

Use this instead:
$ git clone https://github.com/AlexeyAB/darknet
$ cd darknet
$ vi Makefile
GPU=1
CUDNN=1
CUDNN_HALF=1
OPENCV=1
LIBSO=1

$ make -j4

Reference: AlexeyAB/darknet
Download the cfg and weights from the YOLOv4 model zoo:
yolov4-tiny.weights (recommended)
yolov4-leaky-416.weights
yolov4-mish-416.weights


nano@nano-desktop:~/Data/darknet/darknet.AlexeyAB$ ./darknet detector demo ../cfg/coco.data ../cfg/yolov4-leaky-416.cfg ../weights/yolov4-leaky-416.weights 'nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)1280, height=(int)720,format=(string)NV12, framerate=(fraction)30/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink'

nano@nano-desktop:~/Data/darknet/darknet.AlexeyAB$ python3 darknet_video.py --weights ../weights/yolov4-leaky-416.weights --data_file ../cfg/coco.data --config_file ../cfg/yolov4-leaky-416.cfg --input 'nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)416, height=(int)416,format=(string)NV12, framerate=(fraction)30/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink' --ext_output

You have to press Enter to advance to the next frame.



Saturday, June 22, 2019

Installing darknet on the Nvidia Jetson AGX Xavier

nvidia@jetson-0423418048807:~/XavierSSD$ export PATH=${PATH}:/usr/local/cuda/bin
nvidia@jetson-0423418048807:~/XavierSSD$ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
nvidia@jetson-0423418048807:~/XavierSSD$ git clone https://github.com/pjreddie/darknet
nvidia@jetson-0423418048807:~/XavierSSD$ cd darknet/
nvidia@jetson-0423418048807:~/XavierSSD/darknet$ vi Makefile   (modify the following three lines)
GPU=1
CUDNN=1
OPENCV=1

nvidia@jetson-0423418048807:~/XavierSSD$ make


Saturday, October 6, 2018

build darknet yolo

git clone https://github.com/AlexeyAB/darknet.git

Download CUDA Toolkit 9.1:
https://developer.nvidia.com/cuda-toolkit-archive
If the installation fails, refer to these steps:
https://yingrenn.blogspot.com/2018/07/cuda.html

Download cuDNN; choose cuDNN v7.0 for CUDA 9.1 for the correct Windows version:
https://developer.nvidia.com/rdp/cudnn-archive

Open D:\Tensorflow\Yolo\darknet\build\darknet\darknet.sln with VS2015.
Switch the platform from Win32 to x64.
Project/darknet properties/
Set Configuration Properties/"CUDA C/C++"/CUDA Toolkit Custom Dir:
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.1
Set Additional Include Directories:
D:\Tensorflow\Yolo\opencv\build\include
D:\CUDNN\cudnn-9.1-windows7-x64-v7\cuda\include
Set Additional Library Directories:
D:\Tensorflow\Yolo\opencv\build\x64\vc14\lib
D:\CUDNN\cudnn-9.1-windows7-x64-v7\cuda\lib\x64

Monday, July 9, 2018

CUDA installation failure

A failed CUDA installation is usually caused by the Visual Studio Integration step failing, so doing a custom installation and skipping Visual Studio Integration lets the install succeed.
Choose exe (local) as the Installer Type.

Visual Studio Integration can then be installed manually as follows:
1. To be able to compile CUDA programs:
Note the path used during the CUDA installation and copy out the CUDAVisualStudioIntegration folder. Copy all files under
D:\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions
to
C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V140\BuildCustomizations
2. To let Visual Studio create new CUDA projects:
Copy the directory
D:\CUDAVisualStudioIntegration\extras\visual_studio_integration\CudaProjectVsWizards
to
C:\Program Files (x86)\Microsoft Visual Studio 14.0\Common7\IDE\Extensions
3. Install
D:\CUDAVisualStudioIntegration\NVIDIA_Nsight_Visual_Studio_Edition_Win64_5.4.0.17229.msi

Tuesday, July 3, 2018

Yolo

Directory data/img

File data/obj.data
classes= 2
train  = data/train.txt
valid  = data/train.txt
names = data/obj.names (relative to the executable's directory)
backup = backup/

File data/obj.names
air
bird

File data/train.txt
data/img/air1.jpg
data/img/air2.jpg
data/img/air3.jpg
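
train.txt can be generated with a few lines of Python (a sketch assuming the images live in data/img):

import glob

with open('data/train.txt', 'w') as f:
    for path in sorted(glob.glob('data/img/*.jpg')):
        f.write(path + '\n')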

File yolo-obj.cfg
(for testing)
batch=1
subdivisions=1
(for training)
batch=64
subdivisions=1 (adjust to the amount of GPU memory; with little memory use 64)
In every [yolo] layer, modify
classes =
In the [convolutional] layer just before each [yolo] layer, modify
filters = (classes + 5) * 3   (e.g. for classes=2, filters = 21)

Annotation
yolo_mark.exe data/img data/train.txt data/obj.names

Training
darknet.exe detector train data/obj.data yolo-obj.cfg darknet19_448.conv.23
The backup entry in obj.data specifies where the output weights are stored.
darknet19_448.conv.23: this is really just a weights file; to resume an interrupted training run, replace it with the most recently produced weights.
-dont_show: do not display the Loss window.

Evaluate the training results (IoU, mAP)
darknet.exe detector map data/obj.data yolo-obj.cfg backup\yolo-obj_7000.weights

COCO Yolo v3(4GB GPU): yolov3.cfg, yolov3.weights
COCO Yolo v3 tiny(1GB GPU): yolov3-tiny.cfg, yolov3-tiny.weights
COCO Yolo v2(4GB GPU): yolov2.cfg, yolov2.weights
VOC Yolo v2(4GB GPU): yolo-voc.cfg, yolo-voc.weights
COCO Yolo v2 tiny(1GB GPU): yolov2-tiny.cfg, yolov2-tiny.weights
VOC Yolo v2 tiny(1GB GPU): yolov2-tiny-voc.cfg, yolov2-tiny-voc.weights
The GPU memory figures above seem to be the requirements for training; detection and classification do not appear to need as much.

darknet.exe parameters
-i <index>: select the GPU; use nvidia-smi.exe to look up the index
-nogpu: do not use the GPU
-thresh <val>: detection threshold, default 0.25
-c <num>: OpenCV camera index, default 0
-ext_output: output object coordinates
detector test: still images
detector demo: video
detector train: training
detector map: evaluate training results
classifier predict: classification

./darknet detect cfg/yolov3.cfg yolov3.weights data/dog.jpg
./darknet detector test cfg/coco.data cfg/yolov3.cfg yolov3.weights data/dog.jpg
The two commands above are equivalent.

Use the following command to obtain yolov3-tiny.conv.15:
darknet.exe partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15

How to improve object detection
Before training:
Set random=1 in the .cfg file.
Increase width and height in the .cfg file (they must be multiples of 32).
Run the following command to recompute the anchors and update anchors in the .cfg file:
darknet.exe detector calc_anchors voc.data -num_of_clusters 9 -width 416 -height 416
Label the objects in the images carefully: every object must be labeled, and labeled correctly.
Each class should ideally have more than 2000 images, covering different sizes, angles, lighting, backgrounds, and so on.
Objects that should not be detected should also appear in the images, but must not be labeled.

How images are mapped to their label files during training
darknet.c
int main(int argc, char **argv)
>run_detector(argc, argv);
detector.c
void run_detector(int argc, char **argv)
>train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show);
void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int dont_show)
> pthread_t load_thread = load_data(args);
data.c
pthread_t load_data(load_args args)
>if(pthread_create(&thread, 0, load_threads, ptr)) error("Thread creation failed");
void *load_threads(void *ptr)
>threads[i] = load_data_in_thread(args);
if(pthread_create(&thread, 0, load_thread, ptr)) error("Thread creation failed");
void *load_thread(void *ptr)
>*a.d = load_data_detection(a.n, a.paths, a.m, a.w, a.h, a.c, a.num_boxes, a.classes, a.flip, a.jitter, a.hue, a.saturation, a.exposure, a.small_object);
data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, float jitter, float hue, float saturation, float exposure, int small_object)
>fill_truth_detection(filename, boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy, small_object, w, h);
void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy, int small_object, int net_w, int net_h)
>replace_image_to_label(path, labelpath);
utils.c
void replace_image_to_label(char *input_path, char *output_path)

Drawing detected objects on the image
image.c
void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)

network.c
Run an image through the network:
float *network_predict(network net, float *input)
Get the detections from the network:
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter)