
Monday, July 3, 2023

YOLOv8 and TensorRT

Reference: the official YOLOv8 GitHub repository

1. Download DeepStream-Yolo and Ultralytics YOLOv8
git clone https://github.com/marcoslucianops/DeepStream-Yolo.git
git clone https://github.com/ultralytics/ultralytics.git /mnt/Data/DeepStream/DeepStream-Yolo/ultralytics

2. Create the deepstream_yolo Docker container
docker_run.sh
xhost +
docker run --name='deepstream_yolo' --gpus all -it --net=host --privileged \
  -v /tmp/.X11-unix:/tmp/.X11-unix \
  -v /etc/localtime:/etc/localtime \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/DeepStream-Yolo:/home/DeepStream-Yolo \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/ultralytics:/home/ultralytics \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/read_me:/home/read_me \
  -v /mnt/Data/DeepStream/DeepStream-Yolo/datasets:/home/datasets \
  -v /mnt/CT1000SSD/ImageData/Light:/home/Light \
  -e DISPLAY=$DISPLAY \
  -w /home/read_me \
  nvcr.io/nvidia/deepstream:6.2-devel
  
3. Inside the container, install DeepStream-Yolo
apt-get install build-essential
/opt/nvidia/deepstream/deepstream/user_additional_install.sh
cd /home/DeepStream-Yolo
CUDA_VER=11.8 make -C nvdsinfer_custom_impl_Yolo

4. Inside the container, install Ultralytics YOLOv8
#python3 -m pip install --upgrade pip
pip3 install --upgrade pip
pip3 install protobuf numpy
cd /home/ultralytics
#pip install -e .
pip3 install -r requirements.txt
python3 setup.py install
pip3 install onnx onnxsim onnxruntime

5. Inside the container, download, convert, and test the yolov8s.pt and yolov8n.pt models
cd /home/ultralytics
cp /home/DeepStream-Yolo/utils/export_yoloV8.py /home/ultralytics
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8s.pt
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
python3 export_yoloV8.py -w yolov8s.pt --dynamic
python3 export_yoloV8.py -w yolov8n.pt --dynamic
cp yolov8s.onnx labels.txt /home/DeepStream-Yolo
cp yolov8n.onnx labels.txt /home/DeepStream-Yolo
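
As an optional sanity check (a sketch only; the 640x640 input size, the CPU provider, and the dummy batch are assumptions, not part of the original steps), the exported ONNX can be loaded with onnxruntime, which was installed above:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('yolov8s.onnx', providers=['CPUExecutionProvider'])
inp = sess.get_inputs()[0]
print(inp.name, inp.shape)   # the batch dimension is symbolic because of --dynamic
dummy = np.zeros((1, 3, 640, 640), dtype=np.float32)
for out, meta in zip(sess.run(None, {inp.name: dummy}), sess.get_outputs()):
    print(meta.name, out.shape)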

6. Remove the deepstream_yolo container
$ docker container rm deepstream_yolo

7. Re-enter the Docker container
docker_attach.sh
xhost +
docker start deepstream_yolo
docker attach deepstream_yolo

8. Convert the model to ONNX format
yolov8n.py
from ultralytics import YOLO

# Load a model
model = YOLO("yolov8n.yaml")  # build a new model from scratch
model = YOLO("yolov8n.pt")  # load a pretrained model (recommended for training)

# Use the model
model.train(data="coco128.yaml", epochs=3)  # train the model
metrics = model.val()  # evaluate model performance on the validation set
results = model("https://ultralytics.com/images/bus.jpg")  # predict on an image
path = model.export(format="onnx")  # export the model to ONNX format

Running python3 yolov8n.py produces the following error:
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
Fix:
$ sudo systemctl stop docker
Get the container id:
$ docker inspect deepstream_yolo | grep Id
"Id": "???????"
Edit the container's ShmSize:
$ sudo vi /var/lib/docker/containers/your_container_id/hostconfig.json
"ShmSize":8589934592
$ sudo systemctl restart docker
$ ./docker_attach.sh
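
The same edit can also be scripted; a minimal sketch, assuming the docker daemon is stopped, the script runs as root, and the container id has been filled in:

import json

cfg_path = '/var/lib/docker/containers/your_container_id/hostconfig.json'
with open(cfg_path) as f:
    cfg = json.load(f)
cfg['ShmSize'] = 8 * 1024 ** 3   # 8 GiB, the same value as above
with open(cfg_path, 'w') as f:
    json.dump(cfg, f)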

9. Test the ONNX model in DeepStream (each pair of lines below shows alternative values to choose from)
# cd /home/DeepStream-Yolo

# vi config_infer_primary_yoloV8.txt
onnx-file=yolov8s.onnx
onnx-file=yolov8n.onnx

# vi deepstream_app_config.txt
uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4
uri=rtsp://root:A1234567@192.168.0.107:554/live1s1.sdp
live-source=0
live-source=1
config-file=config_infer_primary.txt
config-file=config_infer_primary_yoloV8.txt
file-loop=0
file-loop=1

# deepstream-app -c deepstream_app_config.txt

10. Prepare your own image data: convert PASCAL VOC format (the XML files produced by LabelImg) to YOLO txt labels
prepare_detect.py
import cv2
import os
import random
import re
import xml.etree.ElementTree as ET

import numpy as np

LIGHT_CLASSES_LIST = [
    'forward_right',
    'others',
    'red',
    'red_left',
    'yellow',
    ]
        
def save_false_positives(img_org, iName, xName, tag, classIdx, 
        clip_x0, clip_y0, clip_x1, clip_y1):
    img_new = img_org[clip_y0:clip_y1, clip_x0:clip_x1]
    fPath, fName = os.path.split(iName)
    fName, fExt = os.path.splitext(fName)
    fName = fName + tag + fExt
    rndPaths = ['train', 'val', 'test']
    rndPath = random.choices(rndPaths, weights=(8,1,1))[0]
    iName = os.path.join('/home/datasets/Light/images', rndPath, fName)
    cv2.imwrite(iName, img_new)
        
def convert_box(size, box):
    dw, dh = 1. / size[0], 1. / size[1]
    x, y, w, h = (box[0] + box[1]) / 2.0 - 1, (box[2] + box[3]) / 2.0 - 1, box[1] - box[0], box[3] - box[2]
    return x * dw, y * dh, w * dw, h * dh
          
def save_file(img_org, iName, xName, tag, classIdx, 
        p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
        img_w, img_h, xmin, ymin, xmax, ymax,
        clip_x0, clip_y0, clip_x1, clip_y1):
    img_new = img_org[clip_y0:clip_y1, clip_x0:clip_x1]
    fPath, fName = os.path.split(iName)
    fName, fExt = os.path.splitext(fName)
    fName = fName + tag + fExt
    rndPaths = ['train', 'val', 'test']
    rndPath = random.choices(rndPaths, weights=(8,1,1))[0]
    iName = os.path.join('/home/datasets/Light/images', rndPath, fName)
    cv2.imwrite(iName, img_new)
    
    w = clip_x1 - clip_x0
    h = clip_y1 - clip_y0
    xmin = xmin - clip_x0
    ymin = ymin - clip_y0
    xmax = xmax - clip_x0
    ymax = ymax - clip_y0
    bb = convert_box((w, h), (xmin, xmax, ymin, ymax))
    fPath, fName = os.path.split(xName)
    fName, fExt = os.path.splitext(fName)
    fName = fName + tag + '.txt'
    tName = os.path.join('/home/datasets/Light/labels', rndPath, fName)
    with open(tName, 'w') as f:
        f.write(" ".join([str(a) for a in (classIdx, *bb)]) + '\n')
        
def gen_img_yolo(iName, xName):
    tree = ET.parse(open(xName))
    root = tree.getroot()
    img_w = int(root.find('size').find('width').text)
    img_h = int(root.find('size').find('height').text)
    for idx, object in enumerate(root.findall('object')):
        name = object.find('name').text
        classIdx = LIGHT_CLASSES_LIST.index(name)
        #print(classIdx, name)
        bndbox = object.find('bndbox')
        p0x = int(bndbox.find('p0x').text)
        p0y = int(bndbox.find('p0y').text)
        p1x = int(bndbox.find('p1x').text)
        p1y = int(bndbox.find('p1y').text)
        p2x = int(bndbox.find('p2x').text)
        p2y = int(bndbox.find('p2y').text)
        p3x = int(bndbox.find('p3x').text)
        p3y = int(bndbox.find('p3y').text)
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        if xmin != p0x or xmin != p3x or ymin != p0y or ymin != p1y or \
                xmax != p1x or xmax != p2x or ymax != p2y or ymax != p3y:
            print('error:bndbox', xName)
            exit()
        if idx > 0:
            print('error:object', xName)
            exit()
    img_org = cv2.imread(iName)
    if img_org.shape[0] != img_h or img_org.shape[1] != img_w:
        print(img_org.shape, (img_h, img_w))
        exit()
    img = np.copy(img_org)

    clip_x0 = random.randrange(0, int(xmin*0.5))
    clip_y0 = random.randrange(0, int(ymin*0.5))
    clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.5), img_w+1)
    clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.5), img_h+1)
    save_file(img_org, iName, xName, '', classIdx, 
            p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
            img_w, img_h, xmin, ymin, xmax, ymax,
            clip_x0, clip_y0, clip_x1, clip_y1)
    ratio = (xmax - xmin) / img_w
    if ratio < 0.3:
        clip_x0 = random.randrange(int(xmin*0.3), int(xmin*0.8))
        clip_y0 = random.randrange(int(ymin*0.3), int(ymin*0.8))
        clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), int(xmax + (img_w-xmax)*0.7))
        clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.2), int(ymax + (img_h-ymax)*0.7))
        save_file(img_org, iName, xName, '_a', classIdx, 
                p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
                img_w, img_h, xmin, ymin, xmax, ymax,
                clip_x0, clip_y0, clip_x1, clip_y1)
        clip_x0 = random.randrange(int(xmin*0.5), int(xmin*0.9))
        clip_y0 = random.randrange(int(ymin*0.5), int(ymin*0.9))
        clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.1), int(xmax + (img_w-xmax)*0.5))
        clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.1), int(ymax + (img_h-ymax)*0.5))
        save_file(img_org, iName, xName, '_b', classIdx, 
                p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
                img_w, img_h, xmin, ymin, xmax, ymax,
                clip_x0, clip_y0, clip_x1, clip_y1)
        if xmin > (img_w - xmax):
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmin), int(xmin+(xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f0', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmin), int(xmin + (xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f1', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
        else:
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f2', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f3', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
    elif ratio < 0.7:
        clip_x0 = random.randrange(int(xmin*0.1), int(xmin*0.7))
        clip_y0 = random.randrange(int(ymin*0.1), int(ymin*0.7))
        clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.3), int(xmax + (img_w-xmax)*0.9))
        clip_y1 = random.randrange(int(ymax + (img_h-ymax)*0.3), int(ymax + (img_h-ymax)*0.9))
        save_file(img_org, iName, xName, '_c', classIdx, 
                p0x, p0y, p1x, p1y, p2x, p2y, p3x, p3y, 
                img_w, img_h, xmin, ymin, xmax, ymax,
                clip_x0, clip_y0, clip_x1, clip_y1)
        if xmin > (img_w - xmax):
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmin), int(xmin+(xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f4', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(0, int(xmin*0.8))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmin), int(xmin + (xmax-xmin)*0.8))
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f5', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
        else:
            if ymin > (img_h - ymax):
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(0, int(ymin*0.8))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymin), int(ymin+(ymax-ymin)*0.8))
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f6', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
            else:
                clip_x0 = random.randrange(int(xmin+(xmax-xmin)*0.2), int(xmax))
                clip_y0 = random.randrange(int(ymin+(ymax-ymin)*0.2), int(ymax))
                clip_x1 = random.randrange(int(xmax + (img_w-xmax)*0.2), img_w)
                clip_y1 = random.randrange(int(ymax+(img_h-ymax)*0.2), img_h)
                root.remove(object)
                save_false_positives(img_org, iName, xName, '_f7', classIdx, 
                        clip_x0, clip_y0, clip_x1, clip_y1)
    elif ratio < 1.0:
        pass
    return

def recursive_folder(path):
    files = os.listdir(path)
    files.sort()
    for file in files:
        fullName = os.path.join(path, file)
        if os.path.isfile(fullName):
            fPath, fName = os.path.split(fullName)
            fName, fExt = os.path.splitext(fName)
            if fExt == '.jpg':
                xPath = fPath + '.xml'
                xName = fName + '.xml'
                xFName = os.path.join(xPath, xName)
                if os.path.isfile(xFName):
                    gen_img_yolo(fullName, xFName)
                else:
                    print(xFName)
        else:
            recursive_folder(fullName)

def main():
    recursive_folder('/home/Light')

if __name__ == '__main__':
    main()
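
For reference, convert_box() above turns absolute (xmin, xmax, ymin, ymax) pixel coordinates into the normalized YOLO (x_center, y_center, width, height) written to each label file. A small usage example (the numbers are made up):

from prepare_detect import convert_box   # or paste into the script itself

# 1920x1080 image with a box spanning x 100..300 and y 200..400
print(convert_box((1920, 1080), (100, 300, 200, 400)))
# -> approximately (0.1036, 0.2769, 0.1042, 0.1852)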

11. Train your own model
from ultralytics import YOLO

# Load a model
model = YOLO('yolov8n.pt')  # load a pretrained model (recommended for training)

# Train the model
model.train(data='VOC.yaml', epochs=100, imgsz=640)
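
After training finishes, a quick way to re-check the run and produce a plain ONNX file with the ultralytics API (a sketch; runs/detect/train/weights/best.pt is the library's usual default output location, adjust it to your actual run directory):

from ultralytics import YOLO

model = YOLO('runs/detect/train/weights/best.pt')
metrics = model.val()        # re-evaluate mAP on the validation split
model.export(format='onnx')  # plain ONNX; for DeepStream, use export_yoloV8.py as in step 5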

12. Inspect the input and output layers of an ONNX model
import onnx
model = onnx.load('yolov8n.onnx')
g_in = model.graph.input
g_out = model.graph.output
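
A short sketch extending the snippet above to print each tensor's name and dimensions (dim_param shows up for dynamic axes such as the batch dimension):

for t in list(g_in) + list(g_out):
    dims = [d.dim_param or d.dim_value for d in t.type.tensor_type.shape.dim]
    print(t.name, dims)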


Wednesday, June 21, 2023

How to use a TAO-generated YOLOv4 model in Python

Following the Nvidia TAO Computer Vision Sample Workflows, I produced a yolov4-tiny model. This model is not the same as one produced the usual way; a model produced the usual way can be used in Python by following tensorrt_demos.

The model exported by tao yolo_v4_tiny export (.etlt) can only be used with DeepStream, while the trt.engine produced by tao converter can be used neither with DeepStream nor in Python.

There is documentation on how to use TAO-generated models on a Triton server. In yolov3_postprocessor.py you can see that a TAO-generated YOLO model has already applied NMS to its outputs and places the results in:
BatchNMS (-1,1): number of detections
BatchNMS_1 (-1,200,4): coordinates
BatchNMS_2 (-1,200): confidences
BatchNMS_3 (-1,200): classes
The input handling also changes: an image read with cv2 needs no cvtColor and no division by 255.0, only a transpose from BHWC to BCHW:
img = img.transpose((2, 0, 1)).astype(np.float32)
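
As a minimal sketch of how these four outputs can be unpacked for one image (the function and variable names are illustrative only; the TrtYOLO class below does the same thing with extra bookkeeping):

import numpy as np

def parse_batched_nms(num_dets, boxes, scores, classes, img_w, img_h):
    # num_dets: (1,1), boxes: (1,200,4) normalized x1,y1,x2,y2, scores/classes: (1,200)
    k = int(num_dets[0, 0])
    locs = boxes[0, :k] * np.array([img_w, img_h, img_w, img_h])
    return locs.astype(int), scores[0, :k], classes[0, :k].astype(int)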

TAO runs inside Docker, which makes debugging difficult. The following command drops you straight into the Docker container, where you can run Python, check versions, inspect the environment, and so on:
docker run -it --rm --gpus all \
  -v "/mnt/Data/tao/yolo_v4_tiny_1.4.1":"/workspace/tao-experiments" \
  -v "/mnt/Data/TensorRT/tensorrt_demos":"/workspace/tensorrt_demos" \
  -v "/mnt/CT1000SSD/ImageData/Light":"/workspace/Light" \
  nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5 \
  bash

To convert the model to TensorRT, besides using
!tao converter -k $KEY \
                   -p Input,1x3x416x416,8x3x416x416,16x3x416x416 \
                   -e $USER_EXPERIMENT_DIR/export/trt.engine \
                   -t fp32 \
                   $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.etlt
you can also use
!tao-deploy yolo_v4_tiny gen_trt_engine \
  -m $USER_EXPERIMENT_DIR/export/yolov4_cspdarknet_tiny_epoch_$EPOCH.etlt \
  -e $SPECS_DIR/yolo_v4_tiny_retrain_kitti.txt \
  -k $KEY \
  --data_type fp32 \
  --batch_size 1 \
  --engine_file $USER_EXPERIMENT_DIR/export/yolov4_tao_deplay.trt
But to convert on a different platform, refer to TAO Converter: download and install it, then run the conversion:
./tao-converter_v4.0.0_trt8.5.1.7 \
  -k nvidia_tlt \
  -p Input,1x3x416x416,2x3x416x416,4x3x416x416 \
  -e yolo_v4_tiny_1.4.1/yolo_v4_tiny/export/yolov4_tao_converter_fp32.engine \
  -t fp32 \
  yolo_v4_tiny_1.4.1/yolo_v4_tiny/export/yolov4_cspdarknet_tiny_epoch_080.etlt

Based on tensorrt_demos, modify utils/yolo_with_plugins.py and rename it triton_yolo_with_plugins.py, as follows:
"""yolo_with_plugins.py
Implementation of TrtYOLO class with the yolo_layer plugins.
"""
from __future__ import print_function
import ctypes
import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda

try:
    ctypes.cdll.LoadLibrary('./plugins/libyolo_layer.so')
except OSError as e:
    raise SystemExit('ERROR: failed to load ./plugins/libyolo_layer.so.  '
                     'Did you forget to do a "make" in the "./plugins/" '
                     'subdirectory?') from e

def _preprocess_yolo(img, input_shape, letter_box=False):
    """Preprocess an image before TRT YOLO inferencing.
    # Args
        img: int8 numpy array of shape (img_h, img_w, 3)
        input_shape: a tuple of (H, W)
        letter_box: boolean, specifies whether to keep aspect ratio and
                    create a "letterboxed" image for inference
    # Returns
        preprocessed img: float32 numpy array of shape (3, H, W)
    """
    if letter_box:
        img_h, img_w, _ = img.shape
        new_h, new_w = input_shape[0], input_shape[1]
        offset_h, offset_w = 0, 0
        if (new_w / img_w) <= (new_h / img_h):
            new_h = int(img_h * new_w / img_w)
            offset_h = (input_shape[0] - new_h) // 2
        else:
            new_w = int(img_w * new_h / img_h)
            offset_w = (input_shape[1] - new_w) // 2
        resized = cv2.resize(img, (new_w, new_h))
        img = np.full((input_shape[0], input_shape[1], 3), 127, dtype=np.uint8)
        img[offset_h:(offset_h + new_h), offset_w:(offset_w + new_w), :] = resized
    else:
        img = cv2.resize(img, (input_shape[1], input_shape[0]))

    #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.transpose((2, 0, 1)).astype(np.float32)
    #img /= 255.0
    return img

class HostDeviceMem(object):
    """Simple helper data class that's a little nicer to use than a 2-tuple."""
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def get_input_shape(engine):
    """Get input shape of the TensorRT YOLO engine."""
    binding = engine[0]
    assert engine.binding_is_input(binding)
    binding_dims = engine.get_binding_shape(binding)
    if len(binding_dims) == 4:
        return tuple(binding_dims[2:])
    elif len(binding_dims) == 3:
        return tuple(binding_dims[1:])
    else:
        raise ValueError('bad dims of binding %s: %s' % (binding, str(binding_dims)))

def allocate_buffers(engine, context):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        binding_dims = engine.get_binding_shape(binding)
        binding_dtype = engine.get_tensor_dtype(binding)
        binding_format = engine.get_tensor_format_desc(binding)
        binding_loc = engine.get_tensor_location(binding)
        binding_mode = engine.get_tensor_mode(binding)
        binding_shape = engine.get_tensor_shape(binding)
        binding_shape_inference = engine.is_shape_inference_io(binding)
        print('binding_dims:{} {} {}'.format(binding, binding_dims, binding_dtype))
        print('  {}'.format(binding_format))
        print('  {} {} {} {}'.format(binding_loc, binding_mode, binding_shape, binding_shape_inference))
        size = trt.volume(binding_dims)
        if size < 0: size *= -1  # dynamic dims (-1) make trt.volume() negative
        print('  size:{}'.format(size))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            #binding_pro_shape = engine.get_profile_shape(0, binding)
            #print('  {}'.format(binding_pro_shape))
            if binding_dims[0] == -1:
                alloc_dims = np.copy(binding_dims)
                alloc_dims[0] = 1
                context.set_binding_shape(0, alloc_dims)
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """do_inference (for TensorRT 6.x or lower)
    This function is generalized for multiple inputs/outputs.
    Inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size,
                          bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def do_inference_v2(context, bindings, inputs, outputs, stream):
    """do_inference_v2 (for TensorRT 7.0+)
    This function is generalized for multiple inputs/outputs for full
    dimension networks.
    Inputs and outputs are expected to be lists of HostDeviceMem objects.
    """
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

class TrtYOLO(object):
    """TrtYOLO class encapsulates things needed to run TRT YOLO."""
    def _load_engine(self):
        TRTbin = 'yolo/%s.trt' % self.model
        TRTbin = self.model
        with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def __init__(self, model, category_num=80, letter_box=False, cuda_ctx=None):
        """Initialize TensorRT plugins, engine and conetxt."""
        self.model = model
        self.category_num = category_num
        self.letter_box = letter_box
        self.cuda_ctx = cuda_ctx
        if self.cuda_ctx:
            self.cuda_ctx.push()

        self.inference_fn = do_inference if trt.__version__[0] < '7' \
                                         else do_inference_v2
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        # add for errors
        # IPluginCreator not found in Plugin Registry
        # getPluginCreator could not find plugin: BatchedNMSDynamic_TRT version: 1
        # Serialization assertion plan->header.magicTag == rt::kPLAN_MAGIC_TAG failed
        trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
        self.engine = self._load_engine()

        self.input_shape = get_input_shape(self.engine)

        try:
            self.context = self.engine.create_execution_context()
            self.inputs, self.outputs, self.bindings, self.stream = \
                allocate_buffers(self.engine, self.context)
        except Exception as e:
            raise RuntimeError('fail to allocate CUDA resources') from e
        finally:
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def __del__(self):
        """Free CUDA memories."""
        del self.outputs
        del self.inputs
        del self.stream

    def detect(self, img, letter_box=None):
        """Detect objects in the input image."""
        letter_box = self.letter_box if letter_box is None else letter_box
        img_h, img_w, _ = img.shape
        img_resized = _preprocess_yolo(img, self.input_shape, letter_box)
        #print(img_resized.shape, img_resized.dtype)

        # Set host input to the image. The do_inference() function
        # will copy the input to the GPU before executing.
        self.inputs[0].host = np.ascontiguousarray(img_resized)
        if self.cuda_ctx:
            self.cuda_ctx.push()
        trt_outputs = self.inference_fn(
            context=self.context,
            bindings=self.bindings,
            inputs=self.inputs,
            outputs=self.outputs,
            stream=self.stream)
        if self.cuda_ctx:
            self.cuda_ctx.pop()

        y_pred = [i.reshape(1, -1,)[:1] for i in trt_outputs]
        keep_k, boxes, scores, cls_id = y_pred
        #print(keep_k.shape)
        #print(boxes.shape)
        keep_k[0,0] = 1
        locs = np.empty((0,4), dtype=np.uint)
        cids = np.empty((0,1), dtype=np.uint)
        confs = np.empty((0,1), dtype=np.float32)
        for idx, k in enumerate(keep_k.reshape(-1)):
            mul = np.array([img_w,img_h,img_w,img_h])
            loc = boxes[idx].reshape(-1, 4)[:k] * mul
            loc = loc.astype(np.uint)
            cid = cls_id[idx].reshape(-1, 1)[:k]
            cid = cid.astype(np.uint)
            conf = scores[idx].reshape(-1, 1)[:k]
            locs = np.concatenate((locs, loc), axis=0)
            cids = np.concatenate((cids, cid), axis=0)
            confs = np.concatenate((confs, conf), axis=0)
        #print(locs.shape, cids.shape, confs.shape)
        #print(locs, cids, confs)
        return locs, confs, cids

The following program uses the module above:
import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit # This is needed for initializing CUDA driver
import pycuda.driver as cuda
from utils.triton_yolo_with_plugins import TrtYOLO

#MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/yolov4_tao_convert.engine'
MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/yolov4_tao_deplay.trt'
#MODEL_PATH = '/workspace/tao-experiments/yolo_v4_tiny/export/trt.engine'
        
def main():
    trt_yolo = TrtYOLO(MODEL_PATH, 5, True)
    img_org = cv2.imread('bb.jpg')
    img = np.copy(img_org)
    print(img.shape, img.dtype)
    boxes, confs, clss = trt_yolo.detect(img, False)
    print(boxes.shape, confs.shape, clss.shape)
    print(boxes, confs, clss)
    for box, conf, clss in zip(boxes, confs, clss):
        x_min, y_min, x_max, y_max = box[0], box[1], box[2], box[3]
        cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (255, 255, 255), 2)
        print(box, conf, clss)
    cv2.imwrite('aa.jpg', img)
    print('aaa')

if __name__ == '__main__':
    main()

Debugging notes
Message: Serialization assertion plan->header.magicTag == rt::kPLAN_MAGIC_TAG failed
Fix: the TensorRT versions do not match; install the matching version or use Docker.
Message: IPluginCreator not found in Plugin Registry
Message: getPluginCreator could not find plugin: BatchedNMSDynamic_TRT version: 1
Fix: TensorRT OSS must be installed, and the following line must be added before _load_engine():
trt.init_libnvinfer_plugins(self.trt_logger, namespace="")

Wednesday, February 16, 2022

YOLOv4 to TensorRT

Reference: tensorrt_demos

$ cd TensorRT
$ git clone https://github.com/jkjung-avt/tensorrt_demos.git
$ cd tensorrt_demos

Set up the environment
$ cd yolo
$ ./install_pycuda.sh
$ pip3 install onnx==1.4.1
$ cd ../plugins
$ make

Download the YOLO weights and cfg files
$ cd ../yolo
$ ./download_yolo.sh
Convert the weights to a TensorRT engine
$ python3 yolo_to_onnx.py -m yolov4-tiny-416
$ python3 onnx_to_tensorrt.py -m yolov4-tiny-416
Test
$ cd ..
$ python trt_yolo.py --image doc/dog_trt_yolov4_416.jpg -m yolov4-tiny-416

Using INT8
$ cd yolo
$ ln -s yolov4-tiny-416.cfg yolov4-tiny-int8-416.cfg
$ ln -s yolov4-tiny-416.onnx yolov4-tiny-int8-416.onnx
$ mkdir calib_images
# and copy calibration images into calib_images
$ python3 onnx_to_tensorrt.py -v --int8 -m yolov4-tiny-int8-416
This produces the following error:
[03/02/2022-15:22:07] [TRT] [V] 001_convolutional + 001_convolutional_bn Set Tactic Name: sm70_xmma_fprop_implicit_gemm_f16f16_f16f16_f16_nhwckrsc_nhwc_tilesize128x256x32_stage1_warpsize2x4x1_g1_tensor8x8x4_t1r3s3 Tactic: 46202665595848747
[03/02/2022-15:22:07] [TRT] [V] Deleting timing cache: 2020 entries, 504 hits
[03/02/2022-15:22:07] [TRT] [E] 1: Unexpected exception
ERROR: failed to build the TensorRT engine!
$ vi onnx_to_tensorrt.py
Move the line "from calibrator import YOLOEntropyCalibrator" to the top of the file.

Wednesday, February 9, 2022

YOLOv4-tiny to TensorFlow and TFLite

Reference: tensorflow-yolov4-tflite

Only tensorflow==2.3.0rc0 works; do not use any other version, and do not use the GPU.

Modify core/config.py as needed:
__C.YOLO.CLASSES
__C.YOLO.ANCHORS_TINY

For the TensorFlow SavedModel format, loaded with tf.saved_model.load():
$ python save_model.py --weights /your_path_to/weights/yolov4-tiny-vehicle-r_final.weights \
--output ./checkpoints/yolov4-tiny-416 \
--input_size 416 --model yolov4 --tiny
$ python convert_tflite.py --weights ./checkpoints/yolov4-tiny-416-tflite \
--output ./checkpoints/yolov4-tiny-416.tflite

For the TFLite format, loaded with tf.lite.Interpreter():
$ python save_model.py --weights /your_path_to/weights/yolov4-tiny-vehicle-r_final.weights \
--output ./checkpoints/yolov4-tiny-416-tflite \
--input_size 416 --model yolov4 --tiny --framework tflite
$ python convert_tflite.py --weights ./checkpoints/yolov4-tiny-416-tflite \
--output ./checkpoints/yolov4-tiny-416-fp16.tflite \
--quantize_mode float16
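
A minimal sketch of loading the converted file with tf.lite.Interpreter; the 416x416 input size and the file name follow the commands above, and the zero image is only a placeholder:

import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='./checkpoints/yolov4-tiny-416-fp16.tflite')
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]
img = np.zeros((1, 416, 416, 3), dtype=np.float32)   # replace with a real, normalized image
interpreter.set_tensor(inp['index'], img)
interpreter.invoke()
for out in interpreter.get_output_details():
    print(out['name'], interpreter.get_tensor(out['index']).shape)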

Wednesday, October 28, 2020

Using YOLOv4 on DeepStream

Reference: Using YOLOv4 on NVIDIA DeepStream 5.0
Download yolov4.cfg, yolov4.weights, yolov4-tiny.cfg, and yolov4-tiny.weights from darknet.
You can modify width and height in the cfg files.

$ pip3 install torch
$ pip3 install torchvision
$ git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git
$ cd pytorch-YOLOv4
$ pip3 install onnxruntime
$ python3 demo_darknet2onnx.py yolov4.cfg yolov4.weights ./data/giraffe.jpg 1
This produces yolov4_1_3_416_416_static.onnx
/usr/src/tensorrt/bin/trtexec --onnx=yolov4_1_3_416_416_static.onnx \
--explicitBatch --saveEngine=yolov4_1_3_416_416_fp16.engine \
--workspace=4096 --fp16
This produces yolov4_1_3_416_416_fp16.engine
$ cd ..
$ git clone https://github.com/NVIDIA-AI-IOT/yolov4_deepstream
$ cd yolov4_deepstream
$ sudo cp -r deepstream_yolov4 /opt/nvidia/deepstream/deepstream-5.0/sources
$ cd /opt/nvidia/deepstream/deepstream-5.0/sources
$ sudo chown user.group deepstream_yolov4
$ cd deepstream_yolov4/nvdsinfer_custom_impl_Yolo/
$ make
$ cd ..
Copy yolov4_1_3_416_416_fp16.engine here.
Modify config_infer_primary_yoloV4.txt:
model-engine-file=yolov4-tiny_1_3_416_416_fp16.engine

Modify deepstream_app_config_yoloV4.txt:
# do not save to a file
[sink0]
enable=0
# display on screen
[sink1]
enable=1
type=2
sync=0
display-id=0
offset-x=0
offset-y=0
width=0
height=0
overlay-id=1
source-id=0
# do not set model-engine-file here
[primary-gie]
enable=1
#model-engine-file=yolov4_1_3_320_320_fp16.engine
# enable the tracker
[tracker]
enable=1

Run
$ deepstream-app -c deepstream_app_config_yoloV4.txt 




Friday, September 25, 2020

Understanding accuracy, precision, and recall

Predicted true/false versus actual true/false:
TP (True Positive): actually true, predicted true
FN (False Negative): actually true, predicted false
FP (False Positive): actually false, predicted true
TN (True Negative): actually false, predicted false

Accuracy = (TP+TN) / (TP+TN+FP+FN)
Accuracy: of all cases, the fraction that is predicted correctly.

Precision = TP / (TP+FP)
Precision: of the cases predicted true, how many are actually true.

Recall = TP / (TP+FN)
Recall: of the cases that are actually true, how many are predicted true.

High precision, low recall: most of what is detected is real, but many real cases are missed.
Low precision, high recall: most real cases are caught, but quite a few false detections come with them.
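
The same formulas as a small Python helper, with made-up counts just to illustrate:

def metrics(tp, fn, fp, tn):
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return accuracy, precision, recall

print(metrics(tp=8, fn=2, fp=1, tn=9))   # (0.85, 0.888..., 0.8)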

Tuesday, August 25, 2020

Jetson Nano darknet

Reference: Nvidia Jetson Nano usage notes

$ git clone https://github.com/pjreddie/darknet.git
The following error appears:
error: 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' undeclared

Use this instead:
$ git clone https://github.com/AlexeyAB/darknet
$ cd darknet
$ vi Makefile
GPU=1
CUDNN=1
CUDNN_HALF=1
OPENCV=1
LIBSO=1

$ make -j4

Reference: AlexeyAB/darknet
Download the cfg and weights from the YOLOv4 model zoo:
yolov4-tiny.weights (recommended)
yolov4-leaky-416.weights
yolov4-mish-416.weights


nano@nano-desktop:~/Data/darknet/darknet.AlexeyAB$ ./darknet detector demo ../cfg/coco.data ../cfg/yolov4-leaky-416.cfg ../weights/yolov4-leaky-416.weights 'nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)1280, height=(int)720,format=(string)NV12, framerate=(fraction)30/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink'

nano@nano-desktop:~/Data/darknet/darknet.AlexeyAB$ python3 darknet_video.py --weights ../weights/yolov4-leaky-416.weights --data_file ../cfg/coco.data --config_file ../cfg/yolov4-leaky-416.cfg --input 'nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)416, height=(int)416,format=(string)NV12, framerate=(fraction)30/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink' --ext_output

You have to press Enter to advance to the next frame.



Saturday, June 22, 2019

Installing darknet on the Nvidia Jetson AGX Xavier

nvidia@jetson-0423418048807:~/XavierSSD$ export PATH=${PATH}:/usr/local/cuda/bin
nvidia@jetson-0423418048807:~/XavierSSD$ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
nvidia@jetson-0423418048807:~/XavierSSD$ git clone https://github.com/pjreddie/darknet
nvidia@jetson-0423418048807:~/XavierSSD$ cd darknet/
nvidia@jetson-0423418048807:~/XavierSSD/darknet$ vi Makefile   (modify the following three lines)
GPU=1
CUDNN=1
OPENCV=1

nvidia@jetson-0423418048807:~/XavierSSD$ make


Saturday, October 6, 2018

build darknet yolo

git clone https://github.com/AlexeyAB/darknet.git

Download CUDA Toolkit 9.1:
https://developer.nvidia.com/cuda-toolkit-archive
If the installation fails, refer to these steps:
https://yingrenn.blogspot.com/2018/07/cuda.html

Download cuDNN; choose cuDNN v7.0 for CUDA 9.1 for the correct Windows version:
https://developer.nvidia.com/rdp/cudnn-archive

Open D:\Tensorflow\Yolo\darknet\build\darknet\darknet.sln with VS2015.
Switch the platform from Win32 to x64.
Project/darknet properties/
Set Configuration Properties/"CUDA C/C++"/CUDA Toolkit Custom Dir:
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.1
Set Additional Include Directories:
D:\Tensorflow\Yolo\opencv\build\include
D:\CUDNN\cudnn-9.1-windows7-x64-v7\cuda\include
Set Additional Library Directories:
D:\Tensorflow\Yolo\opencv\build\x64\vc14\lib
D:\CUDNN\cudnn-9.1-windows7-x64-v7\cuda\lib\x64

Monday, July 9, 2018

CUDA installation failure

A failed CUDA installation is usually caused by the Visual Studio Integration step failing, so doing a custom installation and skipping Visual Studio Integration lets the install succeed.
Choose exe (local) as the Installer Type.

Visual Studio Integration can then be installed manually as follows:
1. To be able to compile CUDA programs:
Note the path used during the CUDA installation and copy out the CUDAVisualStudioIntegration folder. Copy all files under
D:\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions
to
C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V140\BuildCustomizations
2. To let Visual Studio create new CUDA projects:
Copy the directory
D:\CUDAVisualStudioIntegration\extras\visual_studio_integration\CudaProjectVsWizards
to
C:\Program Files (x86)\Microsoft Visual Studio 14.0\Common7\IDE\Extensions
3. Install
D:\CUDAVisualStudioIntegration\NVIDIA_Nsight_Visual_Studio_Edition_Win64_5.4.0.17229.msi

Tuesday, July 3, 2018

Yolo

Directory data/img

File data/obj.data
classes= 2
train  = data/train.txt
valid  = data/train.txt
names = data/obj.names (relative to the executable's directory)
backup = backup/

File data/obj.names
air
bird

File data/train.txt
data/img/air1.jpg
data/img/air2.jpg
data/img/air3.jpg
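
train.txt can be generated with a few lines of Python (a sketch assuming the images live in data/img):

import glob

with open('data/train.txt', 'w') as f:
    for path in sorted(glob.glob('data/img/*.jpg')):
        f.write(path + '\n')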

File yolo-obj.cfg
(for testing)
batch=1
subdivisions=1
(for training)
batch=64
subdivisions=1 (adjust to the amount of GPU memory; with little memory use 64)
In every [yolo] layer, modify
classes =
In the [convolutional] layer just before each [yolo] layer, modify
filters = (classes + 5) * 3   (e.g. for classes=2, filters = 21)

Annotation
yolo_mark.exe data/img data/train.txt data/obj.names

Training
darknet.exe detector train data/obj.data yolo-obj.cfg darknet19_448.conv.23
The backup entry in obj.data specifies where the output weights are stored.
darknet19_448.conv.23: this is really just a weights file; to resume an interrupted training run, replace it with the most recently produced weights.
-dont_show: do not display the Loss window.

Evaluate the training results (IoU, mAP)
darknet.exe detector map data/obj.data yolo-obj.cfg backup\yolo-obj_7000.weights

COCO Yolo v3(4GB GPU): yolov3.cfg, yolov3.weights
COCO Yolo v3 tiny(1GB GPU): yolov3-tiny.cfg, yolov3-tiny.weights
COCO Yolo v2(4GB GPU): yolov2.cfg, yolov2.weights
VOC Yolo v2(4GB GPU): yolo-voc.cfg, yolo-voc.weights
COCO Yolo v2 tiny(1GB GPU): yolov2-tiny.cfg, yolov2-tiny.weights
VOC Yolo v2 tiny(1GB GPU): yolov2-tiny-voc.cfg, yolov2-tiny-voc.weights
The GPU memory figures above seem to be the requirements for training; detection and classification do not appear to need as much.

darknet.exe parameters
-i <index>: select the GPU; use nvidia-smi.exe to look up the index
-nogpu: do not use the GPU
-thresh <val>: detection threshold, default 0.25
-c <num>: OpenCV camera index, default 0
-ext_output: output object coordinates
detector test: still images
detector demo: video
detector train: training
detector map: evaluate training results
classifier predict: classification

./darknet detect cfg/yolov3.cfg yolov3.weights data/dog.jpg
./darknet detector test cfg/coco.data cfg/yolov3.cfg yolov3.weights data/dog.jpg
The two commands above are equivalent.

Use the following command to obtain yolov3-tiny.conv.15:
darknet.exe partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15

How to improve object detection
Before training:
Set random=1 in the .cfg file.
Increase width and height in the .cfg file (they must be multiples of 32).
Run the following command to recompute the anchors and update anchors in the .cfg file:
darknet.exe detector calc_anchors voc.data -num_of_clusters 9 -width 416 -height 416
Label the objects in the images carefully: every object must be labeled, and labeled correctly.
Each class should ideally have more than 2000 images, covering different sizes, angles, lighting, backgrounds, and so on.
Objects that should not be detected should also appear in the images, but must not be labeled.

How images are mapped to their label files during training
darknet.c
int main(int argc, char **argv)
>run_detector(argc, argv);
detector.c
void run_detector(int argc, char **argv)
>train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show);
void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int dont_show)
> pthread_t load_thread = load_data(args);
data.c
pthread_t load_data(load_args args)
>if(pthread_create(&thread, 0, load_threads, ptr)) error("Thread creation failed");
void *load_threads(void *ptr)
>threads[i] = load_data_in_thread(args);
if(pthread_create(&thread, 0, load_thread, ptr)) error("Thread creation failed");
void *load_thread(void *ptr)
>*a.d = load_data_detection(a.n, a.paths, a.m, a.w, a.h, a.c, a.num_boxes, a.classes, a.flip, a.jitter, a.hue, a.saturation, a.exposure, a.small_object);
data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, float jitter, float hue, float saturation, float exposure, int small_object)
>fill_truth_detection(filename, boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy, small_object, w, h);
void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy, int small_object, int net_w, int net_h)
>replace_image_to_label(path, labelpath);
utils.c
void replace_image_to_label(char *input_path, char *output_path)

Drawing detected objects on the image
image.c
void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)

network.c
Run an image through the network:
float *network_predict(network net, float *input)
Get the detections from the network:
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter)