# hailo-inference/inference.py

import os
import json
import time

import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.image import combined_non_max_suppression
from detection_tools.utils.visualization_utils import visualize_boxes_and_labels_on_image_array
from hailo_platform import (HEF, PcieDevice, HailoStreamInterface, InferVStreams, ConfigureParams,
                            InputVStreamParams, OutputVStreamParams, InputVStreams, OutputVStreams,
                            FormatType)

# Preprocess the dataset to the model's input size:
#   yolov5   640x640
#   resnet18 320x320
def preproc(image, output_height=640, output_width=640, resize_side=640):
    '''
    Aspect-preserving resize so the image height matches resize_side,
    followed by a central crop to output_width x output_height.
    Note: resize_side must be at least output_height (and the resized width
    at least output_width) or the crop will fall outside the image.
    '''
    new_width = int(image.width / image.height * resize_side)
    new_height = resize_side
    # Center the crop horizontally; the vertical offset stays at 0.
    x, y = (new_width - output_width) / 2, 0
    area = (x, y, x + output_width, y + output_height)
    cropped_img = image.resize((new_width, new_height)).crop(area)
    return cropped_img

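# Example (a sketch; 'sample.jpg' is a hypothetical file): a 1920x1080 frame
# is resized to 1137x640, then center-cropped to 640x640:
#
#   img = Image.open('sample.jpg')
#   assert preproc(img).size == (640, 640)
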
# Collect images from the data directory.
def dataset_read(hef):
    images_path = './minimal_data'
    names = []
    images_list = [img_name for img_name in os.listdir(images_path)
                   if os.path.splitext(img_name)[1] == '.jpg']
    # Take the input dimensions from the HEF's input vstream info.
    input_vstream_info = hef.get_input_vstream_infos()[0]
    image_height, image_width, channels = input_vstream_info.shape
    # Only the first image is used; to load them all, allocate
    # (len(images_list), ...) instead and drop the break below.
    dataset = np.zeros((1, image_height, image_width, channels), dtype=np.float32)
    for idx, img_name in enumerate(images_list):
        img = Image.open(os.path.join(images_path, img_name))
        img_preproc = preproc(img)
        dataset[idx, :, :, :] = np.array(img_preproc)
        names.append(img_name)
        break
    return dataset, names

# Generate a random dataset (useful for throughput tests without real data).
def dataset_random(image_height, image_width, channels):
    num_of_images = 10
    low, high = 2, 20
    dataset = np.random.randint(low, high, (num_of_images, image_height,
                                            image_width, channels)).astype(np.float32)
    return dataset

def init_hailo(model_name='yolov5m'):
    target = PcieDevice()
    hef_path = f'hef/{model_name}.hef'
    hef = HEF(hef_path)
    # Configure network groups.
    configure_params = ConfigureParams.create_from_hef(hef=hef, interface=HailoStreamInterface.PCIe)
    network_groups = target.configure(hef, configure_params)
    network_group = network_groups[0]
    return hef, network_group

'''
The target can be used as a context manager ("with" statement) to ensure it is
released when done. That is avoided here for the sake of simplicity.
'''
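# A sketch of that context-manager form (same HailoRT calls as in init_hailo
# above; not used in this script):
#
#   with PcieDevice() as target:
#       hef = HEF('hef/yolov5m.hef')
#       params = ConfigureParams.create_from_hef(hef=hef, interface=HailoStreamInterface.PCIe)
#       network_group = target.configure(hef, params)[0]
#       # ... run inference while the device is held ...
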
def run_hailo(dataset, names, hef, network_group):
    # Create input and output virtual stream params.
    # The `quantized` argument signifies whether the incoming data is already
    # quantized; data is quantized by HailoRT if and only if quantized == False.
    input_vstreams_params = InputVStreamParams.make(network_group,
                                                    quantized=False,
                                                    format_type=FormatType.FLOAT32)
    output_vstreams_params = OutputVStreamParams.make(network_group,
                                                      quantized=False,
                                                      format_type=FormatType.FLOAT32)
    # Quantized alternative:
    # output_vstreams_params = OutputVStreamParams.make(network_group,
    #                                                   quantized=True,
    #                                                   format_type=FormatType.INT8)
    input_vstream_info = hef.get_input_vstream_infos()[0]
    output_vstream_infos = hef.get_output_vstream_infos()
    input_data = {input_vstream_info.name: dataset}
    network_group_params = network_group.create_params()
    with InferVStreams(network_group, input_vstreams_params, output_vstreams_params) as infer_pipeline:
        with network_group.activate(network_group_params):
            infer_results = infer_pipeline.infer(input_data)
            # One tensor per output branch (stride 32/16/8 for yolov5),
            # consumed by postprocessing below.
            out = [infer_results[i.name] for i in output_vstream_infos]
    return out, names, dataset

# Feature-map size maps to stride on a 640 input, e.g. 20 x 20 -> stride 32.
def yolo_postprocess_numpy(net_out, anchors_for_stride, stride):
    """
    net_out has shape [N, H, W, num_anchors * num_pred], e.g. [N, 20, 20, 27],
    [N, 40, 40, 27] or [N, 80, 80, 27] for this 4-class model.
    First we reshape it to be as in gluon and then follow gluon's shapes.
    output_ind = 0 for stride 32, 1 for stride 16, 2 for stride 8.
    """
    num_classes = 4  # custom 4-class model; the original COCO model uses 80
    BS = net_out.shape[0]  # batch size
    H = net_out.shape[1]
    W = net_out.shape[2]
    num_anchors = anchors_for_stride.size // 2  # 2 params for each anchor
    num_pred = 1 + 4 + num_classes  # 2 box centers, 2 box scales, 1 objness, num_classes class scores
    # Precompute a grid of cell offsets larger than any feature map; it is
    # sliced down to [H, W] below.
    alloc_size = (128, 128)
    grid_x = np.arange(alloc_size[1])
    grid_y = np.arange(alloc_size[0])
    grid_x, grid_y = np.meshgrid(grid_x, grid_y)  # dims [128, 128], [128, 128]
    offsets = np.concatenate((grid_x[:, :, np.newaxis], grid_y[:, :, np.newaxis]), axis=-1)  # dim [128, 128, 2]
    offsets = np.expand_dims(np.expand_dims(offsets, 0), 0)  # dim [1, 1, 128, 128, 2]
    pred = net_out.transpose((0, 3, 1, 2))  # now dims are [N, C, H, W] as in gluon
    pred = np.reshape(pred, (BS, num_anchors * num_pred, -1))  # dim [N, C, HxW]
    # Put the channel dim last so it can be split into anchors and predictions:
    pred = pred.transpose((0, 2, 1))  # dim [N, HxW, C]
    pred = np.reshape(pred, (BS, -1, num_anchors, num_pred))  # dim [N, HxW, 3, num_pred]
    raw_box_centers = pred[:, :, :, 0:2]  # dim [N, HxW, 3, 2]
    raw_box_scales = pred[:, :, :, 2:4]  # dim [N, HxW, 3, 2]
    objness = pred[:, :, :, 4:5]  # dim [N, HxW, 3, 1]
    class_pred = pred[:, :, :, 5:]  # dim [N, HxW, 3, num_classes]
    offsets = offsets[:, :, :H, :W, :]  # dim [1, 1, H, W, 2]
    offsets = np.reshape(offsets, (1, -1, 1, 2))  # dim [1, HxW, 1, 2]
    box_centers, box_scales, confidence, class_pred = _yolo5_decode(
        raw_box_centers=raw_box_centers,
        raw_box_scales=raw_box_scales,
        objness=objness,
        class_pred=class_pred,
        anchors_for_stride=anchors_for_stride,
        offsets=offsets,
        stride=stride)
    class_score = class_pred * confidence  # dim [N, HxW, 3, num_classes]
    wh = box_scales / 2.0
    # dim [N, HxW, 3, 4]; scheme: xmin, ymin, xmax, ymax
    bbox = np.concatenate((box_centers - wh, box_centers + wh), axis=-1)
    detection_boxes = np.reshape(bbox, (BS, -1, 1, 4))  # dim [N, num_detections, 1, 4]
    detection_scores = np.reshape(class_score, (BS, -1, num_classes))  # dim [N, num_detections, num_classes]
    # Switch the scheme from xmin, ymin, xmax, ymax to ymin, xmin, ymax, xmax:
    detection_boxes = detection_boxes[:, :, :, (1, 0, 3, 2)].astype(np.float32)
    return detection_boxes, detection_scores.astype(np.float32)

def _yolo5_decode(raw_box_centers, raw_box_scales, objness, class_pred, anchors_for_stride, offsets, stride):
    # yolov5 decode: box centers and scales land in input-image pixels.
    box_centers = (raw_box_centers * 2. - 0.5 + offsets) * stride
    box_scales = (raw_box_scales * 2) ** 2 * anchors_for_stride  # dim [N, HxW, 3, 2]
    return box_centers, box_scales, objness, class_pred

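# A small numeric sanity check for _yolo5_decode (an illustrative sketch, not
# part of the pipeline): a cell at grid offset (10, 10) on the stride-32 map
# with raw center 0.5 decodes to the cell center (0.5*2 - 0.5 + 10)*32 = 336,
# and raw scale 0.5 with anchor (116, 90) decodes to (0.5*2)**2 * (116, 90).
def _decode_sanity_check():
    anchors = np.array([116, 90], dtype=np.float32).reshape((1, 1, 1, 2))
    centers, scales, _, _ = _yolo5_decode(
        raw_box_centers=np.full((1, 1, 1, 2), 0.5),
        raw_box_scales=np.full((1, 1, 1, 2), 0.5),
        objness=np.ones((1, 1, 1, 1)),
        class_pred=np.ones((1, 1, 1, 4)),
        anchors_for_stride=anchors,
        offsets=np.full((1, 1, 1, 2), 10.0),
        stride=32)
    assert np.allclose(centers, 336.0) and np.allclose(scales, anchors)
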
def postprocessing(endnodes):
    """
    endnodes is a list of 3 output tensors:
        endnodes[0] - stride 32 of the input
        endnodes[1] - stride 16 of the input
        endnodes[2] - stride 8 of the input
    Returns a dict of NMS results. The number of candidate detections fed to
    the NMS is:
        num_candidates = H*W*((1/32^2) + (1/16^2) + (1/8^2))*num_anchors*num_classes
    with H, W the input dims. For H=W=608, num_anchors=3, num_classes=80
    (coco 2017) that gives 1819440, i.e. ~1.8M candidates per image.
    """
    H_input = 640
    W_input = 640
    # Anchor pairs per stride, smallest stride first (matching strides below).
    anchors_list = [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]]
    strides = [8, 16, 32]
    for output_ind, output_branch in enumerate(endnodes):  # iterate over the output layers
        stride = strides[::-1][output_ind]
        anchors_for_stride = np.array(anchors_list[::-1][output_ind])
        anchors_for_stride = np.reshape(anchors_for_stride, (1, 1, -1, 2))  # dim [1, 1, 3, 2]
        # detection_boxes is a [BS, num_detections, 1, 4] tensor,
        # detection_scores is a [BS, num_detections, num_classes] tensor.
        detection_boxes, detection_scores = yolo_postprocess_numpy(output_branch,
                                                                   anchors_for_stride,
                                                                   stride)
        detection_boxes = detection_boxes / H_input  # normalize coordinates to [0, 1] (square input)
        # Concatenate the detections from the different output layers:
        if output_ind == 0:
            detection_boxes_full = detection_boxes
            detection_scores_full = detection_scores
        else:
            detection_boxes_full = tf.concat([detection_boxes_full, detection_boxes], axis=1)
            detection_scores_full = tf.concat([detection_scores_full, detection_scores], axis=1)
    score_threshold = 0.5
    nms_iou_threshold = 0.5
    labels_offset = 1
    (nmsed_boxes, nmsed_scores, nmsed_classes, num_detections) = \
        combined_non_max_suppression(boxes=detection_boxes_full,
                                     scores=detection_scores_full,
                                     score_threshold=score_threshold,
                                     iou_threshold=nms_iou_threshold,
                                     max_output_size_per_class=100,
                                     max_total_size=100)

    def translate_coco_2017_to_2014(nmsed_classes):
        return np.vectorize(COCO_2017_TO_2014_TRANSLATION.get)(nmsed_classes).astype(np.int32)

    # Add the labels offset to the class predictions and cast to integer.
    nmsed_classes = tf.cast(tf.add(nmsed_classes, labels_offset), tf.int16)
    nmsed_classes = translate_coco_2017_to_2014(nmsed_classes)
    return {'detection_boxes': nmsed_boxes,
            'detection_scores': nmsed_scores,
            'detection_classes': nmsed_classes,
            'num_detections': num_detections}

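# A minimal smoke test for postprocessing (a sketch; the 27 channels assume
# 3 anchors x (5 + 4 classes), matching num_classes in yolo_postprocess_numpy):
#
#   endnodes = [np.random.rand(1, s, s, 27).astype(np.float32)
#               for s in (20, 40, 80)]  # strides 32, 16, 8 on a 640 input
#   results = postprocessing(endnodes)
#   print(results['num_detections'])
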
def _get_face_detection_visualization_data(logits):
    boxes = logits['detection_boxes'][0]
    face_landmarks = logits.get('face_landmarks')
    if face_landmarks is not None:
        face_landmarks = face_landmarks[0].reshape((-1, 5, 2))[:, :, (1, 0)]
    boxes = boxes[:, (1, 0, 3, 2)]
    # Empty name to prevent clobbering the visualization.
    labels = {1: {'id': 1, 'name': ''}}
    return boxes, labels, face_landmarks

def _get_coco_labels():
    with open(os.path.join(os.path.dirname(__file__), 'coco_names.json')) as f:
        coco_names = json.load(f)
    return {int(k): {'id': int(k), 'name': str(v)} for (k, v) in coco_names.items()}

def _get_labels(label_name):
    filename = os.path.join(os.path.dirname(__file__), label_name + '.json')
    with open(filename) as f:
        names = json.load(f)
    return {int(k): {'id': int(k), 'name': str(v)} for (k, v) in names.items()}

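# Label files are plain JSON maps from class id to display name, e.g. a
# hypothetical 4-class daria_names.json (illustrative names only):
#
#   {"1": "class_a", "2": "class_b", "3": "class_c", "4": "class_d"}
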
def process_yolo5():
    hef, network_group = init_hailo("yolov5m_22_2")
    dataset, names = dataset_read(hef)
    samples = 1000
    start_time = time.time()
    fps = 0
    while samples > 0:
        # Print the frame rate once per second.
        if start_time + 1 < time.time():
            print("fps: " + str(fps))
            start_time = time.time()
            fps = 0
        out, names, dataset = run_hailo(dataset, names, hef, network_group)
        logits = postprocessing(out)
        fps += 1
        samples -= 1
    # Visualize the detections from the last iteration.
    labels = _get_labels("daria_names")
    image = visualize_boxes_and_labels_on_image_array(
        dataset[0],
        logits['detection_boxes'].numpy()[0],
        logits['detection_classes'][0],
        logits['detection_scores'].numpy()[0],
        labels,
        use_normalized_coordinates=True,
        max_boxes_to_draw=100,
        min_score_thresh=.5,
        agnostic_mode=False,
        line_thickness=4)
    Image.fromarray(np.uint8(image)).save('/home/maintenance/test.png')

# Maps COCO 2017 class ids (80 contiguous) to COCO 2014 ids (91 with gaps).
COCO_2017_TO_2014_TRANSLATION = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10,
                                 11: 11, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19,
                                 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 27, 26: 28,
                                 27: 31, 28: 32, 29: 33, 30: 34, 31: 35, 32: 36, 33: 37, 34: 38,
                                 35: 39, 36: 40, 37: 41, 38: 42, 39: 43, 40: 44, 41: 46, 42: 47,
                                 43: 48, 44: 49, 45: 50, 46: 51, 47: 52, 48: 53, 49: 54, 50: 55,
                                 51: 56, 52: 57, 53: 58, 54: 59, 55: 60, 56: 61, 57: 62, 58: 63,
                                 59: 64, 60: 65, 61: 67, 62: 70, 63: 72, 64: 73, 65: 74, 66: 75,
                                 67: 76, 68: 77, 69: 78, 70: 79, 71: 80, 72: 81, 73: 82, 74: 84,
                                 75: 85, 76: 86, 77: 87, 78: 88, 79: 89, 80: 90}

if __name__ == "__main__":
    process_yolo5()