"""
- ImageMeta struct saves image metadata
- DataHandler takes care of loading the dataset and parsing the label information
- YoloProcessing takes care of image preprocessing and YOLO postprocessing
- HailoHandler connects to the Hailo device, pushes the desired network HEF file
  and runs the dataset on the Hailo chip
"""
314 lines
13 KiB
314 lines
13 KiB
import json
import os
import time
from PIL import Image
from detection_tools.utils.visualization_utils import \
from hailo_platform import (ConfigureParams, FormatType, HEF,
HailoStreamInterface, InferVStreams,
InputVStreamParams, OutputVStreamParams,
import numpy as np
import tensorflow as tf
from tensorflow.image import combined_non_max_suppression
# Collect images from data files
class ImageMeta:
    """Plain container for the dimensions of the images handled by this module.

    Attributes are stored exactly as given; no validation or conversion is done.
    """

    def __init__(self, image_height, image_width, channels):
        """Store the image height, width and channel count on the instance."""
        self.image_height = image_height
        self.image_width = image_width
        self.channels = channels
class DataHandler:
def __init__(self, path, image_meta):
    """Remember where the images live and which image geometry to use.

    Args:
        path: directory that ``load_data`` lists for image files.
        image_meta: object whose ``image_height`` attribute (and, presumably,
            width/channels) sizes the dataset buffer in ``load_data``.
    """
    self.images_path = path
    self.image_meta = image_meta
# Load every '.jpg' file found in self.images_path, pass it through preprocess_fn and
# store the results on the instance as self.dataset and self.names.
#
# preprocess_fn: callable applied to each loaded image; its result is converted with
#     np.array(...) and written into the dataset buffer at the image's index.
#
# NOTE(review): this copy of the method has lost its indentation and parts of two
# statements (marked below), so it does not parse as-is.
def load_data(self, preprocess_fn):
names = []
# Keep only directory entries whose extension is exactly '.jpg' (case-sensitive).
images_list = [img_name for img_name in os.listdir(self.images_path)
if os.path.splitext(os.path.join(self.images_path, img_name))[1] == '.jpg']
# NOTE(review): the np.zeros(...) call is cut off after the height argument; the rest of
# the shape tuple is missing. The visible leading dimension is 1 while the loop below
# writes to dataset[idx], so a second image would index out of bounds -- confirm against
# the complete file.
dataset = np.zeros((1, self.image_meta.image_height,
for idx, img_name in enumerate(images_list):
# NOTE(review): the right-hand side is truncated; presumably the image is opened from
# images_path/img_name -- TODO confirm.
img =, img_name))
img_preproc = preprocess_fn(img)
dataset[idx, :, :, :] = np.array(img_preproc)
self.dataset = dataset
# NOTE(review): nothing visible here appends to `names`, so self.names is stored empty.
self.names = names
def _get_coco_labels(self):
    """Load ``coco_names.json`` from this module's directory as a label map.

    Returns:
        dict mapping ``int(key)`` to ``{'id': int(key), 'name': str(value)}`` for
        every key/value pair of the JSON object.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file is not valid JSON or a key is not an integer.
    """
    labels_path = os.path.join(os.path.dirname(__file__), 'coco_names.json')
    # Context manager so the handle is closed even if parsing fails.
    with open(labels_path) as labels_file:
        raw_names = json.load(labels_file)
    return {int(k): {'id': int(k), 'name': str(v)} for k, v in raw_names.items()}
def _get_labels(self, label_name):
    """Load ``<label_name>.json`` from this module's directory as a label map.

    Args:
        label_name: file name without the ``.json`` extension.

    Returns:
        dict mapping ``int(key)`` to ``{'id': int(key), 'name': str(value)}`` for
        every key/value pair of the JSON object.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file is not valid JSON or a key is not an integer.
    """
    filename = os.path.join(os.path.dirname(__file__), label_name + '.json')
    # Context manager so the handle is closed even if parsing fails.
    with open(filename) as labels_file:
        raw_names = json.load(labels_file)
    return {int(k): {'id': int(k), 'name': str(v)} for k, v in raw_names.items()}
# Class-id remapping table. Keys are the contiguous ids 0..80; values lie in 0..90 with
# gaps (12, 26, 29, 30, 45, 66, 68, 69, 71 and 83 never appear as values).
# Used through COCO_17_14.get by translate_coco_2017_to_2014 in the postprocessing code.
# NOTE(review): the name suggests "COCO 2017 contiguous index -> COCO 2014 category id"
# -- TODO confirm.
COCO_17_14 = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9,
10: 10, 11: 11, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18,
18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 27,
26: 28, 27: 31, 28: 32, 29: 33, 30: 34, 31: 35, 32: 36, 33: 37,
34: 38, 35: 39, 36: 40, 37: 41, 38: 42, 39: 43, 40: 44, 41: 46,
42: 47, 43: 48, 44: 49, 45: 50, 46: 51, 47: 52, 48: 53, 49: 54,
50: 55, 51: 56, 52: 57, 53: 58, 54: 59, 55: 60, 56: 61, 57: 62,
58: 63, 59: 64, 60: 65, 61: 67, 62: 70, 63: 72, 64: 73, 65: 74,
66: 75, 67: 76, 68: 77, 69: 78, 70: 79, 71: 80, 72: 81, 73: 82,
74: 84, 75: 85, 76: 86, 77: 87, 78: 88, 79: 89, 80: 90}
class YoloProcessing:
def __init__(self, imageMeta, classes):
    """Remember the crop size used by ``preproc`` and the class count.

    Args:
        imageMeta: object exposing ``image_height`` and ``image_width``; they
            become ``self.output_height`` and ``self.output_width``.
        classes: stored unchanged as ``self.classes``.
    """
    self.output_height = imageMeta.image_height
    self.output_width = imageMeta.image_width
    self.classes = classes
def preproc(self, image, resize_side=256):
    """Resize ``image`` to a height of ``resize_side`` and crop it to the output size.

    The resize keeps the aspect ratio (the new width is derived from the
    original width/height ratio). The crop window is ``self.output_width`` by
    ``self.output_height``, centred horizontally and anchored at the top edge.

    Args:
        image: object with ``width``, ``height``, ``resize((w, h))`` and
            ``crop(box)`` (e.g. a PIL image).
        resize_side: target height in pixels after resizing.

    Returns:
        Whatever ``image.resize(...).crop(...)`` returns.
    """
    new_height = resize_side
    new_width = int(image.width / image.height * resize_side)

    # Horizontal centring only; the vertical offset is fixed at 0.
    left = (new_width - self.output_width) / 2
    top = 0
    area = (left, top, left + self.output_width, top + self.output_height)

    return image.resize((new_width, new_height)).crop(area)
# 20 x 20 -> 32
# stride = 32
def yolo_postprocess_numpy(self, net_out, anchors_for_stride, stride, num_classes=4):
    """Decode one raw YOLO output branch into boxes and per-class scores.

    Args:
        net_out: array of shape ``[N, H, W, num_anchors * (5 + num_classes)]``
            (e.g. ``[N, 19, 19, 255]`` for 3 anchors and 80 classes).
        anchors_for_stride: anchor sizes for this branch; ``size // 2`` is taken
            as the number of anchors, and the array must broadcast against
            ``[N, H*W, num_anchors, 2]`` (the caller passes ``[1, 1, 3, 2]``).
        stride: stride of this branch relative to the network input.
        num_classes: number of class scores per anchor. Defaults to 4, the
            value that used to be hard-coded here.

    Returns:
        Tuple ``(detection_boxes, detection_scores)`` of float32 arrays:
        boxes are ``[N, H*W*num_anchors, 1, 4]`` in ``ymin, xmin, ymax, xmax``
        order, scores are ``[N, H*W*num_anchors, num_classes]``.
    """
    batch_size, height, width = net_out.shape[:3]
    num_anchors = anchors_for_stride.size // 2  # 2 params for each anchor
    # 2 box centers, 2 box scales, 1 objness, num_classes class scores
    num_pred = 1 + 4 + num_classes

    # Per-cell (x, y) grid offsets, built for the actual feature-map size so that
    # maps larger than a fixed pre-allocated grid are handled as well.
    grid_x, grid_y = np.meshgrid(np.arange(width), np.arange(height))  # each [H, W]
    offsets = np.stack((grid_x, grid_y), axis=-1).reshape((1, -1, 1, 2))  # [1, HxW, 1, 2]

    # net_out is channels-last, so flattening (H, W) and splitting the channel axis
    # into (anchor, prediction) needs no transposition.
    pred = np.reshape(net_out, (batch_size, height * width, num_anchors, num_pred))

    raw_box_centers = pred[:, :, :, 0:2]  # dim [N, HxW, num_anchors, 2]
    raw_box_scales = pred[:, :, :, 2:4]   # dim [N, HxW, num_anchors, 2]
    objness = pred[:, :, :, 4:5]          # dim [N, HxW, num_anchors, 1]
    class_pred = pred[:, :, :, 5:]        # dim [N, HxW, num_anchors, num_classes]

    box_centers, box_scales, confidence, class_pred = self._yolo5_decode(
        raw_box_centers, raw_box_scales, objness, class_pred,
        anchors_for_stride, offsets, stride)

    class_score = class_pred * confidence  # dim [N, HxW, num_anchors, num_classes]
    half_size = box_scales / 2.0
    # dim [N, HxW, num_anchors, 4], scheme xmin, ymin, xmax, ymax
    bbox = np.concatenate((box_centers - half_size, box_centers + half_size), axis=-1)

    # Switch scheme from xmin, ymin, xmax, ymax to ymin, xmin, ymax, xmax.
    bbox = bbox[..., [1, 0, 3, 2]]

    detection_boxes = np.reshape(bbox, (batch_size, -1, 1, 4))
    detection_scores = np.reshape(class_score, (batch_size, -1, num_classes))
    return detection_boxes.astype(np.float32), detection_scores.astype(np.float32)
def _yolo5_decode(self, raw_box_centers, raw_box_scales, objness, class_pred, anchors_for_stride, offsets, stride):
    """Turn raw centre/scale predictions into box centres and sizes.

    Args:
        raw_box_centers: raw centre predictions, last axis of size 2.
        raw_box_scales: raw size predictions, last axis of size 2.
        objness: objectness values; returned unchanged.
        class_pred: class values; returned unchanged.
        anchors_for_stride: anchor sizes, broadcastable against ``raw_box_scales``.
        offsets: grid-cell offsets, broadcastable against ``raw_box_centers``.
        stride: multiplier applied to the offset-corrected centres.

    Returns:
        Tuple ``(box_centers, box_scales, objness, class_pred)``.
    """
    box_centers = (raw_box_centers * 2. - 0.5 + offsets) * stride
    box_scales = (raw_box_scales * 2) ** 2 * anchors_for_stride  # dim [N, HxW, 3, 2]
    return box_centers, box_scales, objness, class_pred
# Decode every output branch in `endnodes`, concatenate the per-branch boxes and scores,
# and return a dict with the keys 'detection_boxes', 'detection_scores',
# 'detection_classes' and 'num_detections'.
#
# NOTE(review): this copy has lost its indentation, the quotes around the docstring text
# below, and parts of two calls (marked below), so it does not parse as-is.
def postprocessing(self, endnodes):
endnodes is a list of 3 output tensors:
endnodes[0] - stride 32 of input
endnodes[1] - stride 16 of input
endnodes[2] - stride 8 of input
a tensor with dims: [BS, Total_num_of_detections_in_image, 6]
total_num_of_detections_in_image = H*W*((1/32^2) + (1/16^2) + (1/8^2))*num_anchors*num_classes,
with H, W as input dims.
If H=W=608, num_anchors=3, num_classes=80 (coco 2017), we get:
total_num_of_detections = 1819440 ~ 1.8M detections per image for the NMS
# Network input size is fixed here; box coordinates are later divided by H_input only.
H_input = 640
W_input = 640
# One row of (w, h) anchor pairs per stride, listed from stride 8 to stride 32.
anchors_list = [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]]
# TODO make prettier
strides = [8, 16, 32]
for output_ind, output_branch in enumerate(endnodes): # iterating over the output layers:
# Both lists are reversed, so endnodes[0] is paired with stride 32 and the largest anchors.
stride = strides[::-1][output_ind]
anchors_for_stride = np.array(anchors_list[::-1][output_ind])
anchors_for_stride = np.reshape(anchors_for_stride, (1, 1, -1, 2)) # dim [1, 1, 3, 2]
# NOTE(review): the call below is cut off after its first argument; the remaining
# arguments are missing from this copy.
detection_boxes, detection_scores = self.yolo_postprocess_numpy(output_branch,
# detection_boxes is a [BS, num_detections, 1, 4] tensor, detection_scores is a
# [BS, num_detections, num_classes] tensor
detection_boxes = detection_boxes / H_input # normalization of box coordinates to 1
# BS, H, W, num_anchors and num_detections are only referenced by the commented-out
# set_shape calls below; num_detections is reassigned by the NMS unpacking further down.
BS = endnodes[0].shape[0]
H = H_input // stride
W = W_input // stride
num_anchors = anchors_for_stride.size // 2
num_detections = H * W * num_anchors
# detection_boxes.set_shape((BS, num_detections, 1, 4))
# detection_scores.set_shape((BS, num_detections, num_classes))
# concatenating the detections from the different output layers:
# NOTE(review): an `else:` between the two pairs of assignments below appears to be
# missing from this copy -- TODO confirm against the complete file.
if output_ind == 0:
detection_boxes_full = detection_boxes
detection_scores_full = detection_scores
detection_boxes_full = tf.concat([detection_boxes_full, detection_boxes], axis=1)
detection_scores_full = tf.concat([detection_scores_full, detection_scores], axis=1)
score_threshold = 0.5
nms_iou_threshold = 0.5
labels_offset = 1
# NOTE(review): the right-hand side of this assignment is missing; presumably a call to
# the imported combined_non_max_suppression using the two thresholds above -- TODO confirm.
(nmsed_boxes, nmsed_scores, nmsed_classes, num_detections) = \
# adding offset to the class prediction and cast to integer
# Helper: look every class id up in COCO_17_14 and return the result as int32.
def translate_coco_2017_to_2014(nmsed_classes):
return np.vectorize(COCO_17_14.get)(nmsed_classes).astype(np.int32)
nmsed_classes = tf.cast(tf.add(nmsed_classes, labels_offset), tf.int16)
nmsed_classes = translate_coco_2017_to_2014(nmsed_classes)
return {'detection_boxes': nmsed_boxes,
'detection_scores': nmsed_scores,
'detection_classes': nmsed_classes,
'num_detections': num_detections}
# Wraps a Hailo device: the constructor loads a HEF file and configures the first network
# group; run_hailo runs a dataset through it and returns the output tensors.
#
# NOTE(review): this copy has lost its indentation and parts of four statements (marked
# below), so it does not parse as-is.
class HailoHandler:
# hef_path: path of the HEF file handed to HEF(...).
def __init__(self, hef_path='hef/yolov5m.hef'):
# NOTE(review): PcieDevice is not among the visible imports; the hailo_platform import
# list at the top of the file is itself truncated. `target` is kept only as a local.
target = PcieDevice()
self.hef = HEF(hef_path)
# Configure network groups
# NOTE(review): call truncated after the `hef=` argument.
configure_params = ConfigureParams.create_from_hef(hef=self.hef,
network_groups = target.configure(self.hef, configure_params)
# Only the first configured network group is used.
self.network_group = network_groups[0]
# NOTE(review): call truncated after the first argument; presumably it mirrors the
# output-params call on the next line -- TODO confirm.
self.input_vstreams_params = InputVStreamParams.make(self.network_group,
self.output_vstreams_params = OutputVStreamParams.make(self.network_group, quantized=False, format_type=FormatType.FLOAT32)
# Only the first input stream's info is kept; all output stream infos are kept.
self.input_vstream_info = self.hef.get_input_vstream_infos()[0]
self.output_vstream_infos = self.hef.get_output_vstream_infos()
self.network_group_params = self.network_group.create_params()
# Run `dataset` through the configured network group and return a list with one entry per
# output stream info.
def run_hailo(self, dataset):
# NOTE(review): the dictionary key is missing; as written this is a set literal.
# Presumably the key is the input stream's name -- TODO confirm.
input_data = { dataset}
with InferVStreams(self.network_group, self.input_vstreams_params, self.output_vstreams_params) as infer_pipeline:
with self.network_group.activate(self.network_group_params):
infer_results = infer_pipeline.infer(input_data)
# NOTE(review): the subscript is missing; presumably results are looked up per output
# stream info `i` -- TODO confirm.
out = [infer_results[] for i in self.output_vstream_infos]
return out
# End-to-end driver: build the helpers for 640x640x3 images, run the dataset on the Hailo
# device, post-process the outputs and visualise the detections.
#
# NOTE(review): this copy has lost its indentation and the end of the function, so it does
# not parse as-is.
def process_yolo5():
imageMeta = ImageMeta(640, 640, 3)
# NOTE(review): classes=3 is stored on the processor, but yolo_postprocess_numpy sets
# num_classes = 4 itself -- confirm which value is intended.
processor = YoloProcessing(imageMeta, classes=3)
data = DataHandler('./minimal_data', imageMeta)
hailo = HailoHandler('hef/yolov5m_22_2.hef')
# NOTE(review): data.dataset is only assigned inside DataHandler.load_data, and no call to
# load_data is visible before this line -- a line may be missing here.
out = hailo.run_hailo(data.dataset)
logits = processor.postprocessing(out)
# Label map is read from 'daria_names.json' next to this module.
labels = data._get_labels("daria_names")
# NOTE(review): the call below is cut off before its arguments; the rest of the function
# is missing from this copy.
image = visualize_boxes_and_labels_on_image_array(
if __name__ == "__main__":