adds inference tutorial setup for custom yolov5
- postprocessing for yolov5 included
- loading hef file of custom yolov5 onto hailo chip
- running inference
- saving image and printing fps for inference and postprocessing
inference.py | 339 insertions(+), new file
@@ -0,0 +1,339 @@
import os
from multiprocessing import Process
import json
import time
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.image import combined_non_max_suppression

from detection_tools.utils.visualization_utils import visualize_boxes_and_labels_on_image_array

from hailo_platform import (HEF, PcieDevice, HailoStreamInterface, InferVStreams, ConfigureParams,
        InputVStreamParams, OutputVStreamParams, InputVStreams, OutputVStreams, FormatType)

# preprocess dataset for yolov5 input size
# yolov5: 640x640
# resnet18: 320x320


def preproc(image, output_height=640, output_width=640, resize_side=640):
    '''
    Aspect-preserving resize so the image height matches resize_side, then a
    central crop to output_width x output_height (imagenet-style preprocessing,
    scaled up to the 640x640 yolov5 input). resize_side must not be smaller
    than the crop size, otherwise PIL pads the out-of-bounds area with black.
    Assumes a landscape input (width >= height).
    '''
    new_width = int(image.width / image.height * resize_side)
    new_height = resize_side
    x, y = (new_width - output_width) / 2, 0

    # Select the area to crop (centered horizontally)
    area = (x, y, x + output_width, y + output_height)

    # Resize, then crop
    cropped_img = image.resize((new_width, new_height)).crop(area)
    return cropped_img
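

# A minimal, hedged sanity check for preproc (not part of the original
# tutorial): the helper name and the 1280x640 frame size are assumptions.
def demo_preproc():
    frame = Image.fromarray(np.zeros((640, 1280, 3), dtype=np.uint8))  # dummy landscape frame
    model_in = preproc(frame)  # resize to 640 high, then 640x640 center crop
    assert np.array(model_in).shape == (640, 640, 3)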


# Collect images from the data files
def dataset_read(hef):

    images_path = './minimal_data'
    names = []
    images_list = [img_name for img_name in os.listdir(images_path) if
            os.path.splitext(os.path.join(images_path, img_name))[1] == '.jpg']

    # Define dataset params
    input_vstream_info = hef.get_input_vstream_infos()[0]
    output_vstream_infos = hef.get_output_vstream_infos()
    image_height, image_width, channels = input_vstream_info.shape

    # dataset = np.zeros((len(images_list), image_height, image_width, channels),
    #        dtype=np.float32)
    # only the first image is used, so a single frame is allocated
    dataset = np.zeros((1, image_height, image_width, channels),
            dtype=np.float32)

    for idx, img_name in enumerate(images_list):
        img = Image.open(os.path.join(images_path, img_name))
        img_preproc = preproc(img)
        dataset[idx, :, :, :] = np.array(img_preproc)
        names.append(img_name)
        break  # single-image dataset; remove to batch all images

    return dataset, names


# Generate a random dataset
def dataset_random(image_height, image_width, channels):
    num_of_images = 10
    low, high = 2, 20
    dataset = np.random.randint(low, high, (num_of_images, image_height,
        image_width, channels)).astype(np.float32)
    return dataset


def init_hailo(model_name='yolov5m'):
    target = PcieDevice()

    hef_path = f'hef/{model_name}.hef'
    hef = HEF(hef_path)

    # Configure the network groups
    configure_params = ConfigureParams.create_from_hef(hef=hef, interface=HailoStreamInterface.PCIe)
    network_groups = target.configure(hef, configure_params)
    network_group = network_groups[0]

    return hef, network_group


'''
The target can be used as a context manager ("with" statement) to ensure it is released on time.
Here it's avoided for the sake of simplicity.
'''
def run_hailo(dataset, names, hef, network_group):
    # Create the input and output virtual stream params.
    # The quantized argument signifies whether or not the incoming data is already quantized;
    # data is quantized by HailoRT if and only if quantized == False.
    input_vstreams_params = InputVStreamParams.make(network_group,
            quantized=False,
            format_type=FormatType.FLOAT32)
    output_vstreams_params = OutputVStreamParams.make(network_group, quantized=False, format_type=FormatType.FLOAT32)
    # output_vstreams_params = OutputVStreamParams.make(network_group,
    #        quantized=True,
    #        format_type=FormatType.INT8)

    input_vstream_info = hef.get_input_vstream_infos()[0]
    output_vstream_infos = hef.get_output_vstream_infos()
    input_data = {input_vstream_info.name: dataset}
    network_group_params = network_group.create_params()

    with InferVStreams(network_group, input_vstreams_params, output_vstreams_params) as infer_pipeline:
        with network_group.activate(network_group_params):
            infer_results = infer_pipeline.infer(input_data)

    out = [infer_results[i.name] for i in output_vstream_infos]
    return out, names, dataset
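

# As the note above says, the device can instead be used as a context manager
# so it is always released. A hedged sketch of that variant, using the same
# calls as init_hailo/run_hailo ('run_hailo_managed' is not part of the
# original tutorial):
def run_hailo_managed(dataset, model_name='yolov5m'):
    with PcieDevice() as target:
        hef = HEF(f'hef/{model_name}.hef')
        configure_params = ConfigureParams.create_from_hef(hef=hef, interface=HailoStreamInterface.PCIe)
        network_group = target.configure(hef, configure_params)[0]
        input_vstreams_params = InputVStreamParams.make(network_group,
                quantized=False, format_type=FormatType.FLOAT32)
        output_vstreams_params = OutputVStreamParams.make(network_group,
                quantized=False, format_type=FormatType.FLOAT32)
        input_data = {hef.get_input_vstream_infos()[0].name: dataset}
        with InferVStreams(network_group, input_vstreams_params, output_vstreams_params) as infer_pipeline:
            with network_group.activate(network_group.create_params()):
                infer_results = infer_pipeline.infer(input_data)
        return [infer_results[i.name] for i in hef.get_output_vstream_infos()]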


# e.g. a 20 x 20 output grid corresponds to stride 32 on the 640 input
def yolo_postprocess_numpy(net_out, anchors_for_stride, stride):
    """
    net_out has shape [N, H/stride, W/stride, num_anchors*num_pred], e.g.
    [N, 20, 20, 27], [N, 40, 40, 27] or [N, 80, 80, 27] for the 640x640 input
    with 4 classes (255 channels in the 80-class COCO case).
    First we reshape it to be as in gluon and then follow gluon's shapes.
    output_ind = 0 for stride 32, 1 for stride 16, 2 for stride 8.
    """

    # net_out = net_out.astype(np.float32) / 256
    num_classes = 4  # must match the number of classes the custom model was trained with
    BS = net_out.shape[0]  # batch size
    H = net_out.shape[1]
    W = net_out.shape[2]

    num_anchors = anchors_for_stride.size // 2  # 2 params for each anchor.
    num_pred = 1 + 4 + num_classes  # 2 box centers, 2 box scales, 1 objness, num_classes class scores
    alloc_size = (128, 128)

    grid_x = np.arange(alloc_size[1])
    grid_y = np.arange(alloc_size[0])
    grid_x, grid_y = np.meshgrid(grid_x, grid_y)  # dims [128,128], [128,128]

    offsets = np.concatenate((grid_x[:, :, np.newaxis], grid_y[:, :, np.newaxis]), axis=-1)  # dim [128,128,2]
    offsets = np.expand_dims(np.expand_dims(offsets, 0), 0)  # dim [1,1,128,128,2]

    pred = net_out.transpose((0, 3, 1, 2))  # now dims are: [N,C,H,W] as in Gluon.
    pred = np.reshape(pred, (BS, num_anchors * num_pred, -1))  # dim [N, num_anchors*num_pred, HxW]
    # transpose so the channels become the last dim and can be reshaped per anchor:
    pred = pred.transpose((0, 2, 1))  # dim [N, HxW, num_anchors*num_pred]
    pred = np.reshape(pred, (BS, -1, num_anchors, num_pred))  # dim [N, HxW, 3, num_pred]

    raw_box_centers = pred[:, :, :, 0:2]  # dim [N, HxW, 3, 2]
    raw_box_scales = pred[:, :, :, 2:4]  # dim [N, HxW, 3, 2]

    objness = pred[:, :, :, 4:5]  # dim [N, HxW, 3, 1]
    class_pred = pred[:, :, :, 5:]  # dim [N, HxW, 3, num_classes]
    offsets = offsets[:, :, :H, :W, :]  # dim [1, 1, H, W, 2]
    offsets = np.reshape(offsets, (1, -1, 1, 2))  # dim [1, HxW, 1, 2]
    box_centers, box_scales, confidence, class_pred = _yolo5_decode(
        raw_box_centers=raw_box_centers,
        raw_box_scales=raw_box_scales,
        objness=objness,
        class_pred=class_pred,
        anchors_for_stride=anchors_for_stride,
        offsets=offsets,
        stride=stride)

    class_score = class_pred * confidence  # dim [N, HxW, 3, num_classes]
    wh = box_scales / 2.0
    # dim [N, HxW, 3, 4]. scheme: xmin, ymin, xmax, ymax
    bbox = np.concatenate((box_centers - wh, box_centers + wh), axis=-1)

    detection_boxes = np.reshape(bbox, (BS, -1, 1, 4))  # dim [N, num_detections, 1, 4]
    detection_scores = np.reshape(class_score, (BS, -1, num_classes))  # dim [N, num_detections, num_classes]

    # switching scheme from xmin, ymin, xmax, ymax to ymin, xmin, ymax, xmax:
    detection_boxes_tmp = np.zeros(detection_boxes.shape)
    detection_boxes_tmp[:, :, :, 0] = detection_boxes[:, :, :, 1]
    detection_boxes_tmp[:, :, :, 1] = detection_boxes[:, :, :, 0]
    detection_boxes_tmp[:, :, :, 2] = detection_boxes[:, :, :, 3]
    detection_boxes_tmp[:, :, :, 3] = detection_boxes[:, :, :, 2]

    detection_boxes = detection_boxes_tmp  # now the scheme is: ymin, xmin, ymax, xmax
    return detection_boxes.astype(np.float32), detection_scores.astype(np.float32)


def _yolo5_decode(raw_box_centers, raw_box_scales, objness, class_pred, anchors_for_stride, offsets, stride):
    # yolov5 decoding: centers are cell-relative offsets, scales are anchor-relative;
    # both are mapped back to input-image pixels here.
    box_centers = (raw_box_centers * 2. - 0.5 + offsets) * stride
    box_scales = (raw_box_scales * 2) ** 2 * anchors_for_stride  # dim [N, HxW, 3, 2]
    return box_centers, box_scales, objness, class_pred
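

# A hedged worked example of the decode math for a single anchor cell
# (illustrative numbers, not from the tutorial): at stride 32, grid cell
# (x=3, y=4), raw center (0.6, 0.55) and raw scale (0.5, 0.7) with anchor
# (116, 90):
#   cx = (0.6 * 2 - 0.5 + 3) * 32 = 118.4 px
#   cy = (0.55 * 2 - 0.5 + 4) * 32 = 147.2 px
#   w = (0.5 * 2) ** 2 * 116 = 116.0 px
#   h = (0.7 * 2) ** 2 * 90 = 176.4 px
def demo_decode_one_cell():
    centers, scales, _, _ = _yolo5_decode(
        raw_box_centers=np.array([[0.6, 0.55]]),
        raw_box_scales=np.array([[0.5, 0.7]]),
        objness=None,
        class_pred=None,
        anchors_for_stride=np.array([[116., 90.]]),
        offsets=np.array([[3., 4.]]),
        stride=32)
    assert np.allclose(centers, [[118.4, 147.2]])
    assert np.allclose(scales, [[116.0, 176.4]])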


def postprocessing(endnodes):
    """
    endnodes is a list of 3 output tensors:
    endnodes[0] - stride 32 of input
    endnodes[1] - stride 16 of input
    endnodes[2] - stride 8 of input
    Returns:
    a tensor with dims [BS, total_num_of_detections_in_image, 6]
    where:
        total_num_of_detections_in_image = H*W*((1/32^2) + (1/16^2) + (1/8^2))*num_anchors*num_classes,
        with H, W as input dims.
        If H=W=608, num_anchors=3, num_classes=80 (coco 2017), we get:
        total_num_of_detections = 1819440 ~ 1.8M detections per image for the NMS
    """
    H_input = 640
    W_input = 640
    anchors_list = [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]]
    # TODO make prettier
    strides = [8, 16, 32]
    num_classes = 80  # COCO default, only used by the commented-out set_shape calls below;
                      # the custom model in this tutorial has 4 classes

    for output_ind, output_branch in enumerate(endnodes):  # iterating over the output layers:
        stride = strides[::-1][output_ind]
        anchors_for_stride = np.array(anchors_list[::-1][output_ind])
        anchors_for_stride = np.reshape(anchors_for_stride, (1, 1, -1, 2))  # dim [1, 1, 3, 2]

        detection_boxes, detection_scores = yolo_postprocess_numpy(output_branch,
                                                                   anchors_for_stride,
                                                                   stride)

        # detection_boxes is a [BS, num_detections, 1, 4] tensor, detection_scores is a
        # [BS, num_detections, num_classes] tensor
        detection_boxes = detection_boxes / H_input  # normalize the box coordinates to [0, 1]
        BS = endnodes[0].shape[0]
        H = H_input // stride
        W = W_input // stride
        num_anchors = anchors_for_stride.size // 2
        num_detections = H * W * num_anchors
        # detection_boxes.set_shape((BS, num_detections, 1, 4))
        # detection_scores.set_shape((BS, num_detections, num_classes))
        # concatenating the detections from the different output layers:
        if output_ind == 0:
            detection_boxes_full = detection_boxes
            detection_scores_full = detection_scores
        else:
            detection_boxes_full = tf.concat([detection_boxes_full, detection_boxes], axis=1)
            detection_scores_full = tf.concat([detection_scores_full, detection_scores], axis=1)

    score_threshold = 0.5
    nms_iou_threshold = 0.5
    labels_offset = 1

    (nmsed_boxes, nmsed_scores, nmsed_classes, num_detections) = \
        combined_non_max_suppression(boxes=detection_boxes_full,
                                     scores=detection_scores_full,
                                     score_threshold=score_threshold,
                                     iou_threshold=nms_iou_threshold,
                                     max_output_size_per_class=100,
                                     max_total_size=100)

    # add the labels offset to the class predictions and cast to integer
    def translate_coco_2017_to_2014(nmsed_classes):
        return np.vectorize(COCO_2017_TO_2014_TRANSLATION.get)(nmsed_classes).astype(np.int32)

    nmsed_classes = tf.cast(tf.add(nmsed_classes, labels_offset), tf.int16)
    nmsed_classes = translate_coco_2017_to_2014(nmsed_classes)

    return {'detection_boxes': nmsed_boxes,
            'detection_scores': nmsed_scores,
            'detection_classes': nmsed_classes,
            'num_detections': num_detections}
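

# A hedged sketch of consuming the returned dict (illustrative, not part of the
# original tutorial): keep only the valid detections for the first image.
def demo_read_logits(logits):
    n = int(logits['num_detections'][0])          # number of valid detections for image 0
    boxes = logits['detection_boxes'][0][:n]      # [n, 4], normalized ymin, xmin, ymax, xmax
    scores = logits['detection_scores'][0][:n]    # [n]
    classes = logits['detection_classes'][0][:n]  # [n], translated label ids
    return boxes, scores, classes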


def _get_face_detection_visualization_data(logits):
    boxes = logits['detection_boxes'][0]

    face_landmarks = logits.get('face_landmarks')
    if face_landmarks is not None:
        face_landmarks = face_landmarks[0].reshape((-1, 5, 2))[:, :, (1, 0)]
    boxes = boxes[:, (1, 0, 3, 2)]
    # No name, to prevent clobbering the visualization
    labels = {1: {'id': 1, 'name': ''}}
    return boxes, labels, face_landmarks


def _get_coco_labels():
    with open(os.path.join(os.path.dirname(__file__), 'coco_names.json')) as f:
        coco_names = json.load(f)
    coco_names = {int(k): {'id': int(k), 'name': str(v)} for (k, v) in coco_names.items()}
    return coco_names


def _get_labels(label_name):
    filename = os.path.join(os.path.dirname(__file__), label_name + '.json')
    with open(filename) as f:
        names = json.load(f)
    names = {int(k): {'id': int(k), 'name': str(v)} for (k, v) in names.items()}
    return names
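
# Hedged example of the label-file layout these loaders expect: a flat JSON map
# from class id to name. The real daria_names.json is not in this commit; the
# four names below are made-up placeholders.
#   {"1": "person", "2": "helmet", "3": "vest", "4": "boots"}
# _get_labels turns this into {1: {'id': 1, 'name': 'person'}, ...}.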


def process_yolo5():

    hef, network_group = init_hailo("yolov5m_22_2")

    dataset, names = dataset_read(hef)

    samples = 1000
    start_time = time.time()
    fps = 0
    while samples > 0:
        # print how many inference + postprocessing passes completed in the last second
        if start_time + 1 < time.time():
            print("fps: " + str(fps))
            start_time = time.time()
            fps = 0

        out, names, dataset = run_hailo(dataset, names, hef, network_group)

        logits = postprocessing(out)

        fps += 1
        samples -= 1

    labels = _get_labels("daria_names")
    image = visualize_boxes_and_labels_on_image_array(
        dataset[0],
        logits['detection_boxes'].numpy()[0],
        logits['detection_classes'][0],
        logits['detection_scores'].numpy()[0],
        labels,
        use_normalized_coordinates=True,
        max_boxes_to_draw=100,
        min_score_thresh=.5,
        agnostic_mode=False,
        line_thickness=4)

    Image.fromarray(np.uint8(image)).save('/home/maintenance/test.png')


COCO_2017_TO_2014_TRANSLATION = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10,
                                 11: 11, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19,
                                 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 27, 26: 28,
                                 27: 31, 28: 32, 29: 33, 30: 34, 31: 35, 32: 36, 33: 37, 34: 38,
                                 35: 39, 36: 40, 37: 41, 38: 42, 39: 43, 40: 44, 41: 46, 42: 47,
                                 43: 48, 44: 49, 45: 50, 46: 51, 47: 52, 48: 53, 49: 54, 50: 55,
                                 51: 56, 52: 57, 53: 58, 54: 59, 55: 60, 56: 61, 57: 62, 58: 63,
                                 59: 64, 60: 65, 61: 67, 62: 70, 63: 72, 64: 73, 65: 74, 66: 75,
                                 67: 76, 68: 77, 69: 78, 70: 79, 71: 80, 72: 81, 73: 82, 74: 84,
                                 75: 85, 76: 86, 77: 87, 78: 88, 79: 89, 80: 90}

if __name__ == "__main__":
    process_yolo5()