# Copyright 2020-2021 The MediaPipe Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """MediaPipe Objectron.""" import enum from typing import List, Tuple, NamedTuple, Optional import attr import numpy as np from mediapipe.calculators.core import constant_side_packet_calculator_pb2 # pylint: disable=unused-import from mediapipe.calculators.core import gate_calculator_pb2 from mediapipe.calculators.core import split_vector_calculator_pb2 from mediapipe.calculators.tensor import image_to_tensor_calculator_pb2 from mediapipe.calculators.tensor import inference_calculator_pb2 from mediapipe.calculators.tensor import tensors_to_detections_calculator_pb2 from mediapipe.calculators.tensor import tensors_to_floats_calculator_pb2 from mediapipe.calculators.tensor import tensors_to_landmarks_calculator_pb2 from mediapipe.calculators.tflite import ssd_anchors_calculator_pb2 from mediapipe.calculators.util import association_calculator_pb2 from mediapipe.calculators.util import collection_has_min_size_calculator_pb2 from mediapipe.calculators.util import detection_label_id_to_text_calculator_pb2 from mediapipe.calculators.util import detections_to_rects_calculator_pb2 from mediapipe.calculators.util import landmark_projection_calculator_pb2 from mediapipe.calculators.util import local_file_contents_calculator_pb2 from mediapipe.calculators.util import non_max_suppression_calculator_pb2 from mediapipe.calculators.util import rect_transformation_calculator_pb2 from mediapipe.calculators.util import thresholding_calculator_pb2 from mediapipe.framework.formats import landmark_pb2 from mediapipe.modules.objectron.calculators import annotation_data_pb2 from mediapipe.modules.objectron.calculators import frame_annotation_to_rect_calculator_pb2 from mediapipe.modules.objectron.calculators import lift_2d_frame_annotation_to_3d_calculator_pb2 # pylint: enable=unused-import from mediapipe.python.solution_base import SolutionBase from mediapipe.python.solutions import download_utils class BoxLandmark(enum.IntEnum): """The 9 3D box landmarks.""" # # 3 + + + + + + + + 7 # +\ +\ UP # + \ + \ # + \ + \ | # + 4 + + + + + + + + 8 | y # + + + + | # + + + + | # + + (0) + + .------- x # + + + + \ # 1 + + + + + + + + 5 + \ # \ + \ + \ z # \ + \ + \ # \+ \+ # 2 + + + + + + + + 6 CENTER = 0 BACK_BOTTOM_LEFT = 1 FRONT_BOTTOM_LEFT = 2 BACK_TOP_LEFT = 3 FRONT_TOP_LEFT = 4 BACK_BOTTOM_RIGHT = 5 FRONT_BOTTOM_RIGHT = 6 BACK_TOP_RIGHT = 7 FRONT_TOP_RIGHT = 8 BINARYPB_FILE_PATH = 'mediapipe/modules/objectron/objectron_cpu.binarypb' BOX_CONNECTIONS = frozenset([ (BoxLandmark.BACK_BOTTOM_LEFT, BoxLandmark.FRONT_BOTTOM_LEFT), (BoxLandmark.BACK_BOTTOM_LEFT, BoxLandmark.BACK_TOP_LEFT), (BoxLandmark.BACK_BOTTOM_LEFT, BoxLandmark.BACK_BOTTOM_RIGHT), (BoxLandmark.FRONT_BOTTOM_LEFT, BoxLandmark.FRONT_TOP_LEFT), (BoxLandmark.FRONT_BOTTOM_LEFT, BoxLandmark.FRONT_BOTTOM_RIGHT), (BoxLandmark.BACK_TOP_LEFT, BoxLandmark.FRONT_TOP_LEFT), (BoxLandmark.BACK_TOP_LEFT, BoxLandmark.BACK_TOP_RIGHT), (BoxLandmark.FRONT_TOP_LEFT, BoxLandmark.FRONT_TOP_RIGHT), (BoxLandmark.BACK_BOTTOM_RIGHT, BoxLandmark.FRONT_BOTTOM_RIGHT), (BoxLandmark.BACK_BOTTOM_RIGHT, BoxLandmark.BACK_TOP_RIGHT), (BoxLandmark.FRONT_BOTTOM_RIGHT, BoxLandmark.FRONT_TOP_RIGHT), (BoxLandmark.BACK_TOP_RIGHT, BoxLandmark.FRONT_TOP_RIGHT), ]) @attr.s(auto_attribs=True) class ObjectronModel(object): model_path: str label_name: str @attr.s(auto_attribs=True, frozen=True) class ShoeModel(ObjectronModel): model_path: str = ('mediapipe/modules/objectron/' 'object_detection_3d_sneakers.tflite') label_name: str = 'Footwear' @attr.s(auto_attribs=True, frozen=True) class ChairModel(ObjectronModel): model_path: str = ('mediapipe/modules/objectron/' 'object_detection_3d_chair.tflite') label_name: str = 'Chair' @attr.s(auto_attribs=True, frozen=True) class CameraModel(ObjectronModel): model_path: str = ('mediapipe/modules/objectron/' 'object_detection_3d_camera.tflite') label_name: str = 'Camera' @attr.s(auto_attribs=True, frozen=True) class CupModel(ObjectronModel): model_path: str = ('mediapipe/modules/objectron/' 'object_detection_3d_cup.tflite') label_name: str = 'Coffee cup, Mug' _MODEL_DICT = { 'Shoe': ShoeModel(), 'Chair': ChairModel(), 'Cup': CupModel(), 'Camera': CameraModel() } def _download_oss_objectron_models(objectron_model: str): """Downloads the objectron models from the MediaPipe Github repo if they don't exist in the package.""" download_utils.download_oss_model( 'mediapipe/modules/objectron/object_detection_ssd_mobilenetv2_oidv4_fp16.tflite' ) download_utils.download_oss_model(objectron_model) def get_model_by_name(name: str) -> ObjectronModel: if name not in _MODEL_DICT: raise ValueError(f'{name} is not a valid model name for Objectron.') _download_oss_objectron_models(_MODEL_DICT[name].model_path) return _MODEL_DICT[name] @attr.s(auto_attribs=True) class ObjectronOutputs(object): landmarks_2d: landmark_pb2.NormalizedLandmarkList landmarks_3d: landmark_pb2.LandmarkList rotation: np.ndarray translation: np.ndarray scale: np.ndarray class Objectron(SolutionBase): """MediaPipe Objectron. MediaPipe Objectron processes an RGB image and returns the 3D box landmarks and 2D rectangular bounding box of each detected object. """ def __init__(self, static_image_mode: bool = False, max_num_objects: int = 5, min_detection_confidence: float = 0.5, min_tracking_confidence: float = 0.99, model_name: str = 'Shoe', focal_length: Tuple[float, float] = (1.0, 1.0), principal_point: Tuple[float, float] = (0.0, 0.0), image_size: Optional[Tuple[int, int]] = None, ): """Initializes a MediaPipe Objectron class. Args: static_image_mode: Whether to treat the input images as a batch of static and possibly unrelated images, or a video stream. max_num_objects: Maximum number of objects to detect. min_detection_confidence: Minimum confidence value ([0.0, 1.0]) for object detection to be considered successful. min_tracking_confidence: Minimum confidence value ([0.0, 1.0]) for the box landmarks to be considered tracked successfully. model_name: Name of model to use for predicting box landmarks, currently support {'Shoe', 'Chair', 'Cup', 'Camera'}. focal_length: Camera focal length `(fx, fy)`, by default is defined in NDC space. To use focal length (fx_pixel, fy_pixel) in pixel space, users should provide image_size = (image_width, image_height) to enable conversions inside the API. principal_point: Camera principal point (px, py), by default is defined in NDC space. To use principal point (px_pixel, py_pixel) in pixel space, users should provide image_size = (image_width, image_height) to enable conversions inside the API. image_size (Optional): size (image_width, image_height) of the input image , ONLY needed when use focal_length and principal_point in pixel space. Raises: ConnectionError: If the objectron open source model can't be downloaded from the MediaPipe Github repo. """ # Get Camera parameters. fx, fy = focal_length px, py = principal_point if image_size is not None: half_width = image_size[0] / 2.0 half_height = image_size[1] / 2.0 fx = fx / half_width fy = fy / half_height px = - (px - half_width) / half_width py = - (py - half_height) / half_height # Create and init model. model = get_model_by_name(model_name) super().__init__( binary_graph_path=BINARYPB_FILE_PATH, side_inputs={ 'box_landmark_model_path': model.model_path, 'allowed_labels': model.label_name, 'max_num_objects': max_num_objects, }, calculator_params={ 'ConstantSidePacketCalculator.packet': [ constant_side_packet_calculator_pb2 .ConstantSidePacketCalculatorOptions.ConstantSidePacket( bool_value=not static_image_mode) ], ('objectdetectionoidv4subgraph' '__TensorsToDetectionsCalculator.min_score_thresh'): min_detection_confidence, ('boxlandmarksubgraph__ThresholdingCalculator' '.threshold'): min_tracking_confidence, ('Lift2DFrameAnnotationTo3DCalculator' '.normalized_focal_x'): fx, ('Lift2DFrameAnnotationTo3DCalculator' '.normalized_focal_y'): fy, ('Lift2DFrameAnnotationTo3DCalculator' '.normalized_principal_point_x'): px, ('Lift2DFrameAnnotationTo3DCalculator' '.normalized_principal_point_y'): py, }, outputs=['detected_objects']) def process(self, image: np.ndarray) -> NamedTuple: """Processes an RGB image and returns the box landmarks and rectangular bounding box of each detected object. Args: image: An RGB image represented as a numpy ndarray. Raises: RuntimeError: If the underlying graph throws any error. ValueError: If the input image is not three channel RGB. Returns: A NamedTuple object with a "detected_objects" field that contains a list of detected 3D bounding boxes. Each detected box is represented as an "ObjectronOutputs" instance. """ results = super().process(input_data={'image': image}) if results.detected_objects: results.detected_objects = self._convert_format(results.detected_objects) else: results.detected_objects = None return results def _convert_format( self, inputs: annotation_data_pb2.FrameAnnotation) -> List[ObjectronOutputs]: new_outputs = list() for annotation in inputs.annotations: # Get 3d object pose. rotation = np.reshape(np.array(annotation.rotation), (3, 3)) translation = np.array(annotation.translation) scale = np.array(annotation.scale) # Get 2d/3d landmakrs. landmarks_2d = landmark_pb2.NormalizedLandmarkList() landmarks_3d = landmark_pb2.LandmarkList() for keypoint in annotation.keypoints: point_2d = keypoint.point_2d landmarks_2d.landmark.add(x=point_2d.x, y=point_2d.y) point_3d = keypoint.point_3d landmarks_3d.landmark.add(x=point_3d.x, y=point_3d.y, z=point_3d.z) # Add to objectron outputs. new_outputs.append(ObjectronOutputs(landmarks_2d, landmarks_3d, rotation, translation, scale=scale)) return new_outputs