Updated image classifier to use a region of interest parameter

2022-10-10 08:15:40 -07:00 · 2022-10-10 08:15:40 -07:00 · 44e6f8e1a1
commit 44e6f8e1a1
parent cb806071ba
6 changed files with 281 additions and 58 deletions
--- a/mediapipe/tasks/python/components/containers/BUILD
+++ b/mediapipe/tasks/python/components/containers/BUILD
@ -27,6 +27,15 @@ py_library(
    ],
 )

+py_library(
+    name = "rect",
+    srcs = ["rect.py"],  # Rect and NormalizedRect container dataclasses.
+    deps = [
+        "//mediapipe/framework/formats:rect_py_pb2",  # rect_pb2 protos.
+        "//mediapipe/tasks/python/core:optional_dependencies",  # doc_controls.
+    ],
+)
+
 py_library(
    name = "category",
    srcs = ["category.py"],
--- a/mediapipe/tasks/python/components/containers/rect.py
+++ b/mediapipe/tasks/python/components/containers/rect.py
@ -0,0 +1,136 @@
+# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Rect data class."""
+
+import dataclasses
+from typing import Any, Optional
+
+from mediapipe.framework.formats import rect_pb2
+from mediapipe.tasks.python.core.optional_dependencies import doc_controls
+
+_RectProto = rect_pb2.Rect
+_NormalizedRectProto = rect_pb2.NormalizedRect
+
+
+@dataclasses.dataclass
+class Rect:
+  """A rectangle with rotation in image coordinates.
+
+  Attributes:
+    x_center: The X coordinate of the rectangle's center, in pixels.
+    y_center: The Y coordinate of the rectangle's center, in pixels.
+    width: The width of the rectangle, in pixels.
+    height: The height of the rectangle, in pixels.
+    rotation: Rotation angle is clockwise in radians.
+    rect_id: Optional unique id to help associate different rectangles to each
+      other.
+  """
+
+  x_center: int
+  y_center: int
+  width: int
+  height: int
+  rotation: Optional[float] = 0.0
+  rect_id: Optional[int] = None
+
+  @doc_controls.do_not_generate_docs
+  def to_pb2(self) -> _RectProto:
+    """Generates a Rect protobuf object, including rotation and rect_id."""
+    # Protobuf constructors treat a `None` keyword value as "field not set".
+    return _RectProto(
+        x_center=self.x_center, y_center=self.y_center,
+        width=self.width, height=self.height,
+        rotation=self.rotation, rect_id=self.rect_id)
+
+  @classmethod
+  @doc_controls.do_not_generate_docs
+  def create_from_pb2(cls, pb2_obj: _RectProto) -> 'Rect':
+    """Creates a `Rect` object from the given protobuf object."""
+    return cls(
+        x_center=pb2_obj.x_center, y_center=pb2_obj.y_center,
+        width=pb2_obj.width, height=pb2_obj.height,
+        rotation=pb2_obj.rotation,
+        # An unset proto id maps back to `None` instead of the proto default 0.
+        rect_id=pb2_obj.rect_id if pb2_obj.HasField('rect_id') else None)
+
+  def __eq__(self, other: Any) -> bool:
+    """Checks if this object is equal to the given object.
+
+    Args:
+      other: The object to be compared with.
+
+    Returns:
+      True if `other` is a `Rect` with an equal protobuf representation.
+    """
+    if not isinstance(other, Rect):
+      return False
+
+    return self.to_pb2().__eq__(other.to_pb2())
+
+
+@dataclasses.dataclass
+class NormalizedRect:
+  """A rectangle with rotation in normalized coordinates. The values of box
+  center location and size are within [0, 1].
+
+  Attributes:
+    x_center: The normalized X coordinate of the rectangle's center.
+    y_center: The normalized Y coordinate of the rectangle's center.
+    width: The width of the rectangle.
+    height: The height of the rectangle.
+    rotation: Rotation angle is clockwise in radians.
+    rect_id: Optional unique id to help associate different rectangles to each
+      other.
+  """
+
+  x_center: float
+  y_center: float
+  width: float
+  height: float
+  rotation: Optional[float] = 0.0
+  rect_id: Optional[int] = None
+
+  @doc_controls.do_not_generate_docs
+  def to_pb2(self) -> _NormalizedRectProto:
+    """Generates a NormalizedRect protobuf object with rotation and rect_id."""
+    # Protobuf constructors treat a `None` keyword value as "field not set".
+    return _NormalizedRectProto(
+        x_center=self.x_center, y_center=self.y_center,
+        width=self.width, height=self.height,
+        rotation=self.rotation, rect_id=self.rect_id)
+
+  @classmethod
+  @doc_controls.do_not_generate_docs
+  def create_from_pb2(cls, pb2_obj: _NormalizedRectProto) -> 'NormalizedRect':
+    """Creates a `NormalizedRect` object from the given protobuf object."""
+    return cls(
+        x_center=pb2_obj.x_center, y_center=pb2_obj.y_center,
+        width=pb2_obj.width, height=pb2_obj.height,
+        rotation=pb2_obj.rotation,
+        # An unset proto id maps back to `None` instead of the proto default 0.
+        rect_id=pb2_obj.rect_id if pb2_obj.HasField('rect_id') else None)
+
+  def __eq__(self, other: Any) -> bool:
+    """Checks if this object is equal to the given object.
+
+    Args:
+      other: The object to be compared with.
+
+    Returns:
+      True if `other` is a `NormalizedRect` with an equal protobuf form.
+    """
+    if not isinstance(other, NormalizedRect):
+      return False
+
+    return self.to_pb2().__eq__(other.to_pb2())
--- a/mediapipe/tasks/python/test/vision/BUILD
+++ b/mediapipe/tasks/python/test/vision/BUILD
@ -49,6 +49,7 @@ py_test(
        "//mediapipe/tasks/python/components/processors:classifier_options",
        "//mediapipe/tasks/python/components/containers:category",
        "//mediapipe/tasks/python/components/containers:classifications",
+        "//mediapipe/tasks/python/components/containers:rect",
        "//mediapipe/tasks/python/core:base_options",
        "//mediapipe/tasks/python/test:test_utils",
        "//mediapipe/tasks/python/vision:image_classifier",
--- a/mediapipe/tasks/python/test/vision/image_classifier_test.py
+++ b/mediapipe/tasks/python/test/vision/image_classifier_test.py
@ -24,11 +24,13 @@ from mediapipe.python._framework_bindings import image as image_module
 from mediapipe.tasks.python.components.processors import classifier_options
 from mediapipe.tasks.python.components.containers import category as category_module
 from mediapipe.tasks.python.components.containers import classifications as classifications_module
+from mediapipe.tasks.python.components.containers import rect as rect_module
 from mediapipe.tasks.python.core import base_options as base_options_module
 from mediapipe.tasks.python.test import test_utils
 from mediapipe.tasks.python.vision import image_classifier
 from mediapipe.tasks.python.vision.core import vision_task_running_mode as running_mode_module

+_NormalizedRect = rect_module.NormalizedRect
 _BaseOptions = base_options_module.BaseOptions
 _ClassifierOptions = classifier_options.ClassifierOptions
 _Category = category_module.Category
@ -42,40 +44,6 @@ _RUNNING_MODE = running_mode_module.VisionTaskRunningMode

 _MODEL_FILE = 'mobilenet_v2_1.0_224.tflite'
 _IMAGE_FILE = 'burger.jpg'
-_EXPECTED_CATEGORIES = [
-    _Category(
-      index=934,
-      score=0.7939587831497192,
-      display_name='',
-      category_name='cheeseburger'),
-    _Category(
-      index=932,
-      score=0.02739289402961731,
-      display_name='',
-      category_name='bagel'),
-    _Category(
-      index=925,
-      score=0.01934075355529785,
-      display_name='',
-      category_name='guacamole'),
-    _Category(
-      index=963,
-      score=0.006327860057353973,
-      display_name='',
-      category_name='meat loaf')
-]
-_EXPECTED_CLASSIFICATION_RESULT = _ClassificationResult(
-  classifications=[
-    _Classifications(
-      entries=[
-        _ClassificationEntry(
-          categories=_EXPECTED_CATEGORIES,
-          timestamp_ms=0
-        )
-      ],
-      head_index=0,
-      head_name='probability')
-  ])
 _EMPTY_CLASSIFICATION_RESULT = _ClassificationResult(
  classifications=[
    _Classifications(
@ -94,6 +62,60 @@ _SCORE_THRESHOLD = 0.5
 _MAX_RESULTS = 3


+def _generate_burger_results(timestamp_ms: int) -> _ClassificationResult:
+  """Builds the classification result expected for the burger test image.
+
+  Args:
+    timestamp_ms: Timestamp, in milliseconds, stored on the single entry.
+
+  Returns:
+    A result with one 'probability' head (index 0) whose only entry lists the
+    four expected categories.
+  """
+  # (index, score, category_name) of each expected category, in result order.
+  expected = (
+      (934, 0.7939587831497192, 'cheeseburger'),
+      (932, 0.02739289402961731, 'bagel'),
+      (925, 0.01934075355529785, 'guacamole'),
+      (963, 0.006327860057353973, 'meat loaf'),
+  )
+  categories = [
+      _Category(
+          index=index, score=score, display_name='', category_name=name)
+      for index, score, name in expected
+  ]
+
+  entry = _ClassificationEntry(
+      categories=categories,
+      timestamp_ms=timestamp_ms,
+  )
+  head = _Classifications(
+      entries=[entry],
+      head_index=0,
+      head_name='probability',
+  )
+
+  return _ClassificationResult(classifications=[head])
+
+
+def _generate_soccer_ball_results(timestamp_ms: int) -> _ClassificationResult:
+  """Builds the expected 'soccer ball' result for `timestamp_ms`."""
+  soccer_ball = _Category(
+      index=806, score=0.9965274930000305, display_name='',
+      category_name='soccer ball')
+  entry = _ClassificationEntry(
+      categories=[soccer_ball],
+      timestamp_ms=timestamp_ms,
+  )
+  head = _Classifications(
+      entries=[entry],
+      head_index=0,
+      head_name='probability',
+  )
+
+  return _ClassificationResult(classifications=[head])
+
+
 class ModelFileType(enum.Enum):
  FILE_CONTENT = 1
  FILE_NAME = 2
@ -138,8 +160,8 @@ class ImageClassifierTest(parameterized.TestCase):
      self.assertIsInstance(classifier, _ImageClassifier)

  @parameterized.parameters(
-      (ModelFileType.FILE_NAME, 4, _EXPECTED_CLASSIFICATION_RESULT),
-      (ModelFileType.FILE_CONTENT, 4, _EXPECTED_CLASSIFICATION_RESULT))
+      (ModelFileType.FILE_NAME, 4, _generate_burger_results(0)),
+      (ModelFileType.FILE_CONTENT, 4, _generate_burger_results(0)))
  def test_classify(self, model_file_type, max_results,
                    expected_classification_result):
    # Creates classifier.
@ -167,8 +189,8 @@ class ImageClassifierTest(parameterized.TestCase):
    classifier.close()

  @parameterized.parameters(
-    (ModelFileType.FILE_NAME, 4, _EXPECTED_CLASSIFICATION_RESULT),
-    (ModelFileType.FILE_CONTENT, 4, _EXPECTED_CLASSIFICATION_RESULT))
+    (ModelFileType.FILE_NAME, 4, _generate_burger_results(0)),
+    (ModelFileType.FILE_CONTENT, 4, _generate_burger_results(0)))
  def test_classify_in_context(self, model_file_type, max_results,
                               expected_classification_result):
    if model_file_type is ModelFileType.FILE_NAME:
@ -190,6 +212,23 @@ class ImageClassifierTest(parameterized.TestCase):
      # Comparing results.
      self.assertEqual(image_result, expected_classification_result)

+  def test_classify_succeeds_with_region_of_interest(self):
+    # Image mode: classification is restricted to a region of interest.
+    options = _ImageClassifierOptions(
+        base_options=_BaseOptions(model_asset_path=self.model_path),
+        classifier_options=_ClassifierOptions(max_results=1))
+    with _ImageClassifier.create_from_options(options) as classifier:
+      # Load the test image.
+      image_path = test_utils.get_test_data_path('multi_objects.jpg')
+      test_image = _Image.create_from_file(image_path)
+      # NormalizedRect around the soccer ball.
+      soccer_ball_roi = _NormalizedRect(
+          x_center=0.532, y_center=0.521, width=0.164, height=0.427)
+      # Classify only the region of interest.
+      expected_result = _generate_soccer_ball_results(0)
+      actual_result = classifier.classify(test_image, soccer_ball_roi)
+      self.assertEqual(actual_result, expected_result)
+
  def test_score_threshold_option(self):
    classifier_options = _ClassifierOptions(score_threshold=_SCORE_THRESHOLD)
    options = _ImageClassifierOptions(
@ -353,16 +392,27 @@ class ImageClassifierTest(parameterized.TestCase):
      for timestamp in range(0, 300, 30):
        classification_result = classifier.classify_for_video(
            self.test_image, timestamp)
-        expected_classification_result = _ClassificationResult(
-          classifications=[
-            _Classifications(
-              entries=[
-                _ClassificationEntry(
-                  categories=_EXPECTED_CATEGORIES, timestamp_ms=timestamp)
-              ],
-              head_index=0, head_name='probability')
-          ])
-        self.assertEqual(classification_result, expected_classification_result)
+        self.assertEqual(classification_result,
+                         _generate_burger_results(timestamp))
+
+  def test_classify_for_video_succeeds_with_region_of_interest(self):
+    # Video mode: every frame is classified inside the same region of interest.
+    options = _ImageClassifierOptions(
+        base_options=_BaseOptions(model_asset_path=self.model_path),
+        running_mode=_RUNNING_MODE.VIDEO,
+        classifier_options=_ClassifierOptions(max_results=1))
+    with _ImageClassifier.create_from_options(options) as classifier:
+      image_path = test_utils.get_test_data_path('multi_objects.jpg')
+      test_image = _Image.create_from_file(image_path)
+      # NormalizedRect around the soccer ball.
+      soccer_ball_roi = _NormalizedRect(
+          x_center=0.532, y_center=0.521, width=0.164, height=0.427)
+      # Feed the same frame at timestamps 0, 30, ..., 270 ms.
+      for timestamp in range(0, 300, 30):
+        expected_result = _generate_soccer_ball_results(timestamp)
+        actual_result = classifier.classify_for_video(
+            test_image, timestamp, soccer_ball_roi)
+        self.assertEqual(actual_result, expected_result)

  def test_calling_classify_in_live_stream_mode(self):
    options = _ImageClassifierOptions(
--- a/mediapipe/tasks/python/vision/BUILD
+++ b/mediapipe/tasks/python/vision/BUILD
@ -49,6 +49,7 @@ py_library(
        "//mediapipe/tasks/cc/vision/image_classifier/proto:image_classifier_graph_options_py_pb2",
        "//mediapipe/tasks/python/components/processors:classifier_options",
        "//mediapipe/tasks/python/components/containers:classifications",
+        "//mediapipe/tasks/python/components/containers:rect",
        "//mediapipe/tasks/python/core:base_options",
        "//mediapipe/tasks/python/core:optional_dependencies",
        "//mediapipe/tasks/python/core:task_info",
--- a/mediapipe/tasks/python/vision/image_classifier.py
+++ b/mediapipe/tasks/python/vision/image_classifier.py
@ -24,12 +24,14 @@ from mediapipe.python._framework_bindings import task_runner as task_runner_modu
 from mediapipe.tasks.cc.vision.image_classifier.proto import image_classifier_graph_options_pb2
 from mediapipe.tasks.python.components.processors import classifier_options
 from mediapipe.tasks.python.components.containers import classifications as classifications_module
+from mediapipe.tasks.python.components.containers import rect as rect_module
 from mediapipe.tasks.python.core import base_options as base_options_module
 from mediapipe.tasks.python.core import task_info as task_info_module
 from mediapipe.tasks.python.core.optional_dependencies import doc_controls
 from mediapipe.tasks.python.vision.core import base_vision_task_api
 from mediapipe.tasks.python.vision.core import vision_task_running_mode as running_mode_module

+_NormalizedRect = rect_module.NormalizedRect
 _BaseOptions = base_options_module.BaseOptions
 _ImageClassifierGraphOptionsProto = image_classifier_graph_options_pb2.ImageClassifierGraphOptions
 _ClassifierOptions = classifier_options.ClassifierOptions
@ -42,10 +44,17 @@ _CLASSIFICATION_RESULT_TAG = 'CLASSIFICATION_RESULT'
 _IMAGE_IN_STREAM_NAME = 'image_in'
 _IMAGE_OUT_STREAM_NAME = 'image_out'
 _IMAGE_TAG = 'IMAGE'
+_NORM_RECT_NAME = 'norm_rect_in'
+_NORM_RECT_TAG = 'NORM_RECT'
 _TASK_GRAPH_NAME = 'mediapipe.tasks.vision.image_classifier.ImageClassifierGraph'
 _MICRO_SECONDS_PER_MILLISECOND = 1000


+def _build_full_image_norm_rect() -> _NormalizedRect:
+  """Returns the NormalizedRect that covers the entire image."""
+  return _NormalizedRect(width=1, height=1, x_center=0.5, y_center=0.5)
+
+
@dataclasses.dataclass
 class ImageClassifierOptions:
  """Options for the image classifier task.
@ -145,6 +154,7 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
        task_graph=_TASK_GRAPH_NAME,
        input_streams=[
            ':'.join([_IMAGE_TAG, _IMAGE_IN_STREAM_NAME]),
+            ':'.join([_NORM_RECT_TAG, _NORM_RECT_NAME]),
        ],
        output_streams=[
            ':'.join([_CLASSIFICATION_RESULT_TAG,
@ -161,11 +171,13 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
  def classify(
      self,
      image: image_module.Image,
+      roi: Optional[_NormalizedRect] = None
  ) -> classifications_module.ClassificationResult:
    """Performs image classification on the provided MediaPipe Image.

    Args:
      image: MediaPipe Image.
+      roi: The region of interest.

    Returns:
      A classification result object that contains a list of classifications.
@ -174,8 +186,10 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
      ValueError: If any of the input arguments is invalid.
      RuntimeError: If image classification failed to run.
    """
-    output_packets = self._process_image_data(
-        {_IMAGE_IN_STREAM_NAME: packet_creator.create_image(image)})
+    norm_rect = roi if roi is not None else _build_full_image_norm_rect()
+    output_packets = self._process_image_data({
+        _IMAGE_IN_STREAM_NAME: packet_creator.create_image(image),
+        _NORM_RECT_NAME: packet_creator.create_proto(norm_rect.to_pb2())})
    classification_result_proto = packet_getter.get_proto(
        output_packets[_CLASSIFICATION_RESULT_OUT_STREAM_NAME])

@ -186,7 +200,8 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):

  def classify_for_video(
      self, image: image_module.Image,
-      timestamp_ms: int
+      timestamp_ms: int,
+      roi: Optional[_NormalizedRect] = None
  ) -> classifications_module.ClassificationResult:
    """Performs image classification on the provided video frames.

@ -198,6 +213,7 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
    Args:
      image: MediaPipe Image.
      timestamp_ms: The timestamp of the input video frame in milliseconds.
+      roi: The region of interest.

    Returns:
      A classification result object that contains a list of classifications.
@ -206,10 +222,12 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
      ValueError: If any of the input arguments is invalid.
      RuntimeError: If image classification failed to run.
    """
+    norm_rect = roi if roi is not None else _build_full_image_norm_rect()
    output_packets = self._process_video_data({
-        _IMAGE_IN_STREAM_NAME:
-            packet_creator.create_image(image).at(
-                timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND)
+        _IMAGE_IN_STREAM_NAME: packet_creator.create_image(image).at(
+            timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND),
+        _NORM_RECT_NAME: packet_creator.create_proto(norm_rect.to_pb2()).at(
+            timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND)
    })
    classification_result_proto = packet_getter.get_proto(
      output_packets[_CLASSIFICATION_RESULT_OUT_STREAM_NAME])
@ -219,7 +237,12 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
        for classification in classification_result_proto.classifications
    ])

-  def classify_async(self, image: image_module.Image, timestamp_ms: int) -> None:
+  def classify_async(
+      self,
+      image: image_module.Image,
+      timestamp_ms: int,
+      roi: Optional[_NormalizedRect] = None
+  ) -> None:
    """Sends live image data (an Image with a unique timestamp) to perform
    image classification.

@ -241,13 +264,16 @@ class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
    Args:
      image: MediaPipe Image.
      timestamp_ms: The timestamp of the input image in milliseconds.
+      roi: The region of interest.

    Raises:
      ValueError: If the current input timestamp is smaller than what the image
        classifier has already processed.
    """
+    norm_rect = roi if roi is not None else _build_full_image_norm_rect()
    self._send_live_stream_data({
-        _IMAGE_IN_STREAM_NAME:
-            packet_creator.create_image(image).at(
-                timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND)
+        _IMAGE_IN_STREAM_NAME: packet_creator.create_image(image).at(
+            timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND),
+        _NORM_RECT_NAME: packet_creator.create_proto(norm_rect.to_pb2()).at(
+            timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND)
    })