Open Source Object Detector

PiperOrigin-RevId: 519201221

@@ -17,6 +17,7 @@ from mediapipe.model_maker.python.core.utils import quantization
 from mediapipe.model_maker.python.vision import image_classifier
 from mediapipe.model_maker.python.vision import gesture_recognizer
 from mediapipe.model_maker.python.text import text_classifier
+from mediapipe.model_maker.python.vision import object_detector

 # Remove duplicated and non-public API
 del python
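A minimal usage sketch of the newly exported module (hedged: the paths and the
cache directory are illustrative, and the ObjectDetector training API itself
lives in object_detector.py rather than in the excerpts below):

    from mediapipe.model_maker.python.vision import object_detector

    train_data = object_detector.Dataset.from_coco_folder(
        '/tmp/coco_train', cache_dir='/tmp/od_cache/train')
    print(train_data.num_classes, train_data.label_names)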

mediapipe/model_maker/python/vision/object_detector/BUILD (new file, 195 lines)
@@ -0,0 +1,195 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Placeholder for internal Python strict library and test compatibility macro.
|
||||
# Placeholder for internal Python strict test compatibility macro.
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
package(
|
||||
default_visibility = ["//mediapipe:__subpackages__"],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "object_detector_import",
|
||||
srcs = ["__init__.py"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":dataset",
|
||||
":hyperparameters",
|
||||
":model_options",
|
||||
":model_spec",
|
||||
":object_detector",
|
||||
":object_detector_options",
|
||||
],
|
||||
)
|
||||
|
||||
py_binary(
|
||||
name = "object_detector_demo",
|
||||
srcs = ["object_detector_demo.py"],
|
||||
data = [":testdata"],
|
||||
python_version = "PY3",
|
||||
tags = ["requires-net:external"],
|
||||
deps = [":object_detector_import"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "testdata",
|
||||
srcs = glob([
|
||||
"testdata/**",
|
||||
]),
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "dataset",
|
||||
srcs = ["dataset.py"],
|
||||
deps = [
|
||||
":dataset_util",
|
||||
"//mediapipe/model_maker/python/core/data:classification_dataset",
|
||||
],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "dataset_test",
|
||||
srcs = ["dataset_test.py"],
|
||||
data = [":testdata"],
|
||||
deps = [
|
||||
":dataset",
|
||||
"//mediapipe/model_maker/python/vision/core:image_utils",
|
||||
"//mediapipe/model_maker/python/vision/core:test_utils",
|
||||
"//mediapipe/tasks/python/test:test_utils",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "dataset_util",
|
||||
srcs = ["dataset_util.py"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "dataset_util_test",
|
||||
srcs = ["dataset_util_test.py"],
|
||||
data = [":testdata"],
|
||||
deps = [
|
||||
":dataset_util",
|
||||
"//mediapipe/model_maker/python/vision/core:test_utils",
|
||||
"//mediapipe/tasks/python/test:test_utils",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "hyperparameters",
|
||||
srcs = ["hyperparameters.py"],
|
||||
deps = [
|
||||
"//mediapipe/model_maker/python/core:hyperparameters",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "preprocessor",
|
||||
srcs = ["preprocessor.py"],
|
||||
deps = [":model_spec"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "preprocessor_test",
|
||||
srcs = ["preprocessor_test.py"],
|
||||
deps = [
|
||||
":model_spec",
|
||||
":preprocessor",
|
||||
"//mediapipe/model_maker/python/vision/core:test_utils",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "model",
|
||||
srcs = ["model.py"],
|
||||
deps = [
|
||||
":model_options",
|
||||
":model_spec",
|
||||
],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "model_test",
|
||||
size = "large",
|
||||
srcs = ["model_test.py"],
|
||||
data = [":testdata"],
|
||||
shard_count = 4,
|
||||
tags = ["requires-net:external"],
|
||||
deps = [
|
||||
":dataset",
|
||||
":model",
|
||||
":model_options",
|
||||
":model_spec",
|
||||
":preprocessor",
|
||||
"//mediapipe/tasks/python/test:test_utils",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "model_options",
|
||||
srcs = ["model_options.py"],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "model_spec",
|
||||
srcs = ["model_spec.py"],
|
||||
deps = ["//mediapipe/model_maker/python/core/utils:file_util"],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "object_detector",
|
||||
srcs = ["object_detector.py"],
|
||||
deps = [
|
||||
":dataset",
|
||||
":hyperparameters",
|
||||
":model",
|
||||
":model_options",
|
||||
":model_spec",
|
||||
":object_detector_options",
|
||||
":preprocessor",
|
||||
"//mediapipe/model_maker/python/core/tasks:classifier",
|
||||
"//mediapipe/model_maker/python/core/utils:model_util",
|
||||
"//mediapipe/model_maker/python/core/utils:quantization",
|
||||
"//mediapipe/tasks/python/metadata/metadata_writers:metadata_writer",
|
||||
"//mediapipe/tasks/python/metadata/metadata_writers:object_detector",
|
||||
],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "object_detector_test",
|
||||
size = "enormous",
|
||||
srcs = ["object_detector_test.py"],
|
||||
data = [":testdata"],
|
||||
tags = ["requires-net:external"],
|
||||
deps = [
|
||||
":dataset",
|
||||
":hyperparameters",
|
||||
":model_spec",
|
||||
":object_detector",
|
||||
":object_detector_options",
|
||||
"//mediapipe/tasks/python/test:test_utils",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "object_detector_options",
|
||||
srcs = ["object_detector_options.py"],
|
||||
deps = [
|
||||
":hyperparameters",
|
||||
":model_options",
|
||||
":model_spec",
|
||||
],
|
||||
)
|
|
mediapipe/model_maker/python/vision/object_detector/__init__.py (new file, 30 lines)
@@ -0,0 +1,30 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""MediaPipe Model Maker Python Public API For Object Detector."""
|
||||
|
||||
from mediapipe.model_maker.python.vision.object_detector import dataset
|
||||
from mediapipe.model_maker.python.vision.object_detector import hyperparameters
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_options
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec
|
||||
from mediapipe.model_maker.python.vision.object_detector import object_detector
|
||||
from mediapipe.model_maker.python.vision.object_detector import object_detector_options
|
||||
|
||||
ObjectDetector = object_detector.ObjectDetector
|
||||
ModelOptions = model_options.ObjectDetectorModelOptions
|
||||
ModelSpec = model_spec.ModelSpec
|
||||
SupportedModels = model_spec.SupportedModels
|
||||
HParams = hyperparameters.HParams
|
||||
QATHParams = hyperparameters.QATHParams
|
||||
Dataset = dataset.Dataset
|
||||
ObjectDetectorOptions = object_detector_options.ObjectDetectorOptions
|
mediapipe/model_maker/python/vision/object_detector/dataset.py (new file, 179 lines)
@@ -0,0 +1,179 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Object detector dataset library."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import tensorflow as tf
|
||||
import yaml
|
||||
|
||||
from mediapipe.model_maker.python.core.data import classification_dataset
|
||||
from mediapipe.model_maker.python.vision.object_detector import dataset_util
|
||||
from official.vision.dataloaders import tf_example_decoder
|
||||
|
||||
|
||||
class Dataset(classification_dataset.ClassificationDataset):
|
||||
"""Dataset library for object detector."""
|
||||
|
||||
@classmethod
|
||||
def from_coco_folder(
|
||||
cls,
|
||||
data_dir: str,
|
||||
max_num_images: Optional[int] = None,
|
||||
cache_dir: Optional[str] = None,
|
||||
) -> 'Dataset':
|
||||
"""Loads images and labels from the given directory in COCO format.
|
||||
|
||||
- https://cocodataset.org/#home
|
||||
|
||||
Folder structure should be:
|
||||
<data_dir>/
|
||||
images/
|
||||
<file0>.jpg
|
||||
...
|
||||
labels.json
|
||||
|
||||
The `labels.json` annotations file should have the following format:
|
||||
{
|
||||
"categories": [{"id": 0, "name": "background"}, ...],
|
||||
"images": [{"id": 0, "file_name": "<file0>.jpg"}, ...],
|
||||
"annotations": [{
|
||||
"id": 0,
|
||||
"image_id": 0,
|
||||
"category_id": 2,
|
||||
"bbox": [x-top left, y-top left, width, height],
|
||||
}, ...]
|
||||
}
|
||||
Note that category id 0 is reserved for the "background" class. It is
|
||||
optional to include, but if included it must be set to "background".
|
||||
|
||||
|
||||
Args:
|
||||
data_dir: Name of the directory containing the data files.
|
||||
max_num_images: Max number of images to process.
|
||||
cache_dir: The cache directory to save TFRecord and metadata files. The
|
||||
TFRecord files are a standardized format for training object detection
|
||||
while the metadata file is used to store information like dataset size
|
||||
and label mapping of id to label name. If the cache_dir is not set, a
|
||||
temporary folder will be created and will not be removed automatically
|
||||
after training, which means it can be reused later.
|
||||
|
||||
Returns:
|
||||
Dataset containing images and labels and other related info.
|
||||
Raises:
|
||||
ValueError: If the input data directory is empty.
|
||||
ValueError: If the label_name for id 0 is set to something other than
|
||||
the 'background' class.
|
||||
"""
|
||||
cache_files = dataset_util.get_cache_files_coco(data_dir, cache_dir)
|
||||
if not dataset_util.is_cached(cache_files):
|
||||
label_map = dataset_util.get_label_map_coco(data_dir)
|
||||
cache_writer = dataset_util.COCOCacheFilesWriter(
|
||||
label_map=label_map, max_num_images=max_num_images
|
||||
)
|
||||
cache_writer.write_files(cache_files, data_dir)
|
||||
return cls.from_cache(cache_files.cache_prefix)
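# Example usage (sketch, illustrative paths): load and cache a COCO-format
# folder, then inspect the parsed labels.
#
#   data = Dataset.from_coco_folder(
#       '/tmp/coco_train', max_num_images=1000, cache_dir='/tmp/cache/train')
#   print(len(data), data.num_classes, data.label_names)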
|
||||
|
||||
@classmethod
|
||||
def from_pascal_voc_folder(
|
||||
cls,
|
||||
data_dir: str,
|
||||
max_num_images: Optional[int] = None,
|
||||
cache_dir: Optional[str] = None,
|
||||
) -> 'Dataset':
|
||||
"""Loads images and labels from the given directory in PASCAL VOC format.
|
||||
|
||||
- http://host.robots.ox.ac.uk/pascal/VOC.
|
||||
|
||||
Folder structure should be:
|
||||
<data_dir>/
|
||||
images/
|
||||
<file0>.jpg
|
||||
...
|
||||
Annotations/
|
||||
<file0>.xml
|
||||
...
|
||||
Each <file0>.xml annotation file should have the following format:
|
||||
<annotation>
|
||||
<filename>file0.jpg<filename>
|
||||
<object>
|
||||
<name>kangaroo</name>
|
||||
<bndbox>
|
||||
<xmin>233</xmin>
|
||||
<ymin>89</ymin>
|
||||
<xmax>386</xmax>
|
||||
<ymax>262</ymax>
|
||||
</object>
|
||||
<object>...</object>
|
||||
</annotation>
|
||||
|
||||
Args:
|
||||
data_dir: Name of the directory containing the data files.
|
||||
max_num_images: Max number of images to process.
|
||||
cache_dir: The cache directory to save TFRecord and metadata files. The
|
||||
TFRecord files are a standardized format for training object detection
|
||||
while the metadata file is used to store information like dataset size
|
||||
and label mapping of id to label name. If the cache_dir is not set, a
|
||||
temporary folder will be created and will not be removed automatically
|
||||
after training, which means it can be reused later.
|
||||
|
||||
Returns:
|
||||
Dataset containing images and labels and other related info.
|
||||
Raises:
|
||||
ValueError: if the input data directory is empty.
|
||||
"""
|
||||
cache_files = dataset_util.get_cache_files_pascal_voc(data_dir, cache_dir)
|
||||
if not dataset_util.is_cached(cache_files):
|
||||
label_map = dataset_util.get_label_map_pascal_voc(data_dir)
|
||||
cache_writer = dataset_util.PascalVocCacheFilesWriter(
|
||||
label_map=label_map, max_num_images=max_num_images
|
||||
)
|
||||
cache_writer.write_files(cache_files, data_dir)
|
||||
|
||||
return cls.from_cache(cache_files.cache_prefix)
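# Example usage (sketch, illustrative paths): PASCAL VOC folders load the same
# way; label ids come from the sorted <name> tags, with id 0 kept for
# 'background'.
#
#   data = Dataset.from_pascal_voc_folder(
#       '/tmp/pascal_voc_data', cache_dir='/tmp/cache/train')
#   print(data.label_names)  # e.g. ['background', 'android', 'pig_android']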
|
||||
|
||||
@classmethod
|
||||
def from_cache(cls, cache_prefix: str) -> 'Dataset':
|
||||
"""Loads the TFRecord data from cache.
|
||||
|
||||
Args:
|
||||
cache_prefix: The cache prefix including the cache directory and the cache
|
||||
prefix filename, e.g. '/tmp/cache/train'.
|
||||
|
||||
Returns:
|
||||
ObjectDetectorDataset object.
|
||||
"""
|
||||
# Get TFRecord Files
|
||||
tfrecord_file_pattern = cache_prefix + '*.tfrecord'
|
||||
matched_files = tf.io.gfile.glob(tfrecord_file_pattern)
|
||||
if not matched_files:
|
||||
raise ValueError('TFRecord files are empty.')
|
||||
|
||||
# Load meta_data.
|
||||
meta_data_file = cache_prefix + dataset_util.META_DATA_FILE_SUFFIX
|
||||
if not tf.io.gfile.exists(meta_data_file):
|
||||
raise ValueError("Metadata file %s doesn't exist." % meta_data_file)
|
||||
with tf.io.gfile.GFile(meta_data_file, 'r') as f:
|
||||
meta_data = yaml.load(f, Loader=yaml.FullLoader)
|
||||
|
||||
dataset = tf.data.TFRecordDataset(matched_files)
|
||||
decoder = tf_example_decoder.TfExampleDecoder(regenerate_source_id=False)
|
||||
dataset = dataset.map(decoder.decode, num_parallel_calls=tf.data.AUTOTUNE)
|
||||
|
||||
label_map = meta_data['label_map']
|
||||
label_names = [label_map[k] for k in sorted(label_map.keys())]
|
||||
|
||||
return Dataset(
|
||||
dataset=dataset, size=meta_data['size'], label_names=label_names
|
||||
)
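# Note (sketch): `from_cache` expects the files written by dataset_util, e.g.
# for cache_prefix '/tmp/cache/train' with a single shard:
#   /tmp/cache/train-00000-of-00001.tfrecord
#   /tmp/cache/train_meta_data.yaml
# where the YAML file stores the dataset 'size' and the id-to-name 'label_map'.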
|
|
mediapipe/model_maker/python/vision/object_detector/dataset_test.py (new file, 119 lines)
@@ -0,0 +1,119 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import tensorflow as tf
|
||||
|
||||
from mediapipe.model_maker.python.vision.core import image_utils
|
||||
from mediapipe.model_maker.python.vision.core import test_utils
|
||||
from mediapipe.model_maker.python.vision.object_detector import dataset
|
||||
from mediapipe.tasks.python.test import test_utils as tasks_test_utils
|
||||
|
||||
IMAGE_SIZE = 224
|
||||
|
||||
|
||||
class DatasetTest(tf.test.TestCase):
|
||||
|
||||
def _get_rand_bbox(self):
|
||||
x1, x2 = random.uniform(0, IMAGE_SIZE), random.uniform(0, IMAGE_SIZE)
|
||||
y1, y2 = random.uniform(0, IMAGE_SIZE), random.uniform(0, IMAGE_SIZE)
|
||||
return [min(x1, x2), min(y1, y2), abs(x1 - x2), abs(y1 - y2)]
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.coco_dataset_path = os.path.join(self.get_temp_dir(), 'coco_dataset')
|
||||
if os.path.exists(self.coco_dataset_path):
|
||||
return
|
||||
os.mkdir(self.coco_dataset_path)
|
||||
categories = [{'id': 1, 'name': 'daisy'}, {'id': 2, 'name': 'tulips'}]
|
||||
images = [
|
||||
{'id': 1, 'file_name': 'img1.jpeg'},
|
||||
{'id': 2, 'file_name': 'img2.jpeg'},
|
||||
]
|
||||
annotations = [
|
||||
{'image_id': 1, 'category_id': 1, 'bbox': self._get_rand_bbox()},
|
||||
{'image_id': 2, 'category_id': 1, 'bbox': self._get_rand_bbox()},
|
||||
{'image_id': 2, 'category_id': 2, 'bbox': self._get_rand_bbox()},
|
||||
]
|
||||
labels_dict = {
|
||||
'categories': categories,
|
||||
'images': images,
|
||||
'annotations': annotations,
|
||||
}
|
||||
labels_json = json.dumps(labels_dict)
|
||||
with open(os.path.join(self.coco_dataset_path, 'labels.json'), 'w') as f:
|
||||
f.write(labels_json)
|
||||
images_dir = os.path.join(self.coco_dataset_path, 'images')
|
||||
os.mkdir(images_dir)
|
||||
for item in images:
|
||||
test_utils.write_filled_jpeg_file(
|
||||
os.path.join(images_dir, item['file_name']),
|
||||
[random.uniform(0, 255) for _ in range(3)],
|
||||
IMAGE_SIZE,
|
||||
)
|
||||
|
||||
def test_from_coco_folder(self):
|
||||
data = dataset.Dataset.from_coco_folder(
|
||||
self.coco_dataset_path, cache_dir=self.get_temp_dir()
|
||||
)
|
||||
self.assertLen(data, 2)
|
||||
self.assertEqual(data.num_classes, 3)
|
||||
self.assertEqual(data.label_names, ['background', 'daisy', 'tulips'])
|
||||
for example in data.gen_tf_dataset():
|
||||
boxes = example['groundtruth_boxes']
|
||||
classes = example['groundtruth_classes']
|
||||
self.assertNotEmpty(boxes)
|
||||
self.assertAllLessEqual(boxes, 1)
|
||||
self.assertAllGreaterEqual(boxes, 0)
|
||||
self.assertNotEmpty(classes)
|
||||
self.assertTrue(
|
||||
(classes.numpy() == [1]).all() or (classes.numpy() == [1, 2]).all()
|
||||
)
|
||||
if (classes.numpy() == [1, 1]).all():
|
||||
raw_image_tensor = image_utils.load_image(
|
||||
os.path.join(self.coco_dataset_path, 'images', 'img1.jpeg')
|
||||
)
|
||||
else:
|
||||
raw_image_tensor = image_utils.load_image(
|
||||
os.path.join(self.coco_dataset_path, 'images', 'img2.jpeg')
|
||||
)
|
||||
self.assertTrue(
|
||||
(example['image'].numpy() == raw_image_tensor.numpy()).all()
|
||||
)
|
||||
|
||||
def test_from_pascal_voc_folder(self):
|
||||
pascal_voc_folder = tasks_test_utils.get_test_data_path('pascal_voc_data')
|
||||
data = dataset.Dataset.from_pascal_voc_folder(
|
||||
pascal_voc_folder, cache_dir=self.get_temp_dir()
|
||||
)
|
||||
self.assertLen(data, 4)
|
||||
self.assertEqual(data.num_classes, 3)
|
||||
self.assertEqual(data.label_names, ['background', 'android', 'pig_android'])
|
||||
for example in data.gen_tf_dataset():
|
||||
boxes = example['groundtruth_boxes']
|
||||
classes = example['groundtruth_classes']
|
||||
self.assertNotEmpty(boxes)
|
||||
self.assertAllLessEqual(boxes, 1)
|
||||
self.assertAllGreaterEqual(boxes, 0)
|
||||
self.assertNotEmpty(classes)
|
||||
image = example['image']
|
||||
self.assertNotEmpty(image)
|
||||
self.assertAllGreaterEqual(image, 0)
|
||||
self.assertAllLessEqual(image, 255)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
|
mediapipe/model_maker/python/vision/object_detector/dataset_util.py (new file, 484 lines)
@@ -0,0 +1,484 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Utilities for Object Detector Dataset Library."""
|
||||
|
||||
import abc
|
||||
import collections
|
||||
import dataclasses
|
||||
import hashlib
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Any, Dict, List, Mapping, Optional, Sequence
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
import tensorflow as tf
|
||||
import yaml
|
||||
|
||||
from official.vision.data import tfrecord_lib
|
||||
|
||||
|
||||
# Suffix of the meta data file name.
|
||||
META_DATA_FILE_SUFFIX = '_meta_data.yaml'
|
||||
|
||||
|
||||
def _xml_get(node: ET.Element, name: str) -> ET.Element:
|
||||
"""Gets a named child from an XML Element node.
|
||||
|
||||
This method is used to retrieve an XML element that is expected to exist as a
|
||||
subelement of the `node` passed into this argument. If the subelement is not
|
||||
found, then an error is thrown.
|
||||
|
||||
Raises:
|
||||
ValueError: If the subelement is not found.
|
||||
|
||||
Args:
|
||||
node: XML Element Tree node.
|
||||
name: Name of the child node to get
|
||||
|
||||
Returns:
|
||||
A child node of the parameter node with the matching name.
|
||||
"""
|
||||
result = node.find(name)
|
||||
if result is None:
|
||||
raise ValueError(f'Unexpected xml format: {name} not found in {node}')
|
||||
return result
|
||||
|
||||
|
||||
def _get_cache_dir_or_create(cache_dir: Optional[str]) -> str:
|
||||
"""Gets the cache directory or creates it if not exists."""
|
||||
if cache_dir is None:
|
||||
cache_dir = tempfile.mkdtemp()
|
||||
if not tf.io.gfile.exists(cache_dir):
|
||||
tf.io.gfile.makedirs(cache_dir)
|
||||
return cache_dir
|
||||
|
||||
|
||||
def _get_dir_basename(data_dir: str) -> str:
|
||||
"""Gets the base name of the directory."""
|
||||
return os.path.basename(os.path.abspath(data_dir))
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class CacheFiles:
|
||||
"""Cache files for object detection."""
|
||||
|
||||
cache_prefix: str
|
||||
tfrecord_files: Sequence[str]
|
||||
meta_data_file: str
|
||||
|
||||
|
||||
def _get_cache_files(
|
||||
cache_dir: Optional[str], cache_prefix_filename: str, num_shards: int = 10
|
||||
) -> CacheFiles:
|
||||
"""Creates an object of CacheFiles class.
|
||||
|
||||
Args:
|
||||
cache_dir: The cache directory to save TFRecord and metadata file. When
|
||||
cache_dir is None, a temporary folder will be created and will not be
|
||||
removed automatically after training, so it can be reused later.
|
||||
cache_prefix_filename: The cache prefix filename.
|
||||
num_shards: Number of shards for output file.
|
||||
|
||||
Returns:
|
||||
An object of CacheFiles class.
|
||||
"""
|
||||
cache_dir = _get_cache_dir_or_create(cache_dir)
|
||||
# The cache prefix including the cache directory and the cache prefix
|
||||
# filename, e.g. '/tmp/cache/train'.
|
||||
cache_prefix = os.path.join(cache_dir, cache_prefix_filename)
|
||||
tf.compat.v1.logging.info(
|
||||
'Cache will be stored in %s with prefix filename %s. Cache_prefix is %s'
|
||||
% (cache_dir, cache_prefix_filename, cache_prefix)
|
||||
)
|
||||
|
||||
# Cached files including the TFRecord files and the meta data file.
|
||||
tfrecord_files = [
|
||||
cache_prefix + '-%05d-of-%05d.tfrecord' % (i, num_shards)
|
||||
for i in range(num_shards)
|
||||
]
|
||||
meta_data_file = cache_prefix + META_DATA_FILE_SUFFIX
|
||||
return CacheFiles(
|
||||
cache_prefix=cache_prefix,
|
||||
tfrecord_files=tuple(tfrecord_files),
|
||||
meta_data_file=meta_data_file,
|
||||
)
|
||||
|
||||
|
||||
def get_cache_files_coco(data_dir: str, cache_dir: str) -> CacheFiles:
|
||||
"""Creates an object of CacheFiles class using a COCO formatted dataset.
|
||||
|
||||
Args:
|
||||
data_dir: Folder path of the coco dataset
|
||||
cache_dir: Folder path of the cache location. When cache_dir is None, a
|
||||
temporary folder will be created and will not be removed automatically
|
||||
after training, so it can be reused later.
|
||||
|
||||
Returns:
|
||||
An object of CacheFiles class.
|
||||
"""
|
||||
hasher = hashlib.md5()
|
||||
# Update with dataset folder name
|
||||
hasher.update(_get_dir_basename(data_dir).encode('utf-8'))
|
||||
# Update with image filenames
|
||||
for image_file in sorted(os.listdir(os.path.join(data_dir, 'images'))):
|
||||
hasher.update(os.path.basename(image_file).encode('utf-8'))
|
||||
# Update with labels.json file content
|
||||
label_file = os.path.join(data_dir, 'labels.json')
|
||||
with open(label_file, 'r') as f:
|
||||
label_json = json.load(f)
|
||||
hasher.update(str(label_json).encode('utf-8'))
|
||||
num_examples = len(label_json['images'])
|
||||
# num_shards is chosen automatically: roughly 100 images per shard, capped at 10 shards total.
|
||||
# See https://www.tensorflow.org/tutorials/load_data/tfrecord for more info
|
||||
# on sharding.
|
||||
num_shards = min(math.ceil(num_examples / 100), 10)
|
||||
# Update with num shards
|
||||
hasher.update(str(num_shards).encode('utf-8'))
|
||||
cache_prefix_filename = hasher.hexdigest()
|
||||
|
||||
return _get_cache_files(cache_dir, cache_prefix_filename, num_shards)
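# Worked example (sketch): with 250 images listed in labels.json,
# num_shards = min(ceil(250 / 100), 10) = 3; with 2,500 images the count is
# capped at 10. The shard count is also hashed into the cache prefix, so the
# cache is regenerated whenever it changes.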
|
||||
|
||||
|
||||
def get_cache_files_pascal_voc(data_dir: str, cache_dir: str) -> CacheFiles:
|
||||
"""Gets an object of CacheFiles using a PASCAL VOC formatted dataset.
|
||||
|
||||
Args:
|
||||
data_dir: Folder path of the pascal voc dataset.
|
||||
cache_dir: Folder path of the cache location. When cache_dir is None, a
|
||||
temporary folder will be created and will not be removed automatically
|
||||
after training, so it can be reused later.
|
||||
|
||||
Returns:
|
||||
An object of CacheFiles class.
|
||||
"""
|
||||
hasher = hashlib.md5()
|
||||
hasher.update(_get_dir_basename(data_dir).encode('utf-8'))
|
||||
annotation_files = tf.io.gfile.glob(
|
||||
os.path.join(data_dir, 'Annotations') + r'/*.xml'
|
||||
)
|
||||
annotation_filenames = [
|
||||
os.path.basename(ann_file) for ann_file in annotation_files
|
||||
]
|
||||
hasher.update(' '.join(annotation_filenames).encode('utf-8'))
|
||||
num_examples = len(annotation_filenames)
|
||||
num_shards = min(math.ceil(num_examples / 100), 10)
|
||||
hasher.update(str(num_shards).encode('utf-8'))
|
||||
cache_prefix_filename = hasher.hexdigest()
|
||||
|
||||
return _get_cache_files(cache_dir, cache_prefix_filename, num_shards)
|
||||
|
||||
|
||||
def is_cached(cache_files: CacheFiles) -> bool:
|
||||
"""Checks whether cache files are already cached."""
|
||||
all_cached_files = list(cache_files.tfrecord_files) + [
|
||||
cache_files.meta_data_file
|
||||
]
|
||||
return all(tf.io.gfile.exists(path) for path in all_cached_files)
|
||||
|
||||
|
||||
class CacheFilesWriter(abc.ABC):
|
||||
"""CacheFilesWriter class to write the cached files."""
|
||||
|
||||
def __init__(
|
||||
self, label_map: Dict[int, str], max_num_images: Optional[int] = None
|
||||
) -> None:
|
||||
"""Initializes CacheFilesWriter for object detector.
|
||||
|
||||
Args:
|
||||
label_map: Dict, map label integer ids to string label names such as {1:
|
||||
'person', 2: 'notperson'}. 0 is the reserved key for `background` and
|
||||
doesn't need to be included in `label_map`. Label names can't be
|
||||
duplicated.
|
||||
max_num_images: Max number of images to process. If None, process all the
|
||||
images.
|
||||
"""
|
||||
self.label_map = label_map
|
||||
self.max_num_images = max_num_images
|
||||
|
||||
def write_files(self, cache_files: CacheFiles, *args, **kwargs) -> None:
|
||||
"""Writes TFRecord and meta_data files.
|
||||
|
||||
Args:
|
||||
cache_files: CacheFiles object including a list of TFRecord files and the
|
||||
meta data yaml file to save the meta_data including data size and
|
||||
label_map.
|
||||
*args: Non-keyword of parameters used in the `_get_example` method.
|
||||
**kwargs: Keyword parameters used in the `_get_example` method.
|
||||
"""
|
||||
writers = [
|
||||
tf.io.TFRecordWriter(path) for path in cache_files.tfrecord_files
|
||||
]
|
||||
|
||||
# Writes tf.Example into TFRecord files.
|
||||
size = 0
|
||||
for idx, tf_example in enumerate(self._get_example(*args, **kwargs)):
|
||||
if self.max_num_images and idx >= self.max_num_images:
|
||||
break
|
||||
if idx % 100 == 0:
|
||||
tf.compat.v1.logging.info('On image %d' % idx)
|
||||
writers[idx % len(writers)].write(tf_example.SerializeToString())
|
||||
size = idx + 1
|
||||
|
||||
for writer in writers:
|
||||
writer.close()
|
||||
|
||||
# Writes meta_data into meta_data_file.
|
||||
meta_data = {'size': size, 'label_map': self.label_map}
|
||||
with tf.io.gfile.GFile(cache_files.meta_data_file, 'w') as f:
|
||||
yaml.dump(meta_data, f)
|
||||
|
||||
@abc.abstractmethod
|
||||
def _get_example(self, *args, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def get_label_map_coco(data_dir: str):
|
||||
"""Gets the label map from a COCO formatted dataset directory.
|
||||
|
||||
Note that id 0 is reserved for the background class. If id=0 is set, it needs
|
||||
to be set to "background". Including id=0 is optional; if it is omitted, it
will be added automatically by this method.
|
||||
|
||||
Args:
|
||||
data_dir: Path of the dataset directory
|
||||
|
||||
Returns:
|
||||
label_map dictionary of the format {<id>:<label_name>}
|
||||
|
||||
Raises:
|
||||
ValueError: If the label_name for id 0 is set to something other than
|
||||
the "background" class.
|
||||
"""
|
||||
data_dir = os.path.abspath(data_dir)
|
||||
# Process labels.json file
|
||||
label_file = os.path.join(data_dir, 'labels.json')
|
||||
with open(label_file, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Categories
|
||||
label_map = {}
|
||||
for category in data['categories']:
|
||||
label_map[int(category['id'])] = category['name']
|
||||
|
||||
if 0 in label_map and label_map[0] != 'background':
|
||||
raise ValueError(
|
||||
(
|
||||
'Label index 0 is reserved for the background class, but '
|
||||
f'it was found to be {label_map[0]}'
|
||||
),
|
||||
)
|
||||
if 0 not in label_map:
|
||||
label_map[0] = 'background'
|
||||
|
||||
return label_map
|
||||
|
||||
|
||||
def get_label_map_pascal_voc(data_dir: str):
|
||||
"""Gets the label map from a PASCAL VOC formatted dataset directory.
|
||||
|
||||
The id to label_name mapping is determined by sorting all label_names and
|
||||
numbering them starting from 1. Id=0 is set as the 'background' class.
|
||||
|
||||
Args:
|
||||
data_dir: Path of the dataset directory
|
||||
|
||||
Returns:
|
||||
label_map dictionary of the format {<id>:<label_name>}
|
||||
"""
|
||||
data_dir = os.path.abspath(data_dir)
|
||||
all_label_names = set()
|
||||
annotations_dir = os.path.join(data_dir, 'Annotations')
|
||||
all_annotation_files = tf.io.gfile.glob(annotations_dir + r'/*.xml')
|
||||
for ann_file in all_annotation_files:
|
||||
tree = ET.parse(ann_file)
|
||||
root = tree.getroot()
|
||||
for child in root.iter('object'):
|
||||
label_name = _xml_get(child, 'name').text
|
||||
all_label_names.add(label_name)
|
||||
label_map = {0: 'background'}
|
||||
for ind, label_name in enumerate(sorted(all_label_names)):
|
||||
label_map[ind + 1] = label_name
|
||||
return label_map
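# Example (sketch): if the Annotations/*.xml files contain objects named
# 'android' and 'pig_android', the returned map is
# {0: 'background', 1: 'android', 2: 'pig_android'} (names sorted, ids from 1).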
|
||||
|
||||
|
||||
def _bbox_data_to_feature_dict(data):
|
||||
"""Converts a dictionary of bbox annotations to a feature dictionary.
|
||||
|
||||
Args:
|
||||
data: Dict with keys 'xmin', 'xmax', 'ymin', 'ymax', 'category_id'
|
||||
|
||||
Returns:
|
||||
Feature dictionary
|
||||
"""
|
||||
bbox_feature_dict = {
|
||||
'image/object/bbox/xmin': tfrecord_lib.convert_to_feature(data['xmin']),
|
||||
'image/object/bbox/xmax': tfrecord_lib.convert_to_feature(data['xmax']),
|
||||
'image/object/bbox/ymin': tfrecord_lib.convert_to_feature(data['ymin']),
|
||||
'image/object/bbox/ymax': tfrecord_lib.convert_to_feature(data['ymax']),
|
||||
'image/object/class/label': tfrecord_lib.convert_to_feature(
|
||||
data['category_id']
|
||||
),
|
||||
}
|
||||
return bbox_feature_dict
|
||||
|
||||
|
||||
def _coco_annotations_to_lists(
|
||||
bbox_annotations: List[Mapping[str, Any]],
|
||||
image_height: int,
|
||||
image_width: int,
|
||||
):
|
||||
"""Converts COCO annotations to feature lists.
|
||||
|
||||
Args:
|
||||
bbox_annotations: List of dicts with keys ['bbox', 'category_id']
|
||||
image_height: Height of image
|
||||
image_width: Width of image
|
||||
|
||||
Returns:
|
||||
(data, num_annotations_skipped) tuple where data contains the keys:
|
||||
['xmin', 'xmax', 'ymin', 'ymax', 'is_crowd', 'category_id', 'area'] and
|
||||
num_annotations_skipped is the number of skipped annotations because of the
|
||||
bbox having 0 area.
|
||||
"""
|
||||
|
||||
data = collections.defaultdict(list)
|
||||
|
||||
num_annotations_skipped = 0
|
||||
|
||||
for object_annotations in bbox_annotations:
|
||||
(x, y, width, height) = tuple(object_annotations['bbox'])
|
||||
|
||||
if width <= 0 or height <= 0:
|
||||
num_annotations_skipped += 1
|
||||
continue
|
||||
if x + width > image_width or y + height > image_height:
|
||||
num_annotations_skipped += 1
|
||||
continue
|
||||
data['xmin'].append(float(x) / image_width)
|
||||
data['xmax'].append(float(x + width) / image_width)
|
||||
data['ymin'].append(float(y) / image_height)
|
||||
data['ymax'].append(float(y + height) / image_height)
|
||||
category_id = int(object_annotations['category_id'])
|
||||
data['category_id'].append(category_id)
|
||||
|
||||
return data, num_annotations_skipped
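# Worked example (sketch): for a 100x200 (width x height) image and a COCO
# bbox of [x, y, w, h] = [10, 20, 30, 40], the normalized values are
# xmin = 10/100 = 0.1, xmax = 40/100 = 0.4, ymin = 20/200 = 0.1 and
# ymax = 60/200 = 0.3. A box with zero area or one that extends past the image
# is counted in num_annotations_skipped instead.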
|
||||
|
||||
|
||||
class COCOCacheFilesWriter(CacheFilesWriter):
|
||||
"""CacheFilesWriter class to write the cached files for COCO data."""
|
||||
|
||||
def _get_example(self, data_dir: str) -> tf.train.Example:
|
||||
"""Iterates over all examples in the COCO formatted dataset directory.
|
||||
|
||||
Args:
|
||||
data_dir: Path of the dataset directory
|
||||
|
||||
Yields:
|
||||
tf.train.Example
|
||||
"""
|
||||
data_dir = os.path.abspath(data_dir)
|
||||
# Process labels.json file
|
||||
label_file = os.path.join(data_dir, 'labels.json')
|
||||
with open(label_file, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Load all Annotations
|
||||
img_to_annotations = collections.defaultdict(list)
|
||||
for annotation in data['annotations']:
|
||||
image_id = annotation['image_id']
|
||||
img_to_annotations[image_id].append(annotation)
|
||||
|
||||
# For each Image:
|
||||
for image in data['images']:
|
||||
img_id = image['id']
|
||||
file_name = image['file_name']
|
||||
full_path = os.path.join(data_dir, 'images', file_name)
|
||||
with tf.io.gfile.GFile(full_path, 'rb') as fid:
|
||||
encoded_jpg = fid.read()
|
||||
image = tf.io.decode_jpeg(encoded_jpg, channels=3)
|
||||
height, width, _ = image.shape
|
||||
feature_dict = tfrecord_lib.image_info_to_feature_dict(
|
||||
height, width, file_name, img_id, encoded_jpg, 'jpg'
|
||||
)
|
||||
data, _ = _coco_annotations_to_lists(
|
||||
img_to_annotations[img_id], height, width
|
||||
)
|
||||
if not data['xmin']:
|
||||
# Skip examples which have no annotations
|
||||
continue
|
||||
bbox_feature_dict = _bbox_data_to_feature_dict(data)
|
||||
feature_dict.update(bbox_feature_dict)
|
||||
example = tf.train.Example(
|
||||
features=tf.train.Features(feature=feature_dict)
|
||||
)
|
||||
yield example
|
||||
|
||||
|
||||
class PascalVocCacheFilesWriter(CacheFilesWriter):
|
||||
"""CacheFilesWriter class to write the cached files for PASCAL VOC data."""
|
||||
|
||||
def _get_example(self, data_dir: str) -> tf.train.Example:
|
||||
"""Iterates over all examples in the PASCAL VOC formatted dataset directory.
|
||||
|
||||
Args:
|
||||
data_dir: Path of the dataset directory
|
||||
|
||||
Yields:
|
||||
tf.train.Example
|
||||
"""
|
||||
label_name_to_id = {name: i for (i, name) in self.label_map.items()}
|
||||
annotations_dir = os.path.join(data_dir, 'Annotations')
|
||||
images_dir = os.path.join(data_dir, 'images')
|
||||
all_annotation_paths = tf.io.gfile.glob(annotations_dir + r'/*.xml')
|
||||
|
||||
data = collections.defaultdict(list)
|
||||
for ind, ann_file in enumerate(all_annotation_paths):
|
||||
tree = ET.parse(ann_file)
|
||||
root = tree.getroot()
|
||||
img_filename = _xml_get(root, 'filename').text
|
||||
img_file = os.path.join(images_dir, img_filename)
|
||||
with tf.io.gfile.GFile(img_file, 'rb') as fid:
|
||||
encoded_jpg = fid.read()
|
||||
image = tf.io.decode_jpeg(encoded_jpg, channels=3)
|
||||
height, width, _ = image.shape
|
||||
for child in root.iter('object'):
|
||||
category_name = _xml_get(child, 'name').text
|
||||
category_id = label_name_to_id[category_name]
|
||||
bndbox = _xml_get(child, 'bndbox')
|
||||
xmin = float(_xml_get(bndbox, 'xmin').text)
|
||||
xmax = float(_xml_get(bndbox, 'xmax').text)
|
||||
ymin = float(_xml_get(bndbox, 'ymin').text)
|
||||
ymax = float(_xml_get(bndbox, 'ymax').text)
|
||||
if xmax <= xmin or ymax <= ymin or xmax > width or ymax > height:
|
||||
# Skip annotations that have no area or are larger than the image
|
||||
continue
|
||||
data['xmin'].append(xmin / width)
|
||||
data['ymin'].append(ymin / height)
|
||||
data['xmax'].append(xmax / width)
|
||||
data['ymax'].append(ymax / height)
|
||||
data['category_id'].append(category_id)
|
||||
if not data['xmin']:
|
||||
# Skip examples which have no valid annotations
|
||||
continue
|
||||
feature_dict = tfrecord_lib.image_info_to_feature_dict(
|
||||
height, width, img_filename, ind, encoded_jpg, 'jpg'
|
||||
)
|
||||
bbox_feature_dict = _bbox_data_to_feature_dict(data)
|
||||
feature_dict.update(bbox_feature_dict)
|
||||
example = tf.train.Example(
|
||||
features=tf.train.Features(feature=feature_dict)
|
||||
)
|
||||
yield example
|
|
mediapipe/model_maker/python/vision/object_detector/dataset_util_test.py (new file, 236 lines)
@@ -0,0 +1,236 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
from unittest import mock as unittest_mock
|
||||
|
||||
import tensorflow as tf
|
||||
import yaml
|
||||
|
||||
from mediapipe.model_maker.python.vision.core import test_utils
|
||||
from mediapipe.model_maker.python.vision.object_detector import dataset_util
|
||||
from mediapipe.tasks.python.test import test_utils as tasks_test_utils
|
||||
|
||||
|
||||
class DatasetUtilTest(tf.test.TestCase):
|
||||
|
||||
def _assert_cache_files_equal(self, cf1, cf2):
|
||||
self.assertEqual(cf1.cache_prefix, cf2.cache_prefix)
|
||||
self.assertCountEqual(cf1.tfrecord_files, cf2.tfrecord_files)
|
||||
self.assertEqual(cf1.meta_data_file, cf2.meta_data_file)
|
||||
|
||||
def _assert_cache_files_not_equal(self, cf1, cf2):
|
||||
self.assertNotEqual(cf1.cache_prefix, cf2.cache_prefix)
|
||||
self.assertNotEqual(cf1.tfrecord_files, cf2.tfrecord_files)
|
||||
self.assertNotEqual(cf1.meta_data_file, cf2.meta_data_file)
|
||||
|
||||
def _get_cache_files_and_assert_neq_fn(self, cache_files_fn):
|
||||
def get_cache_files_and_assert_neq(cf, data_dir, cache_dir):
|
||||
new_cf = cache_files_fn(data_dir, cache_dir)
|
||||
self._assert_cache_files_not_equal(cf, new_cf)
|
||||
return new_cf
|
||||
|
||||
return get_cache_files_and_assert_neq
|
||||
|
||||
@unittest_mock.patch.object(hashlib, 'md5', autospec=True)
|
||||
def test_get_cache_files_coco(self, mock_md5):
|
||||
mock_md5.return_value.hexdigest.return_value = 'train'
|
||||
cache_files = dataset_util.get_cache_files_coco(
|
||||
tasks_test_utils.get_test_data_path('coco_data'), cache_dir='/tmp/'
|
||||
)
|
||||
self.assertEqual(cache_files.cache_prefix, '/tmp/train')
|
||||
self.assertLen(cache_files.tfrecord_files, 1)
|
||||
self.assertEqual(
|
||||
cache_files.tfrecord_files[0], '/tmp/train-00000-of-00001.tfrecord'
|
||||
)
|
||||
self.assertEqual(cache_files.meta_data_file, '/tmp/train_meta_data.yaml')
|
||||
|
||||
def test_matching_get_cache_files_coco(self):
|
||||
cache_dir = self.create_tempdir()
|
||||
coco_folder = tasks_test_utils.get_test_data_path('coco_data')
|
||||
coco_folder_tmp = os.path.join(self.create_tempdir(), 'coco_data')
|
||||
shutil.copytree(coco_folder, coco_folder_tmp)
|
||||
cache_files1 = dataset_util.get_cache_files_coco(coco_folder, cache_dir)
|
||||
cache_files2 = dataset_util.get_cache_files_coco(coco_folder, cache_dir)
|
||||
self._assert_cache_files_equal(cache_files1, cache_files2)
|
||||
cache_files3 = dataset_util.get_cache_files_coco(coco_folder_tmp, cache_dir)
|
||||
self._assert_cache_files_equal(cache_files1, cache_files3)
|
||||
|
||||
def test_not_matching_get_cache_files_coco(self):
|
||||
cache_dir = self.create_tempdir()
|
||||
temp_dir = self.create_tempdir()
|
||||
coco_folder = os.path.join(temp_dir, 'coco_data')
|
||||
shutil.copytree(
|
||||
tasks_test_utils.get_test_data_path('coco_data'), coco_folder
|
||||
)
|
||||
prev_cache_file = dataset_util.get_cache_files_coco(coco_folder, cache_dir)
|
||||
os.chmod(coco_folder, 0o700)
|
||||
os.chmod(os.path.join(coco_folder, 'images'), 0o700)
|
||||
os.chmod(os.path.join(coco_folder, 'labels.json'), 0o700)
|
||||
get_cache_files_and_assert_neq = self._get_cache_files_and_assert_neq_fn(
|
||||
dataset_util.get_cache_files_coco
|
||||
)
|
||||
# Test adding image
|
||||
test_utils.write_filled_jpeg_file(
|
||||
os.path.join(coco_folder, 'images', 'test.jpg'), [0, 0, 0], 50
|
||||
)
|
||||
prev_cache_file = get_cache_files_and_assert_neq(
|
||||
prev_cache_file, coco_folder, cache_dir
|
||||
)
|
||||
# Test modifying labels.json
|
||||
with open(os.path.join(coco_folder, 'labels.json'), 'w') as f:
|
||||
json.dump({'images': [{'id': 1, 'file_name': '000000000078.jpg'}]}, f)
|
||||
prev_cache_file = get_cache_files_and_assert_neq(
|
||||
prev_cache_file, coco_folder, cache_dir
|
||||
)
|
||||
|
||||
# Test rename folder
|
||||
new_coco_folder = os.path.join(temp_dir, 'coco_data_renamed')
|
||||
shutil.move(coco_folder, new_coco_folder)
|
||||
coco_folder = new_coco_folder
|
||||
prev_cache_file = get_cache_files_and_assert_neq(
|
||||
prev_cache_file, new_coco_folder, cache_dir
|
||||
)
|
||||
|
||||
@unittest_mock.patch.object(hashlib, 'md5', autospec=True)
|
||||
def test_get_cache_files_pascal_voc(self, mock_md5):
|
||||
mock_md5.return_value.hexdigest.return_value = 'train'
|
||||
cache_files = dataset_util.get_cache_files_pascal_voc(
|
||||
tasks_test_utils.get_test_data_path('pascal_voc_data'),
|
||||
cache_dir='/tmp/',
|
||||
)
|
||||
self.assertEqual(cache_files.cache_prefix, '/tmp/train')
|
||||
self.assertLen(cache_files.tfrecord_files, 1)
|
||||
self.assertEqual(
|
||||
cache_files.tfrecord_files[0], '/tmp/train-00000-of-00001.tfrecord'
|
||||
)
|
||||
self.assertEqual(cache_files.meta_data_file, '/tmp/train_meta_data.yaml')
|
||||
|
||||
def test_matching_get_cache_files_pascal_voc(self):
|
||||
cache_dir = self.create_tempdir()
|
||||
pascal_folder = tasks_test_utils.get_test_data_path('pascal_voc_data')
|
||||
pascal_folder_temp = os.path.join(self.create_tempdir(), 'pascal_voc_data')
|
||||
shutil.copytree(pascal_folder, pascal_folder_temp)
|
||||
cache_files1 = dataset_util.get_cache_files_pascal_voc(
|
||||
pascal_folder, cache_dir
|
||||
)
|
||||
cache_files2 = dataset_util.get_cache_files_pascal_voc(
|
||||
pascal_folder, cache_dir
|
||||
)
|
||||
self._assert_cache_files_equal(cache_files1, cache_files2)
|
||||
cache_files3 = dataset_util.get_cache_files_pascal_voc(
|
||||
pascal_folder_temp, cache_dir
|
||||
)
|
||||
self._assert_cache_files_equal(cache_files1, cache_files3)
|
||||
|
||||
def test_not_matching_get_cache_files_pascal_voc(self):
|
||||
cache_dir = self.create_tempdir()
|
||||
temp_dir = self.create_tempdir()
|
||||
pascal_folder = os.path.join(temp_dir, 'pascal_voc_data')
|
||||
shutil.copytree(
|
||||
tasks_test_utils.get_test_data_path('pascal_voc_data'), pascal_folder
|
||||
)
|
||||
prev_cache_files = dataset_util.get_cache_files_pascal_voc(
|
||||
pascal_folder, cache_dir
|
||||
)
|
||||
os.chmod(pascal_folder, 0o700)
|
||||
os.chmod(os.path.join(pascal_folder, 'images'), 0o700)
|
||||
os.chmod(os.path.join(pascal_folder, 'Annotations'), 0o700)
|
||||
get_cache_files_and_assert_neq = self._get_cache_files_and_assert_neq_fn(
|
||||
dataset_util.get_cache_files_pascal_voc
|
||||
)
|
||||
# Test adding xml file
|
||||
with open(os.path.join(pascal_folder, 'Annotations', 'test.xml'), 'w') as f:
|
||||
f.write('test')
|
||||
prev_cache_files = get_cache_files_and_assert_neq(
|
||||
prev_cache_files, pascal_folder, cache_dir
|
||||
)
|
||||
|
||||
# Test rename folder
|
||||
new_pascal_folder = os.path.join(temp_dir, 'pascal_voc_data_renamed')
|
||||
shutil.move(pascal_folder, new_pascal_folder)
|
||||
pascal_folder = new_pascal_folder
|
||||
prev_cache_files = get_cache_files_and_assert_neq(
|
||||
prev_cache_files, new_pascal_folder, cache_dir
|
||||
)
|
||||
|
||||
def test_is_cached(self):
|
||||
tempdir = self.create_tempdir()
|
||||
cache_files = dataset_util.get_cache_files_coco(
|
||||
tasks_test_utils.get_test_data_path('coco_data'), cache_dir=tempdir
|
||||
)
|
||||
self.assertFalse(dataset_util.is_cached(cache_files))
|
||||
with open(cache_files.tfrecord_files[0], 'w') as f:
|
||||
f.write('test')
|
||||
self.assertFalse(dataset_util.is_cached(cache_files))
|
||||
with open(cache_files.meta_data_file, 'w') as f:
|
||||
f.write('test')
|
||||
self.assertTrue(dataset_util.is_cached(cache_files))
|
||||
|
||||
def test_get_label_map_coco(self):
|
||||
coco_dir = tasks_test_utils.get_test_data_path('coco_data')
|
||||
label_map = dataset_util.get_label_map_coco(coco_dir)
|
||||
all_keys = sorted(label_map.keys())
|
||||
self.assertEqual(all_keys[0], 0)
|
||||
self.assertEqual(all_keys[-1], 11)
|
||||
self.assertLen(all_keys, 12)
|
||||
|
||||
def test_get_label_map_pascal_voc(self):
|
||||
pascal_dir = tasks_test_utils.get_test_data_path('pascal_voc_data')
|
||||
label_map = dataset_util.get_label_map_pascal_voc(pascal_dir)
|
||||
all_keys = sorted(label_map.keys())
|
||||
self.assertEqual(label_map[0], 'background')
|
||||
self.assertEqual(all_keys[0], 0)
|
||||
self.assertEqual(all_keys[-1], 2)
|
||||
self.assertLen(all_keys, 3)
|
||||
|
||||
def _validate_cache_files(self, cache_files, expected_size):
|
||||
# Checks the TFRecord file
|
||||
self.assertTrue(os.path.isfile(cache_files.tfrecord_files[0]))
|
||||
self.assertGreater(os.path.getsize(cache_files.tfrecord_files[0]), 0)
|
||||
|
||||
# Checks the meta_data file
|
||||
self.assertTrue(os.path.isfile(cache_files.meta_data_file))
|
||||
self.assertGreater(os.path.getsize(cache_files.meta_data_file), 0)
|
||||
with tf.io.gfile.GFile(cache_files.meta_data_file, 'r') as f:
|
||||
meta_data_dict = yaml.load(f, Loader=yaml.FullLoader)
|
||||
# Some examples are skipped for having invalid bboxes, so the cached size can
# be smaller than the number of images.
|
||||
self.assertEqual(meta_data_dict['size'], expected_size)
|
||||
|
||||
def test_coco_cache_files_writer(self):
|
||||
tempdir = self.create_tempdir()
|
||||
coco_dir = tasks_test_utils.get_test_data_path('coco_data')
|
||||
label_map = dataset_util.get_label_map_coco(coco_dir)
|
||||
cache_writer = dataset_util.COCOCacheFilesWriter(label_map)
|
||||
cache_files = dataset_util.get_cache_files_coco(coco_dir, cache_dir=tempdir)
|
||||
cache_writer.write_files(cache_files, coco_dir)
|
||||
self._validate_cache_files(cache_files, 3)
|
||||
|
||||
def test_pascal_voc_cache_files_writer(self):
|
||||
tempdir = self.create_tempdir()
|
||||
pascal_dir = tasks_test_utils.get_test_data_path('pascal_voc_data')
|
||||
label_map = dataset_util.get_label_map_pascal_voc(pascal_dir)
|
||||
cache_writer = dataset_util.PascalVocCacheFilesWriter(label_map)
|
||||
cache_files = dataset_util.get_cache_files_pascal_voc(
|
||||
pascal_dir, cache_dir=tempdir
|
||||
)
|
||||
cache_writer.write_files(cache_files, pascal_dir)
|
||||
self._validate_cache_files(cache_files, 4)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
|
mediapipe/model_maker/python/vision/object_detector/hyperparameters.py (new file, 101 lines)
@@ -0,0 +1,101 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Hyperparameters for training object detection models."""
|
||||
|
||||
import dataclasses
|
||||
from typing import List
|
||||
|
||||
from mediapipe.model_maker.python.core import hyperparameters as hp
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class HParams(hp.BaseHParams):
|
||||
"""The hyperparameters for training object detectors.
|
||||
|
||||
Attributes:
|
||||
learning_rate: Learning rate to use for gradient descent training.
|
||||
batch_size: Batch size for training.
|
||||
epochs: Number of training iterations over the dataset.
|
||||
do_fine_tuning: If true, the base module is trained together with the
|
||||
classification layer on top.
|
||||
learning_rate_boundaries: List of epoch boundaries where
|
||||
learning_rate_boundaries[i] is the epoch where the learning rate will
|
||||
decay to learning_rate * learning_rate_decay_multipliers[i].
|
||||
learning_rate_decay_multipliers: List of learning rate multipliers which
|
||||
calculates the learning rate at the ith boundary as learning_rate *
|
||||
learning_rate_decay_multipliers[i].
|
||||
"""
|
||||
|
||||
# Parameters from BaseHParams class.
|
||||
learning_rate: float = 0.003
|
||||
batch_size: int = 32
|
||||
epochs: int = 10
|
||||
|
||||
# Parameters for learning rate decay
|
||||
learning_rate_boundaries: List[int] = dataclasses.field(
|
||||
default_factory=lambda: [5, 8]
|
||||
)
|
||||
learning_rate_decay_multipliers: List[float] = dataclasses.field(
|
||||
default_factory=lambda: [0.1, 0.01]
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
# Validate stepwise learning rate parameters
|
||||
lr_boundary_len = len(self.learning_rate_boundaries)
|
||||
lr_decay_multipliers_len = len(self.learning_rate_decay_multipliers)
|
||||
if lr_boundary_len != lr_decay_multipliers_len:
|
||||
raise ValueError(
"Length of learning_rate_boundaries and "
"learning_rate_decay_multipliers do not match: "
f"{lr_boundary_len}!={lr_decay_multipliers_len}"
|
||||
)
|
||||
# Validate learning_rate_boundaries
|
||||
if sorted(self.learning_rate_boundaries) != self.learning_rate_boundaries:
|
||||
raise ValueError(
"learning_rate_boundaries is not in ascending order: "
f"{self.learning_rate_boundaries}"
|
||||
)
|
||||
if (
|
||||
self.learning_rate_boundaries
|
||||
and self.learning_rate_boundaries[-1] > self.epochs
|
||||
):
|
||||
raise ValueError(
"Values in learning_rate_boundaries cannot be greater than epochs"
|
||||
)
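# Worked example (sketch): with the defaults above (learning_rate=0.003,
# epochs=10, boundaries=[5, 8], multipliers=[0.1, 0.01]) the intended schedule
# is 0.003 for epochs 0-4, 0.003 * 0.1 = 3e-4 for epochs 5-7, and
# 0.003 * 0.01 = 3e-5 from epoch 8 on; the trainer, not this file, applies it.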
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class QATHParams:
|
||||
"""The hyperparameters for running quantization aware training (QAT) on object detectors.
|
||||
|
||||
For more information on QAT, see:
|
||||
https://www.tensorflow.org/model_optimization/guide/quantization/training
|
||||
|
||||
Attributes:
|
||||
learning_rate: Learning rate to use for gradient descent QAT.
|
||||
batch_size: Batch size for QAT.
|
||||
epochs: Number of training iterations over the dataset.
|
||||
decay_steps: Learning rate decay steps for Exponential Decay. See
|
||||
https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/schedules/ExponentialDecay
|
||||
for more information.
|
||||
decay_rate: Learning rate decay rate for Exponential Decay. See
|
||||
https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/schedules/ExponentialDecay
|
||||
for more information.
|
||||
"""
|
||||
|
||||
learning_rate: float = 0.03
|
||||
batch_size: int = 32
|
||||
epochs: int = 10
|
||||
decay_steps: int = 231
|
||||
decay_rate: float = 0.96
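# Sketch (assumption: these values feed a
# tf.keras.optimizers.schedules.ExponentialDecay, as the attribute docs
# suggest): with learning_rate=0.03, decay_steps=231 and decay_rate=0.96, the
# rate after `step` training steps is roughly 0.03 * 0.96 ** (step / 231),
# i.e. about a 4% decay every 231 steps.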
|
mediapipe/model_maker/python/vision/object_detector/model.py (new file, 355 lines)
@@ -0,0 +1,355 @@
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Custom Model for Object Detection."""
|
||||
|
||||
import os
|
||||
from typing import Mapping, Optional, Sequence, Union
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_options as model_opt
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms
|
||||
from official.core import config_definitions as cfg
|
||||
from official.projects.qat.vision.configs import common as qat_common
|
||||
from official.projects.qat.vision.modeling import factory as qat_factory
|
||||
from official.vision import configs
|
||||
from official.vision.losses import focal_loss
|
||||
from official.vision.losses import loss_utils
|
||||
from official.vision.modeling import factory
|
||||
from official.vision.modeling import retinanet_model
|
||||
from official.vision.modeling.layers import detection_generator
|
||||
from official.vision.serving import detection
|
||||
|
||||
|
||||
class ObjectDetectorModel(tf.keras.Model):
|
||||
"""An object detector model which can be trained using Model Maker's training API.
|
||||
|
||||
Attributes:
|
||||
loss_trackers: List of tf.keras.metrics.Mean objects used to track the loss
|
||||
during training.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_spec: ms.ModelSpec,
|
||||
model_options: model_opt.ObjectDetectorModelOptions,
|
||||
num_classes: int,
|
||||
) -> None:
|
||||
"""Initializes an ObjectDetectorModel.
|
||||
|
||||
Args:
|
||||
model_spec: Specification for the model.
|
||||
model_options: Model options for creating the model.
|
||||
num_classes: Number of classes for object detection.
|
||||
"""
|
||||
super().__init__()
|
||||
self._model_spec = model_spec
|
||||
self._model_options = model_options
|
||||
self._num_classes = num_classes
|
||||
self._model = self._build_model()
|
||||
checkpoint_folder = self._model_spec.downloaded_files.get_path()
|
||||
checkpoint_file = os.path.join(checkpoint_folder, 'ckpt-277200')
|
||||
self.load_checkpoint(checkpoint_file)
|
||||
self._model.summary()
|
||||
self.loss_trackers = [
|
||||
tf.keras.metrics.Mean(name=n)
|
||||
for n in ['total_loss', 'cls_loss', 'box_loss', 'model_loss']
|
||||
]
|
||||
|
||||
def _get_model_config(
|
||||
self,
|
||||
generator_config: configs.retinanet.DetectionGenerator = configs.retinanet.DetectionGenerator(),
|
||||
) -> configs.retinanet.RetinaNet:
|
||||
model_config = configs.retinanet.RetinaNet(
|
||||
min_level=3,
|
||||
max_level=7,
|
||||
num_classes=self._num_classes,
|
||||
input_size=self._model_spec.input_image_shape,
|
||||
anchor=configs.retinanet.Anchor(
|
||||
num_scales=3, aspect_ratios=[0.5, 1.0, 2.0], anchor_size=3
|
||||
),
|
||||
backbone=configs.backbones.Backbone(
|
||||
type='mobilenet', mobilenet=configs.backbones.MobileNet()
|
||||
),
|
||||
decoder=configs.decoders.Decoder(
|
||||
type='fpn',
|
||||
fpn=configs.decoders.FPN(
|
||||
num_filters=128, use_separable_conv=True, use_keras_layer=True
|
||||
),
|
||||
),
|
||||
head=configs.retinanet.RetinaNetHead(
|
||||
num_filters=128, use_separable_conv=True
|
||||
),
|
||||
detection_generator=generator_config,
|
||||
norm_activation=configs.common.NormActivation(activation='relu6'),
|
||||
)
|
||||
return model_config
|
||||
|
||||
def _build_model(self) -> tf.keras.Model:
|
||||
"""Builds a RetinaNet object detector model."""
|
||||
input_specs = tf.keras.layers.InputSpec(
|
||||
shape=[None] + self._model_spec.input_image_shape
|
||||
)
|
||||
l2_regularizer = tf.keras.regularizers.l2(
|
||||
self._model_options.l2_weight_decay / 2.0
|
||||
)
|
||||
model_config = self._get_model_config()
|
||||
|
||||
return factory.build_retinanet(input_specs, model_config, l2_regularizer)
|
||||
|
||||
def save_checkpoint(self, checkpoint_path: str) -> None:
|
||||
"""Saves a model checkpoint to checkpoint_path.
|
||||
|
||||
Args:
|
||||
checkpoint_path: The path to save checkpoint.
|
||||
"""
|
||||
ckpt_items = {
|
||||
'backbone': self._model.backbone,
|
||||
'decoder': self._model.decoder,
|
||||
'head': self._model.head,
|
||||
}
|
||||
tf.train.Checkpoint(**ckpt_items).write(checkpoint_path)
|
||||
|
||||
def load_checkpoint(
|
||||
self, checkpoint_path: str, include_last_layer: bool = False
|
||||
) -> None:
|
||||
"""Loads a model checkpoint from checkpoint_path.
|
||||
|
||||
Args:
|
||||
checkpoint_path: The path to load a checkpoint from.
|
||||
include_last_layer: Whether or not to load the last classification layer.
|
||||
The size of the last classification layer will differ depending on the
|
||||
number of classes. When loading from the pre-trained checkpoint, this
|
||||
parameter should be False to avoid shape mismatch on the last layer.
|
||||
Defaults to False.
|
||||
"""
|
||||
dummy_input = tf.zeros([1] + self._model_spec.input_image_shape)
|
||||
self._model(dummy_input, training=True)
|
||||
if include_last_layer:
|
||||
head = self._model.head
|
||||
else:
|
||||
head_classifier = tf.train.Checkpoint(
|
||||
depthwise_kernel=self._model.head._classifier.depthwise_kernel # pylint:disable=protected-access
|
||||
)
|
||||
head_items = {
|
||||
'_classifier': head_classifier,
|
||||
'_box_norms': self._model.head._box_norms, # pylint:disable=protected-access
|
||||
'_box_regressor': self._model.head._box_regressor, # pylint:disable=protected-access
|
||||
'_cls_convs': self._model.head._cls_convs, # pylint:disable=protected-access
|
||||
'_cls_norms': self._model.head._cls_norms, # pylint:disable=protected-access
|
||||
'_box_convs': self._model.head._box_convs, # pylint:disable=protected-access
|
||||
}
|
||||
head = tf.train.Checkpoint(**head_items)
|
||||
ckpt_items = {
|
||||
'backbone': self._model.backbone,
|
||||
'decoder': self._model.decoder,
|
||||
'head': head,
|
||||
}
|
||||
ckpt = tf.train.Checkpoint(**ckpt_items)
|
||||
status = ckpt.read(checkpoint_path)
|
||||
status.expect_partial().assert_existing_objects_matched()
|
||||
|
||||
def convert_to_qat(self) -> None:
|
||||
"""Converts the model to a QAT RetinaNet model."""
|
||||
model = self._build_model()
|
||||
dummy_input = tf.zeros([1] + self._model_spec.input_image_shape)
|
||||
model(dummy_input, training=True)
|
||||
model.set_weights(self._model.get_weights())
|
||||
quantization_config = qat_common.Quantization(
|
||||
quantize_detection_decoder=True, quantize_detection_head=True
|
||||
)
|
||||
model_config = self._get_model_config()
|
||||
qat_model = qat_factory.build_qat_retinanet(
|
||||
model, quantization_config, model_config
|
||||
)
|
||||
self._model = qat_model
|
||||
|
||||
def export_saved_model(self, save_path: str):
|
||||
"""Exports a saved_model for tflite conversion.
|
||||
|
||||
The export process modifies the model in the following two ways:
|
||||
1. Replaces the nms operation in the detection generator with a custom
|
||||
TFLite compatible nms operation.
|
||||
2. Wraps the model with a DetectionModule which handles pre-processing
|
||||
and post-processing when running inference.
|
||||
|
||||
Args:
|
||||
save_path: Path to export the saved model.
|
||||
"""
|
||||
generator_config = configs.retinanet.DetectionGenerator(
|
||||
nms_version='tflite',
|
||||
tflite_post_processing=configs.common.TFLitePostProcessingConfig(
|
||||
nms_score_threshold=0,
|
||||
max_detections=10,
|
||||
max_classes_per_detection=1,
|
||||
normalize_anchor_coordinates=True,
|
||||
),
|
||||
)
|
||||
tflite_post_processing_config = (
|
||||
generator_config.tflite_post_processing.as_dict()
|
||||
)
|
||||
tflite_post_processing_config['input_image_size'] = (
|
||||
self._model_spec.input_image_shape[0],
|
||||
self._model_spec.input_image_shape[1],
|
||||
)
|
||||
detection_generator_obj = detection_generator.MultilevelDetectionGenerator(
|
||||
apply_nms=generator_config.apply_nms,
|
||||
pre_nms_top_k=generator_config.pre_nms_top_k,
|
||||
pre_nms_score_threshold=generator_config.pre_nms_score_threshold,
|
||||
nms_iou_threshold=generator_config.nms_iou_threshold,
|
||||
max_num_detections=generator_config.max_num_detections,
|
||||
nms_version=generator_config.nms_version,
|
||||
use_cpu_nms=generator_config.use_cpu_nms,
|
||||
soft_nms_sigma=generator_config.soft_nms_sigma,
|
||||
tflite_post_processing_config=tflite_post_processing_config,
|
||||
return_decoded=generator_config.return_decoded,
|
||||
use_class_agnostic_nms=generator_config.use_class_agnostic_nms,
|
||||
)
|
||||
model_config = self._get_model_config(generator_config)
|
||||
model = retinanet_model.RetinaNetModel(
|
||||
self._model.backbone,
|
||||
self._model.decoder,
|
||||
self._model.head,
|
||||
detection_generator_obj,
|
||||
min_level=model_config.min_level,
|
||||
max_level=model_config.max_level,
|
||||
num_scales=model_config.anchor.num_scales,
|
||||
aspect_ratios=model_config.anchor.aspect_ratios,
|
||||
anchor_size=model_config.anchor.anchor_size,
|
||||
)
|
||||
task_config = configs.retinanet.RetinaNetTask(model=model_config)
|
||||
params = cfg.ExperimentConfig(
|
||||
task=task_config,
|
||||
)
|
||||
export_module = detection.DetectionModule(
|
||||
params=params,
|
||||
batch_size=1,
|
||||
input_image_size=self._model_spec.input_image_shape[:2],
|
||||
input_type='tflite',
|
||||
num_channels=self._model_spec.input_image_shape[2],
|
||||
model=model,
|
||||
)
|
||||
function_keys = {'tflite': tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY}
|
||||
signatures = export_module.get_inference_signatures(function_keys)
|
||||
|
||||
tf.saved_model.save(export_module, save_path, signatures=signatures)
|
||||
|
||||
# The remaining method overrides are used to train this object detector model
|
||||
# using model.fit().
|
||||
def call(
|
||||
self,
|
||||
images: Union[tf.Tensor, Sequence[tf.Tensor]],
|
||||
image_shape: Optional[tf.Tensor] = None,
|
||||
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
|
||||
output_intermediate_features: bool = False,
|
||||
training: bool = None,
|
||||
) -> Mapping[str, tf.Tensor]:
|
||||
"""Overrides call from tf.keras.Model."""
|
||||
return self._model(
|
||||
images,
|
||||
image_shape,
|
||||
anchor_boxes,
|
||||
output_intermediate_features,
|
||||
training,
|
||||
)
|
||||
|
||||
def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None):
|
||||
"""Overrides compute_loss from tf.keras.Model."""
|
||||
cls_loss_fn = focal_loss.FocalLoss(
|
||||
alpha=0.25, gamma=1.5, reduction=tf.keras.losses.Reduction.SUM
|
||||
)
|
||||
box_loss_fn = tf.keras.losses.Huber(
|
||||
0.1, reduction=tf.keras.losses.Reduction.SUM
|
||||
)
|
||||
labels = y
|
||||
outputs = y_pred
|
||||
# Sums all positives in a batch for normalization and avoids zero
|
||||
# num_positives_sum, which would lead to inf loss during training
|
||||
cls_sample_weight = labels['cls_weights']
|
||||
box_sample_weight = labels['box_weights']
|
||||
num_positives = tf.reduce_sum(box_sample_weight) + 1.0
|
||||
cls_sample_weight = cls_sample_weight / num_positives
|
||||
box_sample_weight = box_sample_weight / num_positives
|
||||
y_true_cls = loss_utils.multi_level_flatten(
|
||||
labels['cls_targets'], last_dim=None
|
||||
)
|
||||
y_true_cls = tf.one_hot(y_true_cls, self._num_classes)
|
||||
y_pred_cls = loss_utils.multi_level_flatten(
|
||||
outputs['cls_outputs'], last_dim=self._num_classes
|
||||
)
|
||||
y_true_box = loss_utils.multi_level_flatten(
|
||||
labels['box_targets'], last_dim=4
|
||||
)
|
||||
y_pred_box = loss_utils.multi_level_flatten(
|
||||
outputs['box_outputs'], last_dim=4
|
||||
)
|
||||
|
||||
cls_loss = cls_loss_fn(
|
||||
y_true=y_true_cls, y_pred=y_pred_cls, sample_weight=cls_sample_weight
|
||||
)
|
||||
box_loss = box_loss_fn(
|
||||
y_true=y_true_box, y_pred=y_pred_box, sample_weight=box_sample_weight
|
||||
)
|
||||
|
||||
model_loss = cls_loss + 50 * box_loss
|
||||
total_loss = model_loss
|
||||
regularization_losses = self._model.losses
|
||||
if regularization_losses:
|
||||
reg_loss = tf.reduce_sum(regularization_losses)
|
||||
total_loss = model_loss + reg_loss
|
||||
all_losses = {
|
||||
'total_loss': total_loss,
|
||||
'cls_loss': cls_loss,
|
||||
'box_loss': box_loss,
|
||||
'model_loss': model_loss,
|
||||
}
|
||||
for m in self.metrics:
|
||||
m.update_state(all_losses[m.name])
|
||||
return total_loss
|
||||
|
||||
@property
|
||||
def metrics(self):
|
||||
"""Overrides metrics from tf.keras.Model."""
|
||||
return self.loss_trackers
|
||||
|
||||
def compute_metrics(self, x, y, y_pred, sample_weight=None):
|
||||
"""Overrides compute_metrics from tf.keras.Model."""
|
||||
return self.get_metrics_result()
|
||||
|
||||
def train_step(self, data):
|
||||
"""Overrides train_step from tf.keras.Model."""
|
||||
tf.keras.backend.set_learning_phase(1)
|
||||
x, y = data
|
||||
# Run forward pass.
|
||||
with tf.GradientTape() as tape:
|
||||
y_pred = self(x, training=True)
|
||||
loss = self.compute_loss(x, y, y_pred)
|
||||
self._validate_target_and_loss(y, loss)
|
||||
# Run backwards pass.
|
||||
self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
|
||||
return self.compute_metrics(x, y, y_pred)
|
||||
|
||||
def test_step(self, data):
|
||||
"""Overrides test_step from tf.keras.Model."""
|
||||
tf.keras.backend.set_learning_phase(0)
|
||||
x, y = data
|
||||
y_pred = self(
|
||||
x,
|
||||
anchor_boxes=y['anchor_boxes'],
|
||||
image_shape=y['image_info'][:, 1, :],
|
||||
training=False,
|
||||
)
|
||||
# Updates stateful loss metrics.
|
||||
self.compute_loss(x, y, y_pred)
|
||||
return self.compute_metrics(x, y, y_pred)
|
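A minimal usage sketch of the class above, assuming the pretrained checkpoint download succeeds and that `train_ds` is a tf.data.Dataset yielding (image, labels) pairs in the format produced by the Preprocessor added later in this change; `num_classes=3` and the optimizer settings are illustrative placeholders, mirroring the values used in the tests:

import tensorflow as tf

from mediapipe.model_maker.python.vision.object_detector import model as model_lib
from mediapipe.model_maker.python.vision.object_detector import model_options as model_opt
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms

spec = ms.SupportedModels.MOBILENET_V2.value()  # Resolves the MobileNetV2 SSD spec.
options = model_opt.ObjectDetectorModelOptions()
detector = model_lib.ObjectDetectorModel(spec, options, num_classes=3)  # Downloads and loads the checkpoint.
detector.compile(
    optimizer=tf.keras.optimizers.experimental.SGD(learning_rate=0.03, momentum=0.9)
)
# detector.fit(x=train_ds, epochs=2)  # `train_ds` must yield (image, labels) batches.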
|
@ -0,0 +1,28 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Configurable model options for object detector models."""
|
||||
|
||||
import dataclasses
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ObjectDetectorModelOptions:
|
||||
"""Configurable options for object detector model.
|
||||
|
||||
Attributes:
|
||||
l2_weight_decay: L2 regularization penalty used in
|
||||
https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/L2.
|
||||
"""
|
||||
|
||||
l2_weight_decay: float = 3.0e-05
|
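A small, hedged illustration of how this option is consumed: `_build_model` earlier in this change halves `l2_weight_decay` before handing it to a Keras L2 regularizer, so a custom value can be supplied like this:

from mediapipe.model_maker.python.vision.object_detector import model_options as model_opt

options = model_opt.ObjectDetectorModelOptions(l2_weight_decay=1e-4)
# ObjectDetectorModel._build_model applies this as tf.keras.regularizers.l2(1e-4 / 2.0).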
|
@ -0,0 +1,62 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Object detector model specification."""
|
||||
import dataclasses
|
||||
import enum
|
||||
import functools
|
||||
from typing import List
|
||||
|
||||
from mediapipe.model_maker.python.core.utils import file_util
|
||||
|
||||
|
||||
MOBILENET_V2_FILES = file_util.DownloadedFiles(
|
||||
'object_detector/mobilenetv2',
|
||||
'https://storage.googleapis.com/tf_model_garden/vision/qat/mobilenetv2_ssd_coco/mobilenetv2_ssd_i256_ckpt.tar.gz',
|
||||
is_folder=True,
|
||||
)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ModelSpec(object):
|
||||
"""Specification of object detector model."""
|
||||
|
||||
# Mean and Stddev image preprocessing normalization values.
|
||||
mean_norm = (0.5,)
|
||||
stddev_norm = (0.5,)
|
||||
mean_rgb = (127.5,)
|
||||
stddev_rgb = (127.5,)
|
||||
|
||||
downloaded_files: file_util.DownloadedFiles
|
||||
input_image_shape: List[int]
|
||||
|
||||
|
||||
mobilenet_v2_spec = functools.partial(
|
||||
ModelSpec,
|
||||
downloaded_files=MOBILENET_V2_FILES,
|
||||
input_image_shape=[256, 256, 3],
|
||||
)
|
||||
|
||||
|
||||
@enum.unique
|
||||
class SupportedModels(enum.Enum):
|
||||
"""Predefined object detector model specs supported by Model Maker."""
|
||||
|
||||
MOBILENET_V2 = mobilenet_v2_spec
|
||||
|
||||
@classmethod
|
||||
def get(cls, spec: 'SupportedModels') -> 'ModelSpec':
|
||||
"""Get model spec from the input enum and initializes it."""
|
||||
if spec not in cls:
|
||||
raise TypeError(f'Unsupported object detector spec: {spec}')
|
||||
return spec.value()
|
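A brief sketch of resolving a spec from the enum; note that no checkpoint download happens at this point, since `DownloadedFiles.get_path()` is only called when the model itself is built:

from mediapipe.model_maker.python.vision.object_detector import model_spec as ms

spec = ms.SupportedModels.get(ms.SupportedModels.MOBILENET_V2)
assert spec.input_image_shape == [256, 256, 3]
assert spec.mean_norm == (0.5,) and spec.stddev_norm == (0.5,)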
|
@ -0,0 +1,147 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the 'License');
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an 'AS IS' BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from unittest import mock as unittest_mock
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from mediapipe.model_maker.python.vision.object_detector import dataset as ds
|
||||
from mediapipe.model_maker.python.vision.object_detector import model as model_lib
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_options as model_opt
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms
|
||||
from mediapipe.model_maker.python.vision.object_detector import preprocessor
|
||||
from mediapipe.tasks.python.test import test_utils as task_test_utils
|
||||
|
||||
|
||||
def _dicts_match(dict_1, dict_2):
|
||||
for key in dict_1:
|
||||
if key not in dict_2 or np.any(dict_1[key] != dict_2[key]):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _outputs_match(output1, output2):
|
||||
return _dicts_match(
|
||||
output1['cls_outputs'], output2['cls_outputs']
|
||||
) and _dicts_match(output1['box_outputs'], output2['box_outputs'])
|
||||
|
||||
|
||||
class ObjectDetectorModelTest(tf.test.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
dataset_folder = task_test_utils.get_test_data_path('coco_data')
|
||||
cache_dir = self.create_tempdir()
|
||||
self.data = ds.Dataset.from_coco_folder(dataset_folder, cache_dir=cache_dir)
|
||||
self.model_spec = ms.SupportedModels.MOBILENET_V2.value()
|
||||
self.preprocessor = preprocessor.Preprocessor(self.model_spec)
|
||||
self.fake_inputs = np.random.uniform(
|
||||
low=0, high=1, size=(1, 256, 256, 3)
|
||||
).astype(np.float32)
|
||||
# Mock tempfile.gettempdir() to be unique for each test to avoid race
|
||||
# condition when downloading model since these tests may run in parallel.
|
||||
mock_gettempdir = unittest_mock.patch.object(
|
||||
tempfile,
|
||||
'gettempdir',
|
||||
return_value=self.create_tempdir(),
|
||||
autospec=True,
|
||||
)
|
||||
self.mock_gettempdir = mock_gettempdir.start()
|
||||
self.addCleanup(mock_gettempdir.stop)
|
||||
|
||||
def _create_model(self):
|
||||
model_options = model_opt.ObjectDetectorModelOptions()
|
||||
model = model_lib.ObjectDetectorModel(
|
||||
self.model_spec, model_options, self.data.num_classes
|
||||
)
|
||||
return model
|
||||
|
||||
def _train_model(self, model):
|
||||
"""Helper to run a simple training run on the model."""
|
||||
dataset = self.data.gen_tf_dataset(
|
||||
batch_size=2,
|
||||
is_training=True,
|
||||
shuffle=False,
|
||||
preprocess=self.preprocessor,
|
||||
)
|
||||
optimizer = tf.keras.optimizers.experimental.SGD(
|
||||
learning_rate=0.03, momentum=0.9
|
||||
)
|
||||
model.compile(optimizer=optimizer)
|
||||
model.fit(
|
||||
x=dataset, epochs=2, steps_per_epoch=None, validation_data=dataset
|
||||
)
|
||||
|
||||
def test_model(self):
|
||||
model = self._create_model()
|
||||
outputs_before = model(self.fake_inputs, training=True)
|
||||
self._train_model(model)
|
||||
outputs_after = model(self.fake_inputs, training=True)
|
||||
self.assertFalse(_outputs_match(outputs_before, outputs_after))
|
||||
|
||||
def test_model_convert_to_qat(self):
|
||||
model_options = model_opt.ObjectDetectorModelOptions()
|
||||
model = model_lib.ObjectDetectorModel(
|
||||
self.model_spec, model_options, self.data.num_classes
|
||||
)
|
||||
outputs_before = model(self.fake_inputs, training=True)
|
||||
model.convert_to_qat()
|
||||
outputs_after = model(self.fake_inputs, training=True)
|
||||
self.assertFalse(_outputs_match(outputs_before, outputs_after))
|
||||
outputs_before = outputs_after
|
||||
self._train_model(model)
|
||||
outputs_after = model(self.fake_inputs, training=True)
|
||||
self.assertFalse(_outputs_match(outputs_before, outputs_after))
|
||||
|
||||
def test_model_save_and_load_checkpoint(self):
|
||||
model = self._create_model()
|
||||
checkpoint_path = os.path.join(self.create_tempdir(), 'ckpt')
|
||||
model.save_checkpoint(checkpoint_path)
|
||||
data_checkpoint_file = checkpoint_path + '.data-00000-of-00001'
|
||||
index_checkpoint_file = checkpoint_path + '.index'
|
||||
self.assertTrue(os.path.exists(data_checkpoint_file))
|
||||
self.assertTrue(os.path.exists(index_checkpoint_file))
|
||||
self.assertGreater(os.path.getsize(data_checkpoint_file), 0)
|
||||
self.assertGreater(os.path.getsize(index_checkpoint_file), 0)
|
||||
outputs_before = model(self.fake_inputs, training=True)
|
||||
|
||||
# Check model output is different after training
|
||||
self._train_model(model)
|
||||
outputs_after = model(self.fake_inputs, training=True)
|
||||
self.assertFalse(_outputs_match(outputs_before, outputs_after))
|
||||
|
||||
# Check model output is the same after loading previous checkpoint
|
||||
model.load_checkpoint(checkpoint_path, include_last_layer=True)
|
||||
outputs_after = model(self.fake_inputs, training=True)
|
||||
self.assertTrue(_outputs_match(outputs_before, outputs_after))
|
||||
|
||||
def test_export_saved_model(self):
|
||||
export_dir = self.create_tempdir()
|
||||
export_path = os.path.join(export_dir, 'saved_model')
|
||||
model = self._create_model()
|
||||
model.export_saved_model(export_path)
|
||||
self.assertTrue(os.path.exists(export_path))
|
||||
self.assertGreater(os.path.getsize(export_path), 0)
|
||||
model.convert_to_qat()
|
||||
export_path = os.path.join(export_dir, 'saved_model_qat')
|
||||
model.export_saved_model(export_path)
|
||||
self.assertTrue(os.path.exists(export_path))
|
||||
self.assertGreater(os.path.getsize(export_path), 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
|
@ -0,0 +1,353 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""APIs to train object detector model."""
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from mediapipe.model_maker.python.core.tasks import classifier
|
||||
from mediapipe.model_maker.python.core.utils import model_util
|
||||
from mediapipe.model_maker.python.core.utils import quantization
|
||||
from mediapipe.model_maker.python.vision.object_detector import dataset as ds
|
||||
from mediapipe.model_maker.python.vision.object_detector import hyperparameters as hp
|
||||
from mediapipe.model_maker.python.vision.object_detector import model as model_lib
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_options as model_opt
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms
|
||||
from mediapipe.model_maker.python.vision.object_detector import object_detector_options
|
||||
from mediapipe.model_maker.python.vision.object_detector import preprocessor
|
||||
from mediapipe.tasks.python.metadata.metadata_writers import metadata_writer
|
||||
from mediapipe.tasks.python.metadata.metadata_writers import object_detector as object_detector_writer
|
||||
from official.vision.evaluation import coco_evaluator
|
||||
|
||||
|
||||
class ObjectDetector(classifier.Classifier):
|
||||
"""ObjectDetector for building object detection model."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_spec: ms.ModelSpec,
|
||||
label_names: List[str],
|
||||
hparams: hp.HParams,
|
||||
model_options: model_opt.ObjectDetectorModelOptions,
|
||||
) -> None:
|
||||
"""Initializes ObjectDetector class.
|
||||
|
||||
Args:
|
||||
model_spec: Specifications for the model.
|
||||
label_names: A list of label names for the classes.
|
||||
hparams: The hyperparameters for training object detector.
|
||||
model_options: Options for creating the object detector model.
|
||||
"""
|
||||
super().__init__(
|
||||
model_spec=model_spec, label_names=label_names, shuffle=hparams.shuffle
|
||||
)
|
||||
self._preprocessor = preprocessor.Preprocessor(model_spec)
|
||||
self._hparams = hparams
|
||||
self._model_options = model_options
|
||||
self._optimizer = self._create_optimizer()
|
||||
self._is_qat = False
|
||||
|
||||
@classmethod
|
||||
def create(
|
||||
cls,
|
||||
train_data: ds.Dataset,
|
||||
validation_data: ds.Dataset,
|
||||
options: object_detector_options.ObjectDetectorOptions,
|
||||
) -> 'ObjectDetector':
|
||||
"""Creates and trains an ObjectDetector.
|
||||
|
||||
Loads data and trains the model based on data for object detection.
|
||||
|
||||
Args:
|
||||
train_data: Training data.
|
||||
validation_data: Validation data.
|
||||
options: Configurations for creating and training object detector.
|
||||
|
||||
Returns:
|
||||
An instance of ObjectDetector.
|
||||
"""
|
||||
if options.hparams is None:
|
||||
options.hparams = hp.HParams()
|
||||
|
||||
if options.model_options is None:
|
||||
options.model_options = model_opt.ObjectDetectorModelOptions()
|
||||
|
||||
spec = ms.SupportedModels.get(options.supported_model)
|
||||
object_detector = cls(
|
||||
model_spec=spec,
|
||||
label_names=train_data.label_names,
|
||||
hparams=options.hparams,
|
||||
model_options=options.model_options,
|
||||
)
|
||||
object_detector._create_and_train_model(train_data, validation_data)
|
||||
return object_detector
|
||||
|
||||
def _create_and_train_model(
|
||||
self, train_data: ds.Dataset, validation_data: ds.Dataset
|
||||
):
|
||||
"""Creates and trains the model.
|
||||
|
||||
Args:
|
||||
train_data: Training data.
|
||||
validation_data: Validation data.
|
||||
"""
|
||||
self._create_model()
|
||||
self._train_model(
|
||||
train_data, validation_data, preprocessor=self._preprocessor
|
||||
)
|
||||
self._save_float_ckpt()
|
||||
|
||||
def _create_model(self) -> None:
|
||||
"""Creates the object detector model."""
|
||||
self._model = model_lib.ObjectDetectorModel(
|
||||
model_spec=self._model_spec,
|
||||
model_options=self._model_options,
|
||||
num_classes=self._num_classes,
|
||||
)
|
||||
|
||||
def _save_float_ckpt(self) -> None:
|
||||
"""Saves a checkpoint of the trained float model.
|
||||
|
||||
The default save path is {hparams.export_dir}/float_ckpt. Note that
|
||||
`float_ckpt` represents a file prefix, not a directory. The resulting files
|
||||
saved to {hparams.export_dir} will be:
|
||||
- float_ckpt.data-00000-of-00001
|
||||
- float_ckpt.index
|
||||
"""
|
||||
save_path = os.path.join(self._hparams.export_dir, 'float_ckpt')
|
||||
if not os.path.exists(self._hparams.export_dir):
|
||||
os.makedirs(self._hparams.export_dir)
|
||||
self._model.save_checkpoint(save_path)
|
||||
|
||||
def restore_float_ckpt(self) -> None:
|
||||
"""Loads a float checkpoint of the model from {hparams.export_dir}/float_ckpt.
|
||||
|
||||
The float checkpoint at {hparams.export_dir}/float_ckpt is automatically
|
||||
saved after training an ObjectDetector using the `create` method. This
|
||||
method is used to restore the trained float checkpoint state of the model in
|
||||
order to run `quantization_aware_training` multiple times. Example usage:
|
||||
|
||||
# Train a model
|
||||
model = object_detector.create(...)
|
||||
# Run QAT
|
||||
model.quantization_aware_training(...)
|
||||
model.evaluate(...)
|
||||
# Restore the float checkpoint to run QAT again
|
||||
model.restore_float_ckpt()
|
||||
# Run QAT with different parameters
|
||||
model.quantization_aware_training(...)
|
||||
model.evaluate(...)
|
||||
"""
|
||||
self._create_model()
|
||||
self._model.load_checkpoint(
|
||||
os.path.join(self._hparams.export_dir, 'float_ckpt'),
|
||||
include_last_layer=True,
|
||||
)
|
||||
self._model.compile()
|
||||
self._is_qat = False
|
||||
|
||||
# TODO: Refactor this method to utilize shared training function
|
||||
def quantization_aware_training(
|
||||
self,
|
||||
train_data: ds.Dataset,
|
||||
validation_data: ds.Dataset,
|
||||
qat_hparams: hp.QATHParams,
|
||||
) -> None:
|
||||
"""Runs quantization aware training(QAT) on the model.
|
||||
|
||||
The QAT step happens after training a regular float model from the `create`
|
||||
method. This additional step will fine-tune the model with a lower precision
|
||||
in order to mimic the behavior of a quantized model. The resulting quantized
|
||||
model generally has better performance than a model which is quantized
|
||||
without running QAT. See the following link for more information:
|
||||
- https://www.tensorflow.org/model_optimization/guide/quantization/training
|
||||
|
||||
Just like training the float model using the `create` method, the QAT step
|
||||
also requires some manual tuning of hyperparameters. In order to run QAT
|
||||
more than once for purposes such as hyperparameter tuning, use the
|
||||
`restore_float_ckpt` method to restore the model state to the trained float
|
||||
checkpoint without having to rerun the `create` method.
|
||||
|
||||
Args:
|
||||
train_data: Training dataset.
|
||||
validation_data: Validation dataset.
|
||||
qat_hparams: Configuration for QAT.
|
||||
"""
|
||||
self._model.convert_to_qat()
|
||||
learning_rate_fn = tf.keras.optimizers.schedules.ExponentialDecay(
|
||||
qat_hparams.learning_rate * qat_hparams.batch_size / 256,
|
||||
decay_steps=qat_hparams.decay_steps,
|
||||
decay_rate=qat_hparams.decay_rate,
|
||||
staircase=True,
|
||||
)
|
||||
optimizer = tf.keras.optimizers.experimental.SGD(
|
||||
learning_rate=learning_rate_fn, momentum=0.9
|
||||
)
|
||||
if len(train_data) < qat_hparams.batch_size:
|
||||
raise ValueError(
|
||||
f"The size of the train_data {len(train_data)} can't be smaller than"
|
||||
f' batch_size {qat_hparams.batch_size}. To solve this problem, set'
|
||||
' the batch_size smaller or increase the size of the train_data.'
|
||||
)
|
||||
|
||||
train_dataset = train_data.gen_tf_dataset(
|
||||
batch_size=qat_hparams.batch_size,
|
||||
is_training=True,
|
||||
shuffle=self._shuffle,
|
||||
preprocess=self._preprocessor,
|
||||
)
|
||||
steps_per_epoch = model_util.get_steps_per_epoch(
|
||||
steps_per_epoch=None,
|
||||
batch_size=qat_hparams.batch_size,
|
||||
train_data=train_data,
|
||||
)
|
||||
train_dataset = train_dataset.take(count=steps_per_epoch)
|
||||
validation_dataset = validation_data.gen_tf_dataset(
|
||||
batch_size=qat_hparams.batch_size,
|
||||
is_training=False,
|
||||
preprocess=self._preprocessor,
|
||||
)
|
||||
self._model.compile(optimizer=optimizer)
|
||||
self._model.fit(
|
||||
x=train_dataset,
|
||||
epochs=qat_hparams.epochs,
|
||||
steps_per_epoch=None,
|
||||
validation_data=validation_dataset,
|
||||
)
|
||||
self._is_qat = True
|
||||
|
||||
def evaluate(
|
||||
self, dataset: ds.Dataset, batch_size: int = 1
|
||||
) -> Tuple[List[float], Dict[str, float]]:
|
||||
"""Overrides Classifier.evaluate to calculate COCO metrics."""
|
||||
dataset = dataset.gen_tf_dataset(
|
||||
batch_size, is_training=False, preprocess=self._preprocessor
|
||||
)
|
||||
losses = self._model.evaluate(dataset)
|
||||
coco_eval = coco_evaluator.COCOEvaluator(
|
||||
annotation_file=None,
|
||||
include_mask=False,
|
||||
per_category_metrics=True,
|
||||
max_num_eval_detections=100,
|
||||
)
|
||||
for batch in dataset:
|
||||
x, y = batch
|
||||
y_pred = self._model(
|
||||
x,
|
||||
anchor_boxes=y['anchor_boxes'],
|
||||
image_shape=y['image_info'][:, 1, :],
|
||||
training=False,
|
||||
)
|
||||
groundtruths = y['groundtruths']
|
||||
y_pred['image_info'] = groundtruths['image_info']
|
||||
y_pred['source_id'] = groundtruths['source_id']
|
||||
coco_eval.update_state(groundtruths, y_pred)
|
||||
coco_metrics = coco_eval.result()
|
||||
return losses, coco_metrics
|
||||
|
||||
def export_model(
|
||||
self,
|
||||
model_name: str = 'model.tflite',
|
||||
quantization_config: Optional[quantization.QuantizationConfig] = None,
|
||||
):
|
||||
"""Converts and saves the model to a TFLite file with metadata included.
|
||||
|
||||
The model export format is automatically set based on whether or not
|
||||
`quantization_aware_training` (QAT) was run. The model exports to float32 by
|
||||
default and will export to an int8 quantized model if QAT was run. To export
|
||||
a float32 model after running QAT, run `restore_float_ckpt` before this
|
||||
method. For custom post-training quantization without QAT, use the
|
||||
quantization_config parameter.
|
||||
|
||||
Note that only the TFLite file is needed for deployment. This function also
|
||||
saves a metadata.json file to the same directory as the TFLite file which
|
||||
can be used to interpret the metadata content in the TFLite file.
|
||||
|
||||
Args:
|
||||
model_name: File name to save TFLite model with metadata. The full export
|
||||
path is {self._hparams.export_dir}/{model_name}.
|
||||
quantization_config: The configuration for model quantization. Note that
|
||||
int8 quantization aware training is automatically applied when possible.
|
||||
This parameter is used to specify other post-training quantization
|
||||
options such as fp16 and int8 without QAT.
|
||||
|
||||
Raises:
|
||||
ValueError: If a custom quantization_config is specified when the model
|
||||
has quantization aware training enabled.
|
||||
"""
|
||||
if quantization_config:
|
||||
if self._is_qat:
|
||||
raise ValueError(
|
||||
'Exporting a qat model with a custom quantization_config is not '
|
||||
'supported.'
|
||||
)
|
||||
else:
|
||||
print(
|
||||
'Exporting with custom post-training-quantization: ',
|
||||
quantization_config,
|
||||
)
|
||||
else:
|
||||
if self._is_qat:
|
||||
print('Exporting a qat int8 model')
|
||||
quantization_config = quantization.QuantizationConfig(
|
||||
inference_input_type=tf.uint8, inference_output_type=tf.uint8
|
||||
)
|
||||
else:
|
||||
print('Exporting a floating point model')
|
||||
|
||||
tflite_file = os.path.join(self._hparams.export_dir, model_name)
|
||||
metadata_file = os.path.join(self._hparams.export_dir, 'metadata.json')
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
save_path = os.path.join(temp_dir, 'saved_model')
|
||||
self._model.export_saved_model(save_path)
|
||||
converter = tf.lite.TFLiteConverter.from_saved_model(save_path)
|
||||
if quantization_config:
|
||||
converter = quantization_config.set_converter_with_quantization(
|
||||
converter, preprocess=self._preprocessor
|
||||
)
|
||||
|
||||
converter.target_spec.supported_ops = (tf.lite.OpsSet.TFLITE_BUILTINS,)
|
||||
tflite_model = converter.convert()
|
||||
|
||||
writer = object_detector_writer.MetadataWriter.create(
|
||||
tflite_model,
|
||||
self._model_spec.mean_rgb,
|
||||
self._model_spec.stddev_rgb,
|
||||
labels=metadata_writer.Labels().add(list(self._label_names)),
|
||||
)
|
||||
tflite_model_with_metadata, metadata_json = writer.populate()
|
||||
model_util.save_tflite(tflite_model_with_metadata, tflite_file)
|
||||
with open(metadata_file, 'w') as f:
|
||||
f.write(metadata_json)
|
||||
|
||||
def _create_optimizer(self) -> tf.keras.optimizers.Optimizer:
|
||||
"""Creates an optimizer with learning rate schedule for regular training.
|
||||
|
||||
Uses Keras PiecewiseConstantDecay schedule by default.
|
||||
|
||||
Returns:
|
||||
A tf.keras.optimizer.Optimizer for model training.
|
||||
"""
|
||||
init_lr = self._hparams.learning_rate * self._hparams.batch_size / 256
|
||||
lr_values = [init_lr] + [
|
||||
init_lr * m for m in self._hparams.learning_rate_decay_multipliers
|
||||
]
|
||||
learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
|
||||
self._hparams.learning_rate_boundaries, lr_values
|
||||
)
|
||||
return tf.keras.optimizers.experimental.SGD(
|
||||
learning_rate=learning_rate_fn, momentum=0.9
|
||||
)
|
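A worked sketch of the schedule built by `_create_optimizer` above, using illustrative hyperparameter values rather than the actual defaults (which live in hyperparameters.py): with learning_rate=0.3, batch_size=8, boundaries [3000, 6000] and decay multipliers [0.3, 0.1], the schedule comes out as follows:

import tensorflow as tf

learning_rate, batch_size = 0.3, 8            # Illustrative values, not the defaults.
boundaries, multipliers = [3000, 6000], [0.3, 0.1]

init_lr = learning_rate * batch_size / 256    # Initial rate scales linearly with batch size.
lr_values = [init_lr] + [init_lr * m for m in multipliers]
schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, lr_values)
optimizer = tf.keras.optimizers.experimental.SGD(learning_rate=schedule, momentum=0.9)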
|
@ -0,0 +1,84 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Demo for making an object detector model by MediaPipe Model Maker."""
|
||||
|
||||
import os
|
||||
# Dependency imports
|
||||
|
||||
from absl import app
|
||||
from absl import flags
|
||||
from absl import logging
|
||||
|
||||
from mediapipe.model_maker.python.vision import object_detector
|
||||
|
||||
FLAGS = flags.FLAGS
|
||||
|
||||
TEST_DATA_DIR = 'mediapipe/model_maker/python/vision/object_detector/testdata/coco_data'
|
||||
|
||||
|
||||
def define_flags() -> None:
|
||||
"""Define flags for the object detection model maker demo."""
|
||||
flags.DEFINE_string(
|
||||
'export_dir', None, 'The directory to save exported files.'
|
||||
)
|
||||
flags.DEFINE_string(
|
||||
'input_data_dir',
|
||||
None,
|
||||
"""The directory with input training data. If the training data is not
|
||||
specified, the pipeline will use the test dataset.""",
|
||||
)
|
||||
flags.DEFINE_bool('qat', True, 'Whether or not to do QAT.')
|
||||
flags.mark_flag_as_required('export_dir')
|
||||
|
||||
|
||||
def run(data_dir: str, export_dir: str, qat: bool):
|
||||
"""Runs demo."""
|
||||
data = object_detector.Dataset.from_coco_folder(data_dir)
|
||||
train_data, rest_data = data.split(0.6)
|
||||
validation_data, test_data = rest_data.split(0.5)
|
||||
|
||||
hparams = object_detector.HParams(batch_size=1, export_dir=export_dir)
|
||||
options = object_detector.ObjectDetectorOptions(
|
||||
supported_model=object_detector.SupportedModels.MOBILENET_V2,
|
||||
hparams=hparams,
|
||||
)
|
||||
model = object_detector.ObjectDetector.create(
|
||||
train_data=train_data, validation_data=validation_data, options=options
|
||||
)
|
||||
loss, coco_metrics = model.evaluate(test_data, batch_size=1)
|
||||
print(f'Evaluation loss:{loss}, coco_metrics:{coco_metrics}')
|
||||
if qat:
|
||||
qat_hparams = object_detector.QATHParams(batch_size=1)
|
||||
model.quantization_aware_training(train_data, validation_data, qat_hparams)
|
||||
qat_loss, qat_coco_metrics = model.evaluate(test_data, batch_size=1)
|
||||
print(f'QAT Evaluation loss:{qat_loss}, coco_metrics:{qat_coco_metrics}')
|
||||
|
||||
model.export_model()
|
||||
|
||||
|
||||
def main(_) -> None:
|
||||
logging.set_verbosity(logging.INFO)
|
||||
|
||||
if FLAGS.input_data_dir is None:
|
||||
data_dir = os.path.join(FLAGS.test_srcdir, TEST_DATA_DIR)
|
||||
else:
|
||||
data_dir = FLAGS.input_data_dir
|
||||
|
||||
export_dir = os.path.expanduser(FLAGS.export_dir)
|
||||
run(data_dir=data_dir, export_dir=export_dir, qat=FLAGS.qat)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
define_flags()
|
||||
app.run(main)
|
|
@ -0,0 +1,36 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Options for building object detector."""
|
||||
|
||||
import dataclasses
|
||||
from typing import Optional
|
||||
|
||||
from mediapipe.model_maker.python.vision.object_detector import hyperparameters
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_options as model_opt
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ObjectDetectorOptions:
|
||||
"""Configurable options for building object detector.
|
||||
|
||||
Attributes:
|
||||
supported_model: A model from the SupportedModels enum.
|
||||
model_options: A set of options for configuring the selected model.
|
||||
hparams: A set of hyperparameters used to train the object detector.
|
||||
"""
|
||||
|
||||
supported_model: model_spec.SupportedModels
|
||||
model_options: Optional[model_opt.ObjectDetectorModelOptions] = None
|
||||
hparams: Optional[hyperparameters.HParams] = None
|
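A hedged sketch of the options object in use; when `hparams` or `model_options` are left as None, `ObjectDetector.create` (earlier in this change) substitutes the `HParams()` and `ObjectDetectorModelOptions()` defaults:

from mediapipe.model_maker.python.vision.object_detector import model_spec
from mediapipe.model_maker.python.vision.object_detector import object_detector_options

options = object_detector_options.ObjectDetectorOptions(
    supported_model=model_spec.SupportedModels.MOBILENET_V2
)
# options.hparams and options.model_options are None here; create() fills in the defaults.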
|
@ -0,0 +1,121 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the 'License');
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an 'AS IS' BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from unittest import mock as unittest_mock
|
||||
|
||||
from absl.testing import parameterized
|
||||
import tensorflow as tf
|
||||
|
||||
from mediapipe.model_maker.python.vision.object_detector import dataset
|
||||
from mediapipe.model_maker.python.vision.object_detector import hyperparameters
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms
|
||||
from mediapipe.model_maker.python.vision.object_detector import object_detector
|
||||
from mediapipe.model_maker.python.vision.object_detector import object_detector_options
|
||||
from mediapipe.tasks.python.test import test_utils as task_test_utils
|
||||
|
||||
|
||||
class ObjectDetectorTest(tf.test.TestCase, parameterized.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
dataset_folder = task_test_utils.get_test_data_path('coco_data')
|
||||
cache_dir = self.create_tempdir()
|
||||
self.data = dataset.Dataset.from_coco_folder(
|
||||
dataset_folder, cache_dir=cache_dir
|
||||
)
|
||||
# Mock tempfile.gettempdir() to be unique for each test to avoid race
|
||||
# condition when downloading model since these tests may run in parallel.
|
||||
mock_gettempdir = unittest_mock.patch.object(
|
||||
tempfile,
|
||||
'gettempdir',
|
||||
return_value=self.create_tempdir(),
|
||||
autospec=True,
|
||||
)
|
||||
self.mock_gettempdir = mock_gettempdir.start()
|
||||
self.addCleanup(mock_gettempdir.stop)
|
||||
|
||||
def test_object_detector(self):
|
||||
hparams = hyperparameters.HParams(
|
||||
epochs=10,
|
||||
batch_size=2,
|
||||
learning_rate=0.9,
|
||||
shuffle=False,
|
||||
export_dir=self.create_tempdir(),
|
||||
)
|
||||
options = object_detector_options.ObjectDetectorOptions(
|
||||
supported_model=ms.SupportedModels.MOBILENET_V2, hparams=hparams
|
||||
)
|
||||
# Test `create`
|
||||
model = object_detector.ObjectDetector.create(
|
||||
train_data=self.data, validation_data=self.data, options=options
|
||||
)
|
||||
losses, coco_metrics = model.evaluate(self.data)
|
||||
self._assert_ap_greater(coco_metrics)
|
||||
self.assertFalse(model._is_qat)
|
||||
# Test float export_model
|
||||
model.export_model()
|
||||
output_metadata_file = os.path.join(
|
||||
options.hparams.export_dir, 'metadata.json'
|
||||
)
|
||||
output_tflite_file = os.path.join(
|
||||
options.hparams.export_dir, 'model.tflite'
|
||||
)
|
||||
|
||||
self.assertTrue(os.path.exists(output_tflite_file))
|
||||
self.assertGreater(os.path.getsize(output_tflite_file), 0)
|
||||
self.assertTrue(os.path.exists(output_metadata_file))
|
||||
self.assertGreater(os.path.getsize(output_metadata_file), 0)
|
||||
|
||||
# Test `quantization_aware_training`
|
||||
qat_hparams = hyperparameters.QATHParams(
|
||||
learning_rate=0.9,
|
||||
batch_size=2,
|
||||
epochs=5,
|
||||
decay_steps=6,
|
||||
decay_rate=0.96,
|
||||
)
|
||||
model.quantization_aware_training(self.data, self.data, qat_hparams)
|
||||
qat_losses, qat_coco_metrics = model.evaluate(self.data)
|
||||
self._assert_ap_greater(qat_coco_metrics)
|
||||
self.assertNotAllEqual(losses, qat_losses)
|
||||
self.assertTrue(model._is_qat)
|
||||
model.export_model('model_qat.tflite')
|
||||
output_metadata_file = os.path.join(
|
||||
options.hparams.export_dir, 'metadata.json'
|
||||
)
|
||||
output_tflite_file = os.path.join(
|
||||
options.hparams.export_dir, 'model_qat.tflite'
|
||||
)
|
||||
|
||||
self.assertTrue(os.path.exists(output_tflite_file))
|
||||
self.assertGreater(os.path.getsize(output_tflite_file), 0)
|
||||
self.assertLess(os.path.getsize(output_tflite_file), 3500000)
|
||||
self.assertTrue(os.path.exists(output_metadata_file))
|
||||
self.assertGreater(os.path.getsize(output_metadata_file), 0)
|
||||
|
||||
# Load float ckpt test
|
||||
model.restore_float_ckpt()
|
||||
losses_2, _ = model.evaluate(self.data)
|
||||
self.assertAllEqual(losses, losses_2)
|
||||
self.assertNotAllEqual(qat_losses, losses_2)
|
||||
self.assertFalse(model._is_qat)
|
||||
|
||||
def _assert_ap_greater(self, coco_metrics, threshold=0.0):
|
||||
self.assertGreaterEqual(coco_metrics['AP'], threshold)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
|
@ -0,0 +1,163 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Preprocessor for object detector."""
|
||||
from typing import Any, Mapping, Tuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms
|
||||
from official.vision.dataloaders import utils
|
||||
from official.vision.ops import anchor
|
||||
from official.vision.ops import box_ops
|
||||
from official.vision.ops import preprocess_ops
|
||||
|
||||
|
||||
# TODO Combine preprocessing logic with image_preprocessor.
|
||||
class Preprocessor(object):
|
||||
"""Preprocessor for object detector."""
|
||||
|
||||
def __init__(self, model_spec: ms.ModelSpec):
|
||||
"""Initialize a Preprocessor."""
|
||||
self._mean_norm = model_spec.mean_norm
|
||||
self._stddev_norm = model_spec.stddev_norm
|
||||
self._output_size = model_spec.input_image_shape[:2]
|
||||
self._min_level = 3
|
||||
self._max_level = 7
|
||||
self._num_scales = 3
|
||||
self._aspect_ratios = [0.5, 1, 2]
|
||||
self._anchor_size = 3
|
||||
self._dtype = tf.float32
|
||||
self._match_threshold = 0.5
|
||||
self._unmatched_threshold = 0.5
|
||||
self._aug_scale_min = 0.5
|
||||
self._aug_scale_max = 2.0
|
||||
self._max_num_instances = 100
|
||||
|
||||
def __call__(
|
||||
self, data: Mapping[str, Any], is_training: bool = True
|
||||
) -> Tuple[tf.Tensor, Mapping[str, Any]]:
|
||||
"""Run the preprocessor on an example.
|
||||
|
||||
The data dict should always contain the following keys:
|
||||
- image
|
||||
- groundtruth_classes
|
||||
- groundtruth_boxes
|
||||
- groundtruth_is_crowd
|
||||
Additional keys needed when is_training is set to False:
|
||||
- groundtruth_area
|
||||
- source_id
|
||||
- height
|
||||
- width
|
||||
|
||||
Args:
|
||||
data: A dict of object detector inputs.
|
||||
is_training: Whether or not the data is used for training.
|
||||
|
||||
Returns:
|
||||
A tuple of (image, labels) where image is a Tensor and labels is a dict.
|
||||
"""
|
||||
classes = data['groundtruth_classes']
|
||||
boxes = data['groundtruth_boxes']
|
||||
|
||||
# Get original image.
|
||||
image = data['image']
|
||||
image_shape = tf.shape(input=image)[0:2]
|
||||
|
||||
# Normalize image with mean and std pixel values.
|
||||
image = preprocess_ops.normalize_image(
|
||||
image, self._mean_norm, self._stddev_norm
|
||||
)
|
||||
|
||||
# Flip image randomly during training.
|
||||
if is_training:
|
||||
image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)
|
||||
|
||||
# Convert boxes from normalized coordinates to pixel coordinates.
|
||||
boxes = box_ops.denormalize_boxes(boxes, image_shape)
|
||||
|
||||
# Resize and crop image.
|
||||
image, image_info = preprocess_ops.resize_and_crop_image(
|
||||
image,
|
||||
self._output_size,
|
||||
padded_size=preprocess_ops.compute_padded_size(
|
||||
self._output_size, 2**self._max_level
|
||||
),
|
||||
aug_scale_min=(self._aug_scale_min if is_training else 1.0),
|
||||
aug_scale_max=(self._aug_scale_max if is_training else 1.0),
|
||||
)
|
||||
image_height, image_width, _ = image.get_shape().as_list()
|
||||
|
||||
# Resize and crop boxes.
|
||||
image_scale = image_info[2, :]
|
||||
offset = image_info[3, :]
|
||||
boxes = preprocess_ops.resize_and_crop_boxes(
|
||||
boxes, image_scale, image_info[1, :], offset
|
||||
)
|
||||
# Filter out ground-truth boxes that are all zeros.
|
||||
indices = box_ops.get_non_empty_box_indices(boxes)
|
||||
boxes = tf.gather(boxes, indices)
|
||||
classes = tf.gather(classes, indices)
|
||||
|
||||
# Assign anchors.
|
||||
input_anchor = anchor.build_anchor_generator(
|
||||
min_level=self._min_level,
|
||||
max_level=self._max_level,
|
||||
num_scales=self._num_scales,
|
||||
aspect_ratios=self._aspect_ratios,
|
||||
anchor_size=self._anchor_size,
|
||||
)
|
||||
anchor_boxes = input_anchor(image_size=(image_height, image_width))
|
||||
anchor_labeler = anchor.AnchorLabeler(
|
||||
self._match_threshold, self._unmatched_threshold
|
||||
)
|
||||
(cls_targets, box_targets, _, cls_weights, box_weights) = (
|
||||
anchor_labeler.label_anchors(
|
||||
anchor_boxes, boxes, tf.expand_dims(classes, axis=1)
|
||||
)
|
||||
)
|
||||
|
||||
# Cast input image to desired data type.
|
||||
image = tf.cast(image, dtype=self._dtype)
|
||||
|
||||
# Pack labels for model_fn outputs.
|
||||
labels = {
|
||||
'cls_targets': cls_targets,
|
||||
'box_targets': box_targets,
|
||||
'anchor_boxes': anchor_boxes,
|
||||
'cls_weights': cls_weights,
|
||||
'box_weights': box_weights,
|
||||
'image_info': image_info,
|
||||
}
|
||||
if not is_training:
|
||||
groundtruths = {
|
||||
'source_id': data['source_id'],
|
||||
'height': data['height'],
|
||||
'width': data['width'],
|
||||
'num_detections': tf.shape(data['groundtruth_classes']),
|
||||
'image_info': image_info,
|
||||
'boxes': box_ops.denormalize_boxes(
|
||||
data['groundtruth_boxes'], image_shape
|
||||
),
|
||||
'classes': data['groundtruth_classes'],
|
||||
'areas': data['groundtruth_area'],
|
||||
'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
|
||||
}
|
||||
groundtruths['source_id'] = utils.process_source_id(
|
||||
groundtruths['source_id']
|
||||
)
|
||||
groundtruths = utils.pad_groundtruths_to_fixed_size(
|
||||
groundtruths, self._max_num_instances
|
||||
)
|
||||
labels.update({'groundtruths': groundtruths})
|
||||
return image, labels
|
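A short sketch of plugging this preprocessor into the dataset pipeline, assuming `/path/to/coco_folder` is a hypothetical folder in the layout expected by `Dataset.from_coco_folder`; the gen_tf_dataset arguments mirror those used in the tests:

from mediapipe.model_maker.python.vision.object_detector import dataset as ds
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms
from mediapipe.model_maker.python.vision.object_detector import preprocessor

spec = ms.SupportedModels.MOBILENET_V2.value()
data = ds.Dataset.from_coco_folder('/path/to/coco_folder')  # Hypothetical path.
train_ds = data.gen_tf_dataset(
    batch_size=2, is_training=True, shuffle=False, preprocess=preprocessor.Preprocessor(spec)
)
for images, labels in train_ds.take(1):
    print(images.shape, sorted(labels.keys()))  # Expect (2, 256, 256, 3) plus the label keys listed above.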
|
@ -0,0 +1,158 @@
|
|||
# Copyright 2023 The MediaPipe Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
|
||||
from absl.testing import parameterized
|
||||
import tensorflow as tf
|
||||
|
||||
from mediapipe.model_maker.python.vision.core import test_utils
|
||||
from mediapipe.model_maker.python.vision.object_detector import model_spec as ms
|
||||
from mediapipe.model_maker.python.vision.object_detector import preprocessor as preprocessor_lib
|
||||
|
||||
|
||||
class DatasetTest(tf.test.TestCase, parameterized.TestCase):
|
||||
MAX_IMAGE_SIZE = 360
|
||||
OUTPUT_SIZE = 256
|
||||
NUM_CLASSES = 10
|
||||
NUM_EXAMPLES = 3
|
||||
MIN_LEVEL = 3
|
||||
MAX_LEVEL = 7
|
||||
NUM_SCALES = 3
|
||||
ASPECT_RATIOS = [0.5, 1, 2]
|
||||
MAX_NUM_INSTANCES = 100
|
||||
|
||||
def _get_rand_example(self):
|
||||
num_annotations = random.randint(1, 3)
|
||||
bboxes, classes, is_crowds = [], [], []
|
||||
image_size = random.randint(10, self.MAX_IMAGE_SIZE + 1)
|
||||
rgb = [random.uniform(0, 255) for _ in range(3)]
|
||||
image = test_utils.fill_image(rgb, image_size)
|
||||
for _ in range(num_annotations):
|
||||
x1, x2 = random.uniform(0, image_size), random.uniform(0, image_size)
|
||||
y1, y2 = random.uniform(0, image_size), random.uniform(0, image_size)
|
||||
bbox = [min(x1, x2), min(y1, y2), abs(x1 - x2), abs(y1 - y2)]
|
||||
bboxes.append(bbox)
|
||||
classes.append(random.randint(0, self.NUM_CLASSES - 1))
|
||||
is_crowds.append(0)
|
||||
return {
|
||||
'image': tf.cast(image, dtype=tf.float32),
|
||||
'groundtruth_boxes': tf.cast(bboxes, dtype=tf.float32),
|
||||
'groundtruth_classes': tf.cast(classes, dtype=tf.int64),
|
||||
'groundtruth_is_crowd': tf.cast(is_crowds, dtype=tf.bool),
|
||||
'groundtruth_area': tf.cast(is_crowds, dtype=tf.float32),
|
||||
'source_id': tf.cast(1, dtype=tf.int64),
|
||||
'height': tf.cast(image_size, dtype=tf.int64),
|
||||
'width': tf.cast(image_size, dtype=tf.int64),
|
||||
}
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
dataset = [self._get_rand_example() for _ in range(self.NUM_EXAMPLES)]
|
||||
|
||||
def my_generator(data):
|
||||
for item in data:
|
||||
yield item
|
||||
|
||||
self.dataset = tf.data.Dataset.from_generator(
|
||||
lambda: my_generator(dataset),
|
||||
output_types={
|
||||
'image': tf.float32,
|
||||
'groundtruth_classes': tf.int64,
|
||||
'groundtruth_boxes': tf.float32,
|
||||
'groundtruth_is_crowd': tf.bool,
|
||||
'groundtruth_area': tf.float32,
|
||||
'source_id': tf.int64,
|
||||
'height': tf.int64,
|
||||
'width': tf.int64,
|
||||
},
|
||||
)
|
||||
|
||||
@parameterized.named_parameters(
|
||||
dict(
|
||||
testcase_name='training',
|
||||
is_training=True,
|
||||
),
|
||||
dict(
|
||||
testcase_name='evaluation',
|
||||
is_training=False,
|
||||
),
|
||||
)
|
||||
def test_preprocessor(self, is_training):
|
||||
model_spec = ms.SupportedModels.MOBILENET_V2.value()
|
||||
labels_keys = [
|
||||
'cls_targets',
|
||||
'box_targets',
|
||||
'anchor_boxes',
|
||||
'cls_weights',
|
||||
'box_weights',
|
||||
'image_info',
|
||||
]
|
||||
if not is_training:
|
||||
labels_keys.append('groundtruths')
|
||||
preprocessor = preprocessor_lib.Preprocessor(model_spec)
|
||||
for example in self.dataset:
|
||||
result = preprocessor(example, is_training=is_training)
|
||||
image, labels = result
|
||||
self.assertAllEqual(image.shape, (256, 256, 3))
|
||||
self.assertCountEqual(labels_keys, labels.keys())
|
||||
np_labels = tf.nest.map_structure(lambda x: x.numpy(), labels)
|
||||
# Checks shapes of `image_info` and `anchor_boxes`.
|
||||
self.assertEqual(np_labels['image_info'].shape, (4, 2))
|
||||
n_anchors = 0
|
||||
for level in range(self.MIN_LEVEL, self.MAX_LEVEL + 1):
|
||||
stride = 2**level
|
||||
output_size_l = [self.OUTPUT_SIZE / stride, self.OUTPUT_SIZE / stride]
|
||||
anchors_per_location = self.NUM_SCALES * len(self.ASPECT_RATIOS)
|
||||
self.assertEqual(
|
||||
list(np_labels['anchor_boxes'][str(level)].shape),
|
||||
[output_size_l[0], output_size_l[1], 4 * anchors_per_location],
|
||||
)
|
||||
n_anchors += output_size_l[0] * output_size_l[1] * anchors_per_location
|
||||
# Checks shapes of training objectives.
|
||||
self.assertEqual(np_labels['cls_weights'].shape, (int(n_anchors),))
|
||||
for level in range(self.MIN_LEVEL, self.MAX_LEVEL + 1):
|
||||
stride = 2**level
|
||||
output_size_l = [self.OUTPUT_SIZE / stride, self.OUTPUT_SIZE / stride]
|
||||
anchors_per_location = self.NUM_SCALES * len(self.ASPECT_RATIOS)
|
||||
self.assertEqual(
|
||||
list(np_labels['cls_targets'][str(level)].shape),
|
||||
[output_size_l[0], output_size_l[1], anchors_per_location],
|
||||
)
|
||||
self.assertEqual(
|
||||
list(np_labels['box_targets'][str(level)].shape),
|
||||
[output_size_l[0], output_size_l[1], 4 * anchors_per_location],
|
||||
)
|
||||
# Checks shape of groundtruths for eval.
|
||||
if not is_training:
|
||||
self.assertEqual(np_labels['groundtruths']['source_id'].shape, ())
|
||||
self.assertEqual(
|
||||
np_labels['groundtruths']['classes'].shape,
|
||||
(self.MAX_NUM_INSTANCES,),
|
||||
)
|
||||
self.assertEqual(
|
||||
np_labels['groundtruths']['boxes'].shape,
|
||||
(self.MAX_NUM_INSTANCES, 4),
|
||||
)
|
||||
self.assertEqual(
|
||||
np_labels['groundtruths']['areas'].shape, (self.MAX_NUM_INSTANCES,)
|
||||
)
|
||||
self.assertEqual(
|
||||
np_labels['groundtruths']['is_crowds'].shape,
|
||||
(self.MAX_NUM_INSTANCES,),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
BIN  mediapipe/model_maker/python/vision/object_detector/testdata/coco_data/images/000000000072.jpg  (new binary file, 81 KiB)
BIN  mediapipe/model_maker/python/vision/object_detector/testdata/coco_data/images/000000000078.jpg  (new binary file, 72 KiB)
BIN  mediapipe/model_maker/python/vision/object_detector/testdata/coco_data/images/000000000315.jpg  (new binary file, 53 KiB)
BIN  mediapipe/model_maker/python/vision/object_detector/testdata/coco_data/images/000000000431.jpg  (new binary file, 42 KiB)
BIN  mediapipe/model_maker/python/vision/object_detector/testdata/coco_data/images/000000000446.jpg  (new binary file, 36 KiB)
1  mediapipe/model_maker/python/vision/object_detector/testdata/coco_data/labels.json  (new file)
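The coco_data testdata folder added above (an images/ directory plus labels.json) follows the COCO-style layout that the object detector's Dataset loader consumes. A minimal usage sketch, assuming the released mediapipe_model_maker package and its documented from_coco_folder constructor; the paths and cache directory are placeholders, not part of this change.

# Minimal sketch: loading a COCO-style folder (images/ + labels.json).
# Assumes the published mediapipe_model_maker API; paths are placeholders.
from mediapipe_model_maker import object_detector

train_data = object_detector.Dataset.from_coco_folder(
    'path/to/coco_data',             # folder containing images/ and labels.json
    cache_dir='/tmp/od_data/train',  # caches preprocessed records between runs
)
print('train_data size:', train_data.size)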
@ -0,0 +1,34 @@
<?xml version="1.0" encoding="utf-8"?>
<annotation>
  <folder>images</folder>
  <filename>37ca2a3d-IMG_0520.jpg</filename>
  <source>
    <database>MyDatabase</database>
    <annotation>COCO2017</annotation>
    <image>flickr</image>
    <flickrid>NULL</flickrid>
    <annotator>1</annotator>
  </source>
  <owner>
    <flickrid>NULL</flickrid>
    <name>Label Studio</name>
  </owner>
  <size>
    <width>800</width>
    <height>600</height>
    <depth>3</depth>
  </size>
  <segmented>0</segmented>
  <object>
    <name>pig_android</name>
    <pose>Unspecified</pose>
    <truncated>0</truncated>
    <difficult>0</difficult>
    <bndbox>
      <xmin>242</xmin>
      <ymin>17</ymin>
      <xmax>556</xmax>
      <ymax>476</ymax>
    </bndbox>
  </object>
</annotation>
@ -0,0 +1,34 @@
<?xml version="1.0" encoding="utf-8"?>
<annotation>
  <folder>images</folder>
  <filename>3d3382d3-IMG_0514.jpg</filename>
  <source>
    <database>MyDatabase</database>
    <annotation>COCO2017</annotation>
    <image>flickr</image>
    <flickrid>NULL</flickrid>
    <annotator>1</annotator>
  </source>
  <owner>
    <flickrid>NULL</flickrid>
    <name>Label Studio</name>
  </owner>
  <size>
    <width>800</width>
    <height>600</height>
    <depth>3</depth>
  </size>
  <segmented>0</segmented>
  <object>
    <name>android</name>
    <pose>Unspecified</pose>
    <truncated>0</truncated>
    <difficult>0</difficult>
    <bndbox>
      <xmin>306</xmin>
      <ymin>130</ymin>
      <xmax>550</xmax>
      <ymax>471</ymax>
    </bndbox>
  </object>
</annotation>
@ -0,0 +1,46 @@
<?xml version="1.0" encoding="utf-8"?>
<annotation>
  <folder>images</folder>
  <filename>d1c65813-IMG_0546.jpg</filename>
  <source>
    <database>MyDatabase</database>
    <annotation>COCO2017</annotation>
    <image>flickr</image>
    <flickrid>NULL</flickrid>
    <annotator>1</annotator>
  </source>
  <owner>
    <flickrid>NULL</flickrid>
    <name>Label Studio</name>
  </owner>
  <size>
    <width>800</width>
    <height>600</height>
    <depth>3</depth>
  </size>
  <segmented>0</segmented>
  <object>
    <name>android</name>
    <pose>Unspecified</pose>
    <truncated>0</truncated>
    <difficult>0</difficult>
    <bndbox>
      <xmin>93</xmin>
      <ymin>101</ymin>
      <xmax>358</xmax>
      <ymax>378</ymax>
    </bndbox>
  </object>
  <object>
    <name>pig_android</name>
    <pose>Unspecified</pose>
    <truncated>0</truncated>
    <difficult>0</difficult>
    <bndbox>
      <xmin>438</xmin>
      <ymin>28</ymin>
      <xmax>654</xmax>
      <ymax>296</ymax>
    </bndbox>
  </object>
</annotation>
@ -0,0 +1,46 @@
<?xml version="1.0" encoding="utf-8"?>
<annotation>
  <folder>images</folder>
  <filename>d86b20e0-IMG_0509.jpg</filename>
  <source>
    <database>MyDatabase</database>
    <annotation>COCO2017</annotation>
    <image>flickr</image>
    <flickrid>NULL</flickrid>
    <annotator>1</annotator>
  </source>
  <owner>
    <flickrid>NULL</flickrid>
    <name>Label Studio</name>
  </owner>
  <size>
    <width>800</width>
    <height>600</height>
    <depth>3</depth>
  </size>
  <segmented>0</segmented>
  <object>
    <name>android</name>
    <pose>Unspecified</pose>
    <truncated>0</truncated>
    <difficult>0</difficult>
    <bndbox>
      <xmin>7</xmin>
      <ymin>122</ymin>
      <xmax>296</xmax>
      <ymax>402</ymax>
    </bndbox>
  </object>
  <object>
    <name>pig_android</name>
    <pose>Unspecified</pose>
    <truncated>0</truncated>
    <difficult>0</difficult>
    <bndbox>
      <xmin>523</xmin>
      <ymin>69</ymin>
      <xmax>723</xmax>
      <ymax>329</ymax>
    </bndbox>
  </object>
</annotation>
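The hunks above are PASCAL VOC XML annotations: one <object> element per box, with pixel-space corner coordinates measured against the <size> element. A minimal, standard-library sketch of turning one such file into the raw-example fields the preprocessor test feeds through its generator; the helper name, label ids, and the normalized [ymin, xmin, ymax, xmax] box ordering are assumptions here, not part of this diff.

# Illustrative VOC parser (not part of this change); maps one annotation file
# onto the 'groundtruth_*' fields used by the preprocessor test's raw examples.
import xml.etree.ElementTree as ET

def parse_voc_annotation(xml_path, label_to_id):
  root = ET.parse(xml_path).getroot()
  width = float(root.find('size/width').text)
  height = float(root.find('size/height').text)
  classes, boxes = [], []
  for obj in root.findall('object'):
    classes.append(label_to_id[obj.find('name').text])
    box = obj.find('bndbox')
    ymin = float(box.find('ymin').text) / height
    xmin = float(box.find('xmin').text) / width
    ymax = float(box.find('ymax').text) / height
    xmax = float(box.find('xmax').text) / width
    boxes.append([ymin, xmin, ymax, xmax])  # normalized; ordering is an assumption
  return {
      'groundtruth_classes': classes,
      'groundtruth_boxes': boxes,
      'height': int(height),
      'width': int(width),
  }

# e.g. parse_voc_annotation('d86b20e0-IMG_0509.xml', {'android': 1, 'pig_android': 2})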
BIN  4 additional testdata images  (new binary files: 54 KiB, 73 KiB, 110 KiB, 120 KiB)