Update python documentation.

PiperOrigin-RevId: 488482142
This commit is contained in:
Jiuqiang Tang 2022-11-14 15:16:36 -08:00 committed by Copybara-Service
parent c027373688
commit e714e656fe
8 changed files with 213 additions and 8 deletions

View File

@ -86,7 +86,30 @@ class AudioClassifierOptions:
class AudioClassifier(base_audio_task_api.BaseAudioTaskApi):
"""Class that performs audio classification on audio data."""
"""Class that performs audio classification on audio data.
This API expects a TFLite model with mandatory TFLite Model Metadata that
contains the mandatory AudioProperties of the solo input audio tensor and the
optional (but recommended) category labels as AssociatedFiles with type
TENSOR_AXIS_LABELS per output classification tensor.
Input tensor:
(kTfLiteFloat32)
- input audio buffer of size `[batch * samples]`.
- batch inference is not supported (`batch` is required to be 1).
- for multi-channel models, the channels must be interleaved.
At least one output tensor with:
(kTfLiteFloat32)
- `[1 x N]` array with `N` represents the number of categories.
- optional (but recommended) category labels as AssociatedFiles with type
TENSOR_AXIS_LABELS, containing one label per line. The first such
AssociatedFile (if any) is used to fill the `category_name` field of the
results. The `display_name` field is filled from the AssociatedFile (if
any) whose locale matches the `display_names_locale` field of the
`AudioClassifierOptions` used at creation time ("en" by default, i.e.
English). If none of these are available, only the `index` field of the
results will be filled.
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'AudioClassifier':

View File

@ -87,7 +87,24 @@ class AudioEmbedderOptions:
class AudioEmbedder(base_audio_task_api.BaseAudioTaskApi):
"""Class that performs embedding extraction on audio clips or audio stream."""
"""Class that performs embedding extraction on audio clips or audio stream.
This API expects a TFLite model with mandatory TFLite Model Metadata that
contains the mandatory AudioProperties of the solo input audio tensor and the
optional (but recommended) label items as AssociatedFiles with type
TENSOR_AXIS_LABELS per output embedding tensor.
Input tensor:
(kTfLiteFloat32)
- input audio buffer of size `[batch * samples]`.
- batch inference is not supported (`batch` is required to be 1).
- for multi-channel models, the channels must be interleaved.
At least one output tensor with:
(kTfLiteUInt8/kTfLiteFloat32)
- `N` components corresponding to the `N` dimensions of the returned
feature vector for this output layer.
- Either 2 or 4 dimensions, i.e. `[1 x N]` or `[1 x 1 x 1 x N]`.
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'AudioEmbedder':

View File

@ -62,7 +62,38 @@ class TextClassifierOptions:
class TextClassifier(base_text_task_api.BaseTextTaskApi):
"""Class that performs classification on text."""
"""Class that performs classification on text.
This API expects a TFLite model with (optional) TFLite Model Metadata that
contains the mandatory (described below) input tensors, output tensor,
and the optional (but recommended) category labels as AssociatedFiles with
type
TENSOR_AXIS_LABELS per output classification tensor. Metadata is required for
models with int32 input tensors because it contains the input process unit
for the model's Tokenizer. No metadata is required for models with string
input tensors.
Input tensors:
(kTfLiteInt32)
- 3 input tensors of size `[batch_size x bert_max_seq_len]` representing
the input ids, segment ids, and mask ids
- or 1 input tensor of size `[batch_size x max_seq_len]` representing the
input ids
or (kTfLiteString)
- 1 input tensor that is shapeless or has shape [1] containing the input
string
At least one output tensor with:
(kTfLiteFloat32/kBool)
- `[1 x N]` array with `N` represents the number of categories.
- optional (but recommended) category labels as AssociatedFiles with type
TENSOR_AXIS_LABELS, containing one label per line. The first such
AssociatedFile (if any) is used to fill the `category_name` field of the
results. The `display_name` field is filled from the AssociatedFile (if
any) whose locale matches the `display_names_locale` field of the
`TextClassifierOptions` used at creation time ("en" by default, i.e.
English). If none of these are available, only the `index` field of the
results will be filled.
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'TextClassifier':

View File

@ -63,7 +63,27 @@ class TextEmbedderOptions:
class TextEmbedder(base_text_task_api.BaseTextTaskApi):
"""Class that performs embedding extraction on text."""
"""Class that performs embedding extraction on text.
This API expects a TFLite model with TFLite Model Metadata that contains the
mandatory (described below) input tensors and output tensors. Metadata should
contain the input process unit for the model's Tokenizer as well as input /
output tensor metadata.
Input tensors:
(kTfLiteInt32)
- 3 input tensors of size `[batch_size x bert_max_seq_len]` with names
"ids", "mask", and "segment_ids" representing the input ids, mask ids, and
segment ids respectively.
- or 1 input tensor of size `[batch_size x max_seq_len]` representing the
input ids.
At least one output tensor with:
(kTfLiteFloat32)
- `N` components corresponding to the `N` dimensions of the returned
feature vector for this output layer.
- Either 2 or 4 dimensions, i.e. `[1 x N]` or `[1 x 1 x 1 x N]`.
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'TextEmbedder':

View File

@ -87,7 +87,40 @@ class ImageClassifierOptions:
class ImageClassifier(base_vision_task_api.BaseVisionTaskApi):
"""Class that performs image classification on images."""
"""Class that performs image classification on images.
The API expects a TFLite model with optional, but strongly recommended,
TFLite Model Metadata.
Input tensor:
(kTfLiteUInt8/kTfLiteFloat32)
- image input of size `[batch x height x width x channels]`.
- batch inference is not supported (`batch` is required to be 1).
- only RGB inputs are supported (`channels` is required to be 3).
- if type is kTfLiteFloat32, NormalizationOptions are required to be
attached to the metadata for input normalization.
At least one output tensor with:
(kTfLiteUInt8/kTfLiteFloat32)
- `N` classes and either 2 or 4 dimensions, i.e. `[1 x N]` or
`[1 x 1 x 1 x N]`
- optional (but recommended) label map(s) as AssociatedFiles with type
TENSOR_AXIS_LABELS, containing one label per line. The first such
AssociatedFile (if any) is used to fill the `class_name` field of the
results. The `display_name` field is filled from the AssociatedFile (if
any) whose locale matches the `display_names_locale` field of the
`ImageClassifierOptions` used at creation time ("en" by default, i.e.
English). If none of these are available, only the `index` field of the
results will be filled.
- optional score calibration can be attached using ScoreCalibrationOptions
and an AssociatedFile with type TENSOR_AXIS_SCORE_CALIBRATION. See
metadata_schema.fbs [1] for more details.
An example of such model can be found at:
https://tfhub.dev/bohemian-visual-recognition-alliance/lite-model/models/mushroom-identification_v1/1
[1]:
https://github.com/google/mediapipe/blob/6cdc6443b6a7ed662744e2a2ce2d58d9c83e6d6f/mediapipe/tasks/metadata/metadata_schema.fbs#L456
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'ImageClassifier':

View File

@ -86,7 +86,24 @@ class ImageEmbedderOptions:
class ImageEmbedder(base_vision_task_api.BaseVisionTaskApi):
"""Class that performs embedding extraction on images."""
"""Class that performs embedding extraction on images.
The API expects a TFLite model with optional, but strongly recommended,
TFLite Model Metadata.
Input tensor:
(kTfLiteUInt8/kTfLiteFloat32)
- image input of size `[batch x height x width x channels]`.
- batch inference is not supported (`batch` is required to be 1).
- only RGB inputs are supported (`channels` is required to be 3).
- if type is kTfLiteFloat32, NormalizationOptions are required to be
attached to the metadata for input normalization.
At least one output tensor with:
(kTfLiteUInt8/kTfLiteFloat32)
- `N` components corresponding to the `N` dimensions of the returned
feature vector for this output layer.
- Either 2 or 4 dimensions, i.e. `[1 x N]` or `[1 x 1 x 1 x N]`.
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'ImageEmbedder':

View File

@ -93,7 +93,29 @@ class ImageSegmenterOptions:
class ImageSegmenter(base_vision_task_api.BaseVisionTaskApi):
"""Class that performs image segmentation on images."""
"""Class that performs image segmentation on images.
The API expects a TFLite model with mandatory TFLite Model Metadata.
Input tensor:
(kTfLiteUInt8/kTfLiteFloat32)
- image input of size `[batch x height x width x channels]`.
- batch inference is not supported (`batch` is required to be 1).
- RGB and greyscale inputs are supported (`channels` is required to be
1 or 3).
- if type is kTfLiteFloat32, NormalizationOptions are required to be
attached to the metadata for input normalization.
Output tensors:
(kTfLiteUInt8/kTfLiteFloat32)
- list of segmented masks.
- if `output_type` is CATEGORY_MASK, uint8 Image, Image vector of size 1.
- if `output_type` is CONFIDENCE_MASK, float32 Image list of size
`channels`.
- batch is always 1
An example of such model can be found at:
https://tfhub.dev/tensorflow/lite-model/deeplabv3/1/metadata/2
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'ImageSegmenter':

View File

@ -98,7 +98,49 @@ class ObjectDetectorOptions:
class ObjectDetector(base_vision_task_api.BaseVisionTaskApi):
"""Class that performs object detection on images."""
"""Class that performs object detection on images.
The API expects a TFLite model with mandatory TFLite Model Metadata.
Input tensor:
(kTfLiteUInt8/kTfLiteFloat32)
- image input of size `[batch x height x width x channels]`.
- batch inference is not supported (`batch` is required to be 1).
- only RGB inputs are supported (`channels` is required to be 3).
- if type is kTfLiteFloat32, NormalizationOptions are required to be
attached to the metadata for input normalization.
Output tensors must be the 4 outputs of a `DetectionPostProcess` op, i.e:
(kTfLiteFloat32)
- locations tensor of size `[num_results x 4]`, the inner array
representing bounding boxes in the form [top, left, right, bottom].
- BoundingBoxProperties are required to be attached to the metadata
and must specify type=BOUNDARIES and coordinate_type=RATIO.
(kTfLiteFloat32)
- classes tensor of size `[num_results]`, each value representing the
integer index of a class.
- optional (but recommended) label map(s) can be attached as
AssociatedFile-s with type TENSOR_VALUE_LABELS, containing one label per
line. The first such AssociatedFile (if any) is used to fill the
`class_name` field of the results. The `display_name` field is filled
from the AssociatedFile (if any) whose locale matches the
`display_names_locale` field of the `ObjectDetectorOptions` used at
creation time ("en" by default, i.e. English). If none of these are
available, only the `index` field of the results will be filled.
(kTfLiteFloat32)
- scores tensor of size `[num_results]`, each value representing the score
of the detected object.
- optional score calibration can be attached using ScoreCalibrationOptions
and an AssociatedFile with type TENSOR_AXIS_SCORE_CALIBRATION. See
metadata_schema.fbs [1] for more details.
(kTfLiteFloat32)
- integer num_results as a tensor of size `[1]`
An example of such model can be found at:
https://tfhub.dev/google/lite-model/object_detection/mobile_object_localizer_v1/1/metadata/1
[1]:
https://github.com/google/mediapipe/blob/6cdc6443b6a7ed662744e2a2ce2d58d9c83e6d6f/mediapipe/tasks/metadata/metadata_schema.fbs#L456
"""
@classmethod
def create_from_model_path(cls, model_path: str) -> 'ObjectDetector':