Implement MediaPipe Tasks Python AudioData.

PiperOrigin-RevId: 486147173
2022-11-04 08:30:30 -07:00 · 2022-11-04 08:30:30 -07:00 · 5f5f50d8f7
commit 5f5f50d8f7
parent 5024c815f1
2 changed files with 114 additions and 0 deletions
--- a/mediapipe/tasks/python/components/containers/BUILD
+++ b/mediapipe/tasks/python/components/containers/BUILD
@ -18,6 +18,11 @@ package(default_visibility = ["//mediapipe/tasks:internal"])
 licenses(["notice"])
 py_library(
    name = "audio_data",
    srcs = ["audio_data.py"],
 )
 py_library(
    name = "bounding_box",
    srcs = ["bounding_box.py"],
--- a/mediapipe/tasks/python/components/containers/audio_data.py
+++ b/mediapipe/tasks/python/components/containers/audio_data.py
@ -0,0 +1,109 @@
 # Copyright 2022 The MediaPipe Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """MediaPipe audio data."""
 import dataclasses
 from typing import Optional
 import numpy as np
@dataclasses.dataclass
 class AudioFormat:
  """Audio format metadata.
  Attributes:
    num_channels: the number of channels of the audio data.
    sample_rate: the audio sample rate.
  """
  num_channels: int = 1
  sample_rate: Optional[float] = None
 class AudioData(object):
  """MediaPipe Tasks' audio container."""
  def __init__(
      self, buffer_length: int,
      audio_format: AudioFormat = AudioFormat()) -> None:
    """Initializes the `AudioData` object.
    Args:
      buffer_length: the length of the audio buffer.
      audio_format: the audio format metadata.
    """
    self._audio_format = audio_format
    self._buffer = np.zeros([buffer_length, self._audio_format.num_channels],
                            dtype=np.float32)
  def clear(self):
    """Clears the internal buffer and fill it with zeros."""
    self._buffer.fill(0)
  def load_from_array(self,
                      src: np.ndarray,
                      offset: int = 0,
                      size: int = -1) -> None:
    """Loads the audio data from a NumPy array.
    Args:
      src: A NumPy source array contains the input audio.
      offset: An optional offset for loading a slice of the `src` array to the
        buffer.
      size: An optional size parameter denoting the number of samples to load
        from the `src` array.
    Raises:
      ValueError: If the input array has an incorrect shape or if
        `offset` + `size` exceeds the length of the `src` array.
    """
    if src.shape[1] != self._audio_format.num_channels:
      raise ValueError(f"Input audio contains an invalid number of channels. "
                       f"Expect {self._audio_format.num_channels}.")
    if size < 0:
      size = len(src)
    if offset + size > len(src):
      raise ValueError(
          f"Index out of range. offset {offset} + size {size} should be <= "
          f"src's length: {len(src)}")
    if len(src) >= len(self._buffer):
      # If the internal buffer is shorter than the load target (src), copy
      # values from the end of the src array to the internal buffer.
      new_offset = offset + size - len(self._buffer)
      new_size = len(self._buffer)
      self._buffer = src[new_offset:new_offset + new_size].copy()
    else:
      # Shift the internal buffer backward and add the incoming data to the end
      # of the buffer.
      shift = size
      self._buffer = np.roll(self._buffer, -shift, axis=0)
      self._buffer[-shift:, :] = src[offset:offset + size].copy()
  @property
  def audio_format(self) -> AudioFormat:
    """Gets the audio format of the audio."""
    return self._audio_format
  @property
  def buffer_length(self) -> int:
    """Gets the sample count of the audio."""
    return self._buffer.shape[0]
  @property
  def buffer(self) -> np.ndarray:
    """Gets the internal buffer."""
    return self._buffer