Merge branch 'master' into image-embedder-python
commit 83608d4670

mediapipe/framework/api2/builder.h
@@ -1,8 +1,10 @@
#ifndef MEDIAPIPE_FRAMEWORK_API2_BUILDER_H_
#define MEDIAPIPE_FRAMEWORK_API2_BUILDER_H_

#include <memory>
#include <string>
#include <type_traits>
#include <vector>

#include "absl/container/btree_map.h"
#include "absl/strings/string_view.h"
@@ -74,6 +76,7 @@ class TagIndexMap {

class Graph;
class NodeBase;
class PacketGenerator;

// These structs are used internally to store information about the endpoints
// of a connection.
@@ -146,6 +149,7 @@ template <bool IsSide, typename T>
class SourceImpl {
 public:
  using Base = SourceBase;
  using PayloadT = T;

  // Src is used as the return type of fluent methods below. Since these are
  // single-port methods, it is desirable to always decay to a reference to the
@@ -201,10 +205,61 @@ class SourceImpl {
// when building the graph.
template <typename T = internal::Generic>
using Source = SourceImpl<false, T>;

// Represents a stream of packets of a particular type.
//
// The intended use:
// - decouple input/output streams from graph/node during graph construction
// - pass streams around and connect them as needed, extracting reusable parts
//   to utility/convenience functions or classes.
//
// For example:
//   Stream<Image> Resize(Stream<Image> image, const Size& size, Graph& graph) {
//     auto& scaler_node = graph.AddNode("GlScalerCalculator");
//     auto& opts = scaler_node.GetOptions<GlScalerCalculatorOptions>();
//     opts.set_output_width(size.width);
//     opts.set_output_height(size.height);
//     image >> scaler_node.In("IMAGE");
//     return scaler_node.Out("IMAGE").Cast<Image>();
//   }
//
// Where graph can use it as:
//   Graph graph;
//   Stream<Image> input_image = graph.In("INPUT_IMAGE").Cast<Image>();
//   Stream<Image> resized_image = Resize(input_image, {64, 64}, graph);
template <typename T>
using Stream = Source<T>;

template <typename T = internal::Generic>
using MultiSource = MultiPort<Source<T>>;

template <typename T = internal::Generic>
using SideSource = SourceImpl<true, T>;

// Represents a side packet of a particular type.
//
// The intended use:
// - decouple input/output side packets from graph/node during graph
//   construction
// - pass side packets around and connect them as needed, extracting reusable
//   parts to utility/convenience functions or classes.
//
// For example:
//   SidePacket<TfLiteModelPtr> GetModel(SidePacket<std::string> model_blob,
//                                       Graph& graph) {
//     auto& model_node = graph.AddNode("TfLiteModelCalculator");
//     model_blob >> model_node.SideIn("MODEL_BLOB");
//     return model_node.SideOut("MODEL").Cast<TfLiteModelPtr>();
//   }
//
// Where graph can use it as:
//   Graph graph;
//   SidePacket<std::string> model_blob =
//       graph.SideIn("MODEL_BLOB").Cast<std::string>();
//   SidePacket<TfLiteModelPtr> model = GetModel(model_blob, graph);
template <typename T>
using SidePacket = SideSource<T>;

template <typename T = internal::Generic>
using MultiSideSource = MultiPort<SideSource<T>>;
@@ -87,6 +87,7 @@ cc_library(
cc_library(
    name = "builtin_task_graphs",
    deps = [
        "//mediapipe/tasks/cc/audio/audio_classifier:audio_classifier_graph",
        "//mediapipe/tasks/cc/vision/gesture_recognizer:gesture_recognizer_graph",
        "//mediapipe/tasks/cc/vision/image_classifier:image_classifier_graph",
        "//mediapipe/tasks/cc/vision/image_segmenter:image_segmenter_graph",
@@ -361,7 +361,7 @@ void PublicPacketCreators(pybind11::module* m) {
        packet = mp.packet_creator.create_float(0.1)
        data = mp.packet_getter.get_float(packet)
      )doc",
-      py::arg().noconvert(), py::return_value_policy::move);
+      py::return_value_policy::move);

  m->def(
      "create_double", [](double data) { return MakePacket<double>(data); },

@@ -380,7 +380,7 @@ void PublicPacketCreators(pybind11::module* m) {
        packet = mp.packet_creator.create_double(0.1)
        data = mp.packet_getter.get_float(packet)
      )doc",
-      py::arg().noconvert(), py::return_value_policy::move);
+      py::return_value_policy::move);

  m->def(
      "create_int_array",
@@ -37,7 +37,6 @@ cc_library(
        "//mediapipe/tasks/cc/audio/utils:audio_tensor_specs",
        "//mediapipe/tasks/cc/components/containers/proto:classifications_cc_proto",
        "//mediapipe/tasks/cc/components/processors:classification_postprocessing_graph",
        "//mediapipe/tasks/cc/components/processors:classifier_options",
        "//mediapipe/tasks/cc/components/processors/proto:classification_postprocessing_graph_options_cc_proto",
        "//mediapipe/tasks/cc/core:model_resources",
        "//mediapipe/tasks/cc/core:model_task_graph",
@@ -63,10 +63,8 @@ CalculatorGraphConfig CreateGraphConfig(
  api2::builder::Graph graph;
  auto& subgraph = graph.AddNode(kSubgraphTypeName);
  graph.In(kAudioTag).SetName(kAudioStreamName) >> subgraph.In(kAudioTag);
-  if (!options_proto->base_options().use_stream_mode()) {
  graph.In(kSampleRateTag).SetName(kSampleRateName) >>
      subgraph.In(kSampleRateTag);
-  }
  subgraph.GetOptions<proto::AudioClassifierGraphOptions>().Swap(
      options_proto.get());
  subgraph.Out(kClassificationsTag).SetName(kClassificationsName) >>
@@ -93,9 +91,6 @@ ConvertAudioClassifierOptionsToProto(AudioClassifierOptions* options) {
      &(options->classifier_options)));
  options_proto->mutable_classifier_options()->Swap(
      classifier_options_proto.get());
-  if (options->sample_rate > 0) {
-    options_proto->set_default_input_audio_sample_rate(options->sample_rate);
-  }
  return options_proto;
}
@@ -129,14 +124,6 @@ absl::StatusOr<AudioClassifierResult> ConvertAsyncOutputPackets(
/* static */
absl::StatusOr<std::unique_ptr<AudioClassifier>> AudioClassifier::Create(
    std::unique_ptr<AudioClassifierOptions> options) {
-  if (options->running_mode == core::RunningMode::AUDIO_STREAM &&
-      options->sample_rate < 0) {
-    return CreateStatusWithPayload(
-        absl::StatusCode::kInvalidArgument,
-        "The audio classifier is in audio stream mode, the sample rate must be "
-        "specified in the AudioClassifierOptions.",
-        MediaPipeTasksStatus::kInvalidTaskGraphConfigError);
-  }
  auto options_proto = ConvertAudioClassifierOptionsToProto(options.get());
  tasks::core::PacketsCallback packets_callback = nullptr;
  if (options->result_callback) {
@@ -161,7 +148,9 @@ absl::StatusOr<std::vector<AudioClassifierResult>> AudioClassifier::Classify(
}

absl::Status AudioClassifier::ClassifyAsync(Matrix audio_block,
+                                            double audio_sample_rate,
                                            int64 timestamp_ms) {
+  MP_RETURN_IF_ERROR(CheckOrSetSampleRate(kSampleRateName, audio_sample_rate));
  return SendAudioStreamData(
      {{kAudioStreamName,
        MakePacket<Matrix>(std::move(audio_block))
@@ -52,15 +52,10 @@ struct AudioClassifierOptions {
  // 1) The audio clips mode for running classification on independent audio
  //    clips.
  // 2) The audio stream mode for running classification on the audio stream,
-  //    such as from microphone. In this mode, the "sample_rate" below must be
-  //    provided, and the "result_callback" below must be specified to receive
-  //    the classification results asynchronously.
+  //    such as from microphone. In this mode, the "result_callback" below must
+  //    be specified to receive the classification results asynchronously.
  core::RunningMode running_mode = core::RunningMode::AUDIO_CLIPS;

-  // The sample rate of the input audios. Must be set when the running mode is
-  // set to RunningMode::AUDIO_STREAM.
-  double sample_rate = -1.0;
-
  // The user-defined result callback for processing audio stream data.
  // The result callback should only be specified when the running mode is set
  // to RunningMode::AUDIO_STREAM.
@@ -160,15 +155,17 @@ class AudioClassifier : tasks::audio::core::BaseAudioTaskApi {
  // The audio block is represented as a MediaPipe Matrix that has the number
  // of channels rows and the number of samples per channel columns. The audio
  // data will be resampled, accumulated, and framed to the proper size for the
-  // underlying model to consume. It's required to provide a timestamp (in
-  // milliseconds) to indicate the start time of the input audio block. The
+  // underlying model to consume. It's required to provide the corresponding
+  // audio sample rate along with the input audio block as well as a timestamp
+  // (in milliseconds) to indicate the start time of the input audio block. The
  // timestamps must be monotonically increasing.
  //
  // The input audio block may be longer than what the model is able to process
  // in a single inference. When this occurs, the input audio block is split
  // into multiple chunks. For this reason, the callback may be called multiple
  // times (once per chunk) for each call to this function.
-  absl::Status ClassifyAsync(mediapipe::Matrix audio_block, int64 timestamp_ms);
+  absl::Status ClassifyAsync(mediapipe::Matrix audio_block,
+                             double audio_sample_rate, int64 timestamp_ms);

  // Shuts down the AudioClassifier when all work is done.
  absl::Status Close() { return runner_->Close(); }
@@ -72,18 +72,6 @@ struct AudioClassifierOutputStreams {
  Source<std::vector<ClassificationResult>> timestamped_classifications;
};

-absl::Status SanityCheckOptions(
-    const proto::AudioClassifierGraphOptions& options) {
-  if (options.base_options().use_stream_mode() &&
-      !options.has_default_input_audio_sample_rate()) {
-    return CreateStatusWithPayload(absl::StatusCode::kInvalidArgument,
-                                   "In the streaming mode, the default input "
-                                   "audio sample rate must be set.",
-                                   MediaPipeTasksStatus::kInvalidArgumentError);
-  }
-  return absl::OkStatus();
-}
-
// Builds an AudioTensorSpecs for configuring the preprocessing calculators.
absl::StatusOr<AudioTensorSpecs> BuildPreprocessingSpecs(
    const core::ModelResources& model_resources) {
@@ -170,19 +158,12 @@ class AudioClassifierGraph : public core::ModelTaskGraph {
        const auto* model_resources,
        CreateModelResources<proto::AudioClassifierGraphOptions>(sc));
    Graph graph;
-    const bool use_stream_mode =
-        sc->Options<proto::AudioClassifierGraphOptions>()
-            .base_options()
-            .use_stream_mode();
    ASSIGN_OR_RETURN(
        auto output_streams,
        BuildAudioClassificationTask(
            sc->Options<proto::AudioClassifierGraphOptions>(), *model_resources,
            graph[Input<Matrix>(kAudioTag)],
-            use_stream_mode
-                ? absl::nullopt
-                : absl::make_optional(graph[Input<double>(kSampleRateTag)]),
-            graph));
+            absl::make_optional(graph[Input<double>(kSampleRateTag)]), graph));
    output_streams.classifications >>
        graph[Output<ClassificationResult>(kClassificationsTag)];
    output_streams.timestamped_classifications >>
@@ -207,7 +188,6 @@ class AudioClassifierGraph : public core::ModelTaskGraph {
      const proto::AudioClassifierGraphOptions& task_options,
      const core::ModelResources& model_resources, Source<Matrix> audio_in,
      absl::optional<Source<double>> sample_rate_in, Graph& graph) {
-    MP_RETURN_IF_ERROR(SanityCheckOptions(task_options));
    const bool use_stream_mode = task_options.base_options().use_stream_mode();
    const auto* metadata_extractor = model_resources.GetMetadataExtractor();
    // Checks that metadata is available.
@@ -70,6 +70,8 @@ Matrix GetAudioData(absl::string_view filename) {
  return matrix_mapping.matrix();
}

+// TODO: Compares the exact score values to capture unexpected
+// changes in the inference pipeline.
void CheckSpeechResult(const std::vector<AudioClassifierResult>& result,
                       int expected_num_categories = 521) {
  EXPECT_EQ(result.size(), 5);
@@ -90,13 +92,15 @@ void CheckSpeechResult(const std::vector<AudioClassifierResult>& result,
  }
}

+// TODO: Compares the exact score values to capture unexpected
+// changes in the inference pipeline.
void CheckTwoHeadsResult(const std::vector<AudioClassifierResult>& result) {
  EXPECT_GE(result.size(), 1);
  EXPECT_LE(result.size(), 2);
-  // Check first result.
+  // Check the first result.
  EXPECT_EQ(result[0].timestamp_ms, 0);
  EXPECT_EQ(result[0].classifications.size(), 2);
-  // Check first head.
+  // Check the first head.
  EXPECT_EQ(result[0].classifications[0].head_index, 0);
  EXPECT_EQ(result[0].classifications[0].head_name, "yamnet_classification");
  EXPECT_EQ(result[0].classifications[0].categories.size(), 521);
@@ -104,19 +108,19 @@ void CheckTwoHeadsResult(const std::vector<AudioClassifierResult>& result) {
  EXPECT_EQ(result[0].classifications[0].categories[0].category_name,
            "Environmental noise");
  EXPECT_GT(result[0].classifications[0].categories[0].score, 0.5f);
-  // Check second head.
+  // Check the second head.
  EXPECT_EQ(result[0].classifications[1].head_index, 1);
  EXPECT_EQ(result[0].classifications[1].head_name, "bird_classification");
  EXPECT_EQ(result[0].classifications[1].categories.size(), 5);
  EXPECT_EQ(result[0].classifications[1].categories[0].index, 4);
  EXPECT_EQ(result[0].classifications[1].categories[0].category_name,
            "Chestnut-crowned Antpitta");
-  EXPECT_GT(result[0].classifications[1].categories[0].score, 0.9f);
-  // Check second result, if present.
+  EXPECT_GT(result[0].classifications[1].categories[0].score, 0.93f);
+  // Check the second result, if present.
  if (result.size() == 2) {
    EXPECT_EQ(result[1].timestamp_ms, 975);
    EXPECT_EQ(result[1].classifications.size(), 2);
-    // Check first head.
+    // Check the first head.
    EXPECT_EQ(result[1].classifications[0].head_index, 0);
    EXPECT_EQ(result[1].classifications[0].head_name, "yamnet_classification");
    EXPECT_EQ(result[1].classifications[0].categories.size(), 521);
@@ -124,7 +128,7 @@ void CheckTwoHeadsResult(const std::vector<AudioClassifierResult>& result) {
    EXPECT_EQ(result[1].classifications[0].categories[0].category_name,
              "Silence");
    EXPECT_GT(result[1].classifications[0].categories[0].score, 0.99f);
-    // Check second head.
+    // Check the second head.
    EXPECT_EQ(result[1].classifications[1].head_index, 1);
    EXPECT_EQ(result[1].classifications[1].head_name, "bird_classification");
    EXPECT_EQ(result[1].classifications[1].categories.size(), 5);
@@ -234,7 +238,6 @@ TEST_F(CreateFromOptionsTest, FailsWithMissingCallback) {
  options->base_options.model_asset_path =
      JoinPath("./", kTestDataDirectory, kModelWithoutMetadata);
  options->running_mode = core::RunningMode::AUDIO_STREAM;
-  options->sample_rate = 16000;
  StatusOr<std::unique_ptr<AudioClassifier>> audio_classifier_or =
      AudioClassifier::Create(std::move(options));
@@ -266,25 +269,6 @@ TEST_F(CreateFromOptionsTest, FailsWithUnnecessaryCallback) {
                  MediaPipeTasksStatus::kInvalidTaskGraphConfigError))));
}

-TEST_F(CreateFromOptionsTest, FailsWithMissingDefaultInputAudioSampleRate) {
-  auto options = std::make_unique<AudioClassifierOptions>();
-  options->base_options.model_asset_path =
-      JoinPath("./", kTestDataDirectory, kModelWithoutMetadata);
-  options->running_mode = core::RunningMode::AUDIO_STREAM;
-  options->result_callback =
-      [](absl::StatusOr<AudioClassifierResult> status_or_result) {};
-  StatusOr<std::unique_ptr<AudioClassifier>> audio_classifier_or =
-      AudioClassifier::Create(std::move(options));
-
-  EXPECT_EQ(audio_classifier_or.status().code(),
-            absl::StatusCode::kInvalidArgument);
-  EXPECT_THAT(audio_classifier_or.status().message(),
-              HasSubstr("the sample rate must be specified"));
-  EXPECT_THAT(audio_classifier_or.status().GetPayload(kMediaPipeTasksPayload),
-              Optional(absl::Cord(absl::StrCat(
-                  MediaPipeTasksStatus::kInvalidTaskGraphConfigError))));
-}
-
class ClassifyTest : public tflite_shims::testing::Test {};

TEST_F(ClassifyTest, Succeeds) {
@@ -493,7 +477,6 @@ TEST_F(ClassifyAsyncTest, Succeeds) {
  options->classifier_options.max_results = 1;
  options->classifier_options.score_threshold = 0.3f;
  options->running_mode = core::RunningMode::AUDIO_STREAM;
-  options->sample_rate = kSampleRateHz;
  std::vector<AudioClassifierResult> outputs;
  options->result_callback =
      [&outputs](absl::StatusOr<AudioClassifierResult> status_or_result) {
@@ -506,7 +489,7 @@ TEST_F(ClassifyAsyncTest, Succeeds) {
    int num_samples = std::min((int)(audio_buffer.cols() - start_col),
                               kYamnetNumOfAudioSamples * 3);
    MP_ASSERT_OK(audio_classifier->ClassifyAsync(
-        audio_buffer.block(0, start_col, 1, num_samples),
+        audio_buffer.block(0, start_col, 1, num_samples), kSampleRateHz,
        start_col * kMilliSecondsPerSecond / kSampleRateHz));
    start_col += kYamnetNumOfAudioSamples * 3;
  }
@@ -523,7 +506,6 @@ TEST_F(ClassifyAsyncTest, SucceedsWithNonDeterministicNumAudioSamples) {
  options->classifier_options.max_results = 1;
  options->classifier_options.score_threshold = 0.3f;
  options->running_mode = core::RunningMode::AUDIO_STREAM;
-  options->sample_rate = kSampleRateHz;
  std::vector<AudioClassifierResult> outputs;
  options->result_callback =
      [&outputs](absl::StatusOr<AudioClassifierResult> status_or_result) {
@@ -538,7 +520,7 @@ TEST_F(ClassifyAsyncTest, SucceedsWithNonDeterministicNumAudioSamples) {
        std::min((int)(audio_buffer.cols() - start_col),
                 rand_r(&rseed) % 10 + kYamnetNumOfAudioSamples * 3);
    MP_ASSERT_OK(audio_classifier->ClassifyAsync(
-        audio_buffer.block(0, start_col, 1, num_samples),
+        audio_buffer.block(0, start_col, 1, num_samples), kSampleRateHz,
        start_col * kMilliSecondsPerSecond / kSampleRateHz));
    start_col += num_samples;
  }
@@ -72,8 +72,39 @@ class BaseAudioTaskApi : public tasks::core::BaseTaskApi {
    return runner_->Send(std::move(inputs));
  }

+  // Checks or sets the sample rate in the audio stream mode.
+  absl::Status CheckOrSetSampleRate(std::string sample_rate_stream_name,
+                                    double sample_rate) {
+    if (running_mode_ != RunningMode::AUDIO_STREAM) {
+      return CreateStatusWithPayload(
+          absl::StatusCode::kInvalidArgument,
+          absl::StrCat("Task is not initialized with the audio stream mode. "
+                       "Current running mode:",
+                       GetRunningModeName(running_mode_)),
+          MediaPipeTasksStatus::kRunnerApiCalledInWrongModeError);
+    }
+    if (default_sample_rate_ > 0) {
+      if (std::fabs(sample_rate - default_sample_rate_) >
+          std::numeric_limits<double>::epsilon()) {
+        return CreateStatusWithPayload(
+            absl::StatusCode::kInvalidArgument,
+            absl::StrCat("The input audio sample rate: ", sample_rate,
+                         " is inconsistent with the previously provided: ",
+                         default_sample_rate_),
+            MediaPipeTasksStatus::kInvalidArgumentError);
+      }
+    } else {
+      default_sample_rate_ = sample_rate;
+      MP_RETURN_IF_ERROR(runner_->Send(
+          {{sample_rate_stream_name, MakePacket<double>(default_sample_rate_)
+                                         .At(Timestamp::PreStream())}}));
+    }
+    return absl::OkStatus();
+  }
+
 private:
  RunningMode running_mode_;
+  double default_sample_rate_ = -1.0;
};

}  // namespace core
@@ -14,8 +14,10 @@

"""MediaPipe Tasks API."""

from . import audio
from . import components
from . import core
from . import text
from . import vision

BaseOptions = core.base_options.BaseOptions
mediapipe/tasks/python/audio/BUILD (new file, 41 lines)
@@ -0,0 +1,41 @@
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Placeholder for internal Python strict library and test compatibility macro.

package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

py_library(
    name = "audio_classifier",
    srcs = [
        "audio_classifier.py",
    ],
    deps = [
        "//mediapipe/python:_framework_bindings",
        "//mediapipe/python:packet_creator",
        "//mediapipe/python:packet_getter",
        "//mediapipe/tasks/cc/audio/audio_classifier/proto:audio_classifier_graph_options_py_pb2",
        "//mediapipe/tasks/cc/components/containers/proto:classifications_py_pb2",
        "//mediapipe/tasks/python/audio/core:audio_task_running_mode",
        "//mediapipe/tasks/python/audio/core:base_audio_task_api",
        "//mediapipe/tasks/python/components/containers:audio_data",
        "//mediapipe/tasks/python/components/containers:classification_result",
        "//mediapipe/tasks/python/components/processors:classifier_options",
        "//mediapipe/tasks/python/core:base_options",
        "//mediapipe/tasks/python/core:optional_dependencies",
        "//mediapipe/tasks/python/core:task_info",
    ],
)
mediapipe/tasks/python/audio/__init__.py (new file, 27 lines)
@@ -0,0 +1,27 @@
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MediaPipe Tasks Audio API."""

import mediapipe.tasks.python.audio.core
import mediapipe.tasks.python.audio.audio_classifier

AudioClassifier = audio_classifier.AudioClassifier
AudioClassifierOptions = audio_classifier.AudioClassifierOptions
RunningMode = core.audio_task_running_mode.AudioTaskRunningMode

# Remove unnecessary modules to avoid duplication in API docs.
del audio_classifier
del core
del mediapipe
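For reference, a minimal sketch of how these re-exports are meant to be consumed, built only from the aliases defined above; the model path is a placeholder:

    from mediapipe.tasks.python import audio
    from mediapipe.tasks.python.core import base_options as base_options_module

    # AudioClassifier, AudioClassifierOptions and RunningMode come from the
    # re-exports in this __init__.py.
    options = audio.AudioClassifierOptions(
        base_options=base_options_module.BaseOptions(
            model_asset_path='/path/to/model.tflite'),  # placeholder path
        running_mode=audio.RunningMode.AUDIO_CLIPS)
    classifier = audio.AudioClassifier.create_from_options(options)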
mediapipe/tasks/python/audio/audio_classifier.py (new file, 280 lines)
@@ -0,0 +1,280 @@
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MediaPipe audio classifier task."""

import dataclasses
from typing import Callable, Mapping, List, Optional

from mediapipe.python import packet_creator
from mediapipe.python import packet_getter
from mediapipe.python._framework_bindings import packet
from mediapipe.tasks.cc.audio.audio_classifier.proto import audio_classifier_graph_options_pb2
from mediapipe.tasks.cc.components.containers.proto import classifications_pb2
from mediapipe.tasks.python.audio.core import audio_task_running_mode as running_mode_module
from mediapipe.tasks.python.audio.core import base_audio_task_api
from mediapipe.tasks.python.components.containers import audio_data as audio_data_module
from mediapipe.tasks.python.components.containers import classification_result as classification_result_module
from mediapipe.tasks.python.components.processors import classifier_options as classifier_options_module
from mediapipe.tasks.python.core import base_options as base_options_module
from mediapipe.tasks.python.core import task_info as task_info_module
from mediapipe.tasks.python.core.optional_dependencies import doc_controls

AudioClassifierResult = classification_result_module.ClassificationResult
_AudioClassifierGraphOptionsProto = audio_classifier_graph_options_pb2.AudioClassifierGraphOptions
_AudioData = audio_data_module.AudioData
_BaseOptions = base_options_module.BaseOptions
_ClassifierOptions = classifier_options_module.ClassifierOptions
_RunningMode = running_mode_module.AudioTaskRunningMode
_TaskInfo = task_info_module.TaskInfo

_AUDIO_IN_STREAM_NAME = 'audio_in'
_AUDIO_TAG = 'AUDIO'
_CLASSIFICATIONS_STREAM_NAME = 'classifications_out'
_CLASSIFICATIONS_TAG = 'CLASSIFICATIONS'
_SAMPLE_RATE_IN_STREAM_NAME = 'sample_rate_in'
_SAMPLE_RATE_TAG = 'SAMPLE_RATE'
_TASK_GRAPH_NAME = 'mediapipe.tasks.audio.audio_classifier.AudioClassifierGraph'
_TIMESTAMPED_CLASSIFICATIONS_STREAM_NAME = 'timestamped_classifications_out'
_TIMESTAMPED_CLASSIFICATIONS_TAG = 'TIMESTAMPED_CLASSIFICATIONS'
_MICRO_SECONDS_PER_MILLISECOND = 1000


@dataclasses.dataclass
class AudioClassifierOptions:
  """Options for the audio classifier task.

  Attributes:
    base_options: Base options for the audio classifier task.
    running_mode: The running mode of the task. Defaults to the audio clips
      mode. Audio classifier task has two running modes: 1) The audio clips
      mode for running classification on independent audio clips. 2) The audio
      stream mode for running classification on the audio stream, such as from
      microphone. In this mode, the "result_callback" below must be specified
      to receive the classification results asynchronously.
    classifier_options: Options for configuring the classifier behavior, such
      as score threshold, number of results, etc.
    result_callback: The user-defined result callback for processing audio
      stream data. The result callback should only be specified when the
      running mode is set to the audio stream mode.
  """
  base_options: _BaseOptions
  running_mode: _RunningMode = _RunningMode.AUDIO_CLIPS
  classifier_options: _ClassifierOptions = _ClassifierOptions()
  result_callback: Optional[Callable[[AudioClassifierResult, int], None]] = None

  @doc_controls.do_not_generate_docs
  def to_pb2(self) -> _AudioClassifierGraphOptionsProto:
    """Generates an AudioClassifierOptions protobuf object."""
    base_options_proto = self.base_options.to_pb2()
    base_options_proto.use_stream_mode = False if self.running_mode == _RunningMode.AUDIO_CLIPS else True
    classifier_options_proto = self.classifier_options.to_pb2()

    return _AudioClassifierGraphOptionsProto(
        base_options=base_options_proto,
        classifier_options=classifier_options_proto)


class AudioClassifier(base_audio_task_api.BaseAudioTaskApi):
  """Class that performs audio classification on audio data."""

  @classmethod
  def create_from_model_path(cls, model_path: str) -> 'AudioClassifier':
    """Creates an `AudioClassifier` object from a TensorFlow Lite model and the default `AudioClassifierOptions`.

    Note that the created `AudioClassifier` instance is in audio clips mode,
    for classifying on independent audio clips.

    Args:
      model_path: Path to the model.

    Returns:
      `AudioClassifier` object that's created from the model file and the
      default `AudioClassifierOptions`.

    Raises:
      ValueError: If failed to create `AudioClassifier` object from the
        provided file, such as an invalid file path.
      RuntimeError: If other types of error occurred.
    """
    base_options = _BaseOptions(model_asset_path=model_path)
    options = AudioClassifierOptions(
        base_options=base_options, running_mode=_RunningMode.AUDIO_CLIPS)
    return cls.create_from_options(options)

  @classmethod
  def create_from_options(cls,
                          options: AudioClassifierOptions) -> 'AudioClassifier':
    """Creates the `AudioClassifier` object from audio classifier options.

    Args:
      options: Options for the audio classifier task.

    Returns:
      `AudioClassifier` object that's created from `options`.

    Raises:
      ValueError: If failed to create `AudioClassifier` object from
        `AudioClassifierOptions` such as missing the model.
      RuntimeError: If other types of error occurred.
    """

    def packets_callback(output_packets: Mapping[str, packet.Packet]):
      timestamp_ms = output_packets[
          _CLASSIFICATIONS_STREAM_NAME].timestamp.value // _MICRO_SECONDS_PER_MILLISECOND
      if output_packets[_CLASSIFICATIONS_STREAM_NAME].is_empty():
        options.result_callback(
            AudioClassifierResult(classifications=[]), timestamp_ms)
        return
      classification_result_proto = classifications_pb2.ClassificationResult()
      classification_result_proto.CopyFrom(
          packet_getter.get_proto(output_packets[_CLASSIFICATIONS_STREAM_NAME]))
      options.result_callback(
          AudioClassifierResult.create_from_pb2(classification_result_proto),
          timestamp_ms)

    task_info = _TaskInfo(
        task_graph=_TASK_GRAPH_NAME,
        input_streams=[
            ':'.join([_AUDIO_TAG, _AUDIO_IN_STREAM_NAME]),
            ':'.join([_SAMPLE_RATE_TAG, _SAMPLE_RATE_IN_STREAM_NAME])
        ],
        output_streams=[
            ':'.join([_CLASSIFICATIONS_TAG, _CLASSIFICATIONS_STREAM_NAME]),
            ':'.join([
                _TIMESTAMPED_CLASSIFICATIONS_TAG,
                _TIMESTAMPED_CLASSIFICATIONS_STREAM_NAME
            ])
        ],
        task_options=options)
    return cls(
        # Audio tasks should not drop input audio due to flow limiting, which
        # may cause data inconsistency.
        task_info.generate_graph_config(enable_flow_limiting=False),
        options.running_mode,
        packets_callback if options.result_callback else None)

  def classify(self, audio_clip: _AudioData) -> List[AudioClassifierResult]:
    """Performs audio classification on the provided audio clip.

    The audio clip is represented as a MediaPipe AudioData. The method accepts
    audio clips of various lengths and audio sample rates. It's required to
    provide the corresponding audio sample rate within the `AudioData` object.

    The input audio clip may be longer than what the model is able to process
    in a single inference. When this occurs, the input audio clip is split into
    multiple chunks starting at different timestamps. For this reason, this
    function returns a list of ClassificationResult objects, each associated
    with a timestamp corresponding to the start (in milliseconds) of the chunk
    data that was classified, e.g.:

      ClassificationResult #0 (first chunk of data):
        timestamp_ms: 0 (starts at 0ms)
        classifications #0 (single head model):
          category #0:
            category_name: "Speech"
            score: 0.6
          category #1:
            category_name: "Music"
            score: 0.2
      ClassificationResult #1 (second chunk of data):
        timestamp_ms: 800 (starts at 800ms)
        classifications #0 (single head model):
          category #0:
            category_name: "Speech"
            score: 0.5
          category #1:
            category_name: "Silence"
            score: 0.1

    Args:
      audio_clip: MediaPipe AudioData.

    Returns:
      A list of `AudioClassifierResult` objects, each associated with a
      timestamp corresponding to the start (in milliseconds) of the chunk data
      that was classified.

    Raises:
      ValueError: If any of the input arguments is invalid, such as the sample
        rate is not provided in the `AudioData` object.
      RuntimeError: If audio classification failed to run.
    """
    if not audio_clip.audio_format.sample_rate:
      raise ValueError('Must provide the audio sample rate in audio data.')
    output_packets = self._process_audio_clip({
        _AUDIO_IN_STREAM_NAME:
            packet_creator.create_matrix(audio_clip.buffer, transpose=True),
        _SAMPLE_RATE_IN_STREAM_NAME:
            packet_creator.create_double(audio_clip.audio_format.sample_rate)
    })
    output_list = []
    classification_result_proto_list = packet_getter.get_proto_list(
        output_packets[_TIMESTAMPED_CLASSIFICATIONS_STREAM_NAME])
    for proto in classification_result_proto_list:
      classification_result_proto = classifications_pb2.ClassificationResult()
      classification_result_proto.CopyFrom(proto)
      output_list.append(
          AudioClassifierResult.create_from_pb2(classification_result_proto))
    return output_list

  def classify_async(self, audio_block: _AudioData, timestamp_ms: int) -> None:
    """Sends audio data (a block in a continuous audio stream) to perform audio classification.

    Only use this method when the AudioClassifier is created with the audio
    stream running mode. The input timestamps should be monotonically
    increasing for adjacent calls of this method. This method will return
    immediately after the input audio data is accepted. The results will be
    available via the `result_callback` provided in the
    `AudioClassifierOptions`. The `classify_async` method is designed to
    process audio stream data such as microphone input.

    The input audio data may be longer than what the model is able to process
    in a single inference. When this occurs, the input audio block is split
    into multiple chunks. For this reason, the callback may be called multiple
    times (once per chunk) for each call to this function.

    The `result_callback` provides:
      - An `AudioClassifierResult` object that contains a list of
        classifications.
      - The input timestamp in milliseconds.

    Args:
      audio_block: MediaPipe AudioData.
      timestamp_ms: The timestamp of the input audio data in milliseconds.

    Raises:
      ValueError: If any of the following:
        1) The sample rate is not provided in the `AudioData` object or the
           provided sample rate is inconsistent with the previously received
           one.
        2) The current input timestamp is smaller than what the audio
           classifier has already processed.
    """
    if not audio_block.audio_format.sample_rate:
      raise ValueError('Must provide the audio sample rate in audio data.')
    if not self._default_sample_rate:
      self._default_sample_rate = audio_block.audio_format.sample_rate
      self._set_sample_rate(_SAMPLE_RATE_IN_STREAM_NAME,
                            self._default_sample_rate)
    elif audio_block.audio_format.sample_rate != self._default_sample_rate:
      raise ValueError(
          f'The audio sample rate provided in audio data: '
          f'{audio_block.audio_format.sample_rate} is inconsistent with '
          f'the previously received: {self._default_sample_rate}.')

    self._send_audio_stream_data({
        _AUDIO_IN_STREAM_NAME:
            packet_creator.create_matrix(audio_block.buffer, transpose=True).at(
                timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND)
    })
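For reference, a minimal clips-mode usage sketch assembled from the APIs above (the file paths are placeholders; the WAV decoding via scipy mirrors what the tests in this commit do):

    import numpy as np
    from scipy.io import wavfile

    from mediapipe.tasks.python.audio import audio_classifier
    from mediapipe.tasks.python.components.containers import audio_data as audio_data_module

    # Placeholder paths for illustration only.
    sample_rate, buffer = wavfile.read('/path/to/speech_16000_hz_mono.wav')
    clip = audio_data_module.AudioData.create_from_array(
        buffer.astype(float) / np.iinfo(np.int16).max, sample_rate)

    with audio_classifier.AudioClassifier.create_from_model_path(
        '/path/to/yamnet_audio_classifier_with_metadata.tflite') as classifier:
      # classify() returns one AudioClassifierResult per model-sized chunk.
      for result in classifier.classify(clip):
        top = result.classifications[0].categories[0]
        print(result.timestamp_ms, top.category_name, top.score)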
@@ -32,6 +32,7 @@ py_library(
        ":audio_task_running_mode",
        "//mediapipe/framework:calculator_py_pb2",
        "//mediapipe/python:_framework_bindings",
+        "//mediapipe/python:packet_creator",
        "//mediapipe/tasks/python/core:optional_dependencies",
    ],
)
@@ -16,14 +16,17 @@
from typing import Callable, Mapping, Optional

from mediapipe.framework import calculator_pb2
+from mediapipe.python import packet_creator
from mediapipe.python._framework_bindings import packet as packet_module
from mediapipe.python._framework_bindings import task_runner as task_runner_module
+from mediapipe.python._framework_bindings import timestamp as timestamp_module
from mediapipe.tasks.python.audio.core import audio_task_running_mode as running_mode_module
from mediapipe.tasks.python.core.optional_dependencies import doc_controls

_TaskRunner = task_runner_module.TaskRunner
_Packet = packet_module.Packet
_RunningMode = running_mode_module.AudioTaskRunningMode
+_Timestamp = timestamp_module.Timestamp


class BaseAudioTaskApi(object):
@@ -59,6 +62,7 @@ class BaseAudioTaskApi(object):
          'callback should not be provided.')
    self._runner = _TaskRunner.create(graph_config, packet_callback)
    self._running_mode = running_mode
+    self._default_sample_rate = None

  def _process_audio_clip(
      self, inputs: Mapping[str, _Packet]) -> Mapping[str, _Packet]:
@@ -82,6 +86,27 @@ class BaseAudioTaskApi(object):
          + self._running_mode.name)
    return self._runner.process(inputs)

+  def _set_sample_rate(self, sample_rate_stream_name: str,
+                       sample_rate: float) -> None:
+    """An asynchronous method to set audio sample rate in the audio stream mode.
+
+    Args:
+      sample_rate_stream_name: The audio sample rate stream name.
+      sample_rate: The audio sample rate.
+
+    Raises:
+      ValueError: If the task's running mode is not set to the audio stream
+        mode.
+    """
+    if self._running_mode != _RunningMode.AUDIO_STREAM:
+      raise ValueError(
+          'Task is not initialized with the audio stream mode. Current running mode:'
+          + self._running_mode.name)
+    self._runner.send({
+        sample_rate_stream_name:
+            packet_creator.create_double(sample_rate).at(_Timestamp.PRESTREAM)
+    })

  def _send_audio_stream_data(self, inputs: Mapping[str, _Packet]) -> None:
    """An asynchronous method to send audio stream data to the runner.
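Putting the pieces together, a hedged stream-mode sketch built from the additions above (the zero-filled blocks stand in for microphone buffers; paths are placeholders):

    import numpy as np

    from mediapipe.tasks.python.audio import audio_classifier
    from mediapipe.tasks.python.audio.core import audio_task_running_mode
    from mediapipe.tasks.python.components.containers import audio_data as audio_data_module
    from mediapipe.tasks.python.core import base_options as base_options_module

    def print_result(result, timestamp_ms):
      # Invoked once per processed chunk via the packets callback.
      print(timestamp_ms, len(result.classifications))

    options = audio_classifier.AudioClassifierOptions(
        base_options=base_options_module.BaseOptions(
            model_asset_path='/path/to/model.tflite'),  # placeholder path
        running_mode=audio_task_running_mode.AudioTaskRunningMode.AUDIO_STREAM,
        result_callback=print_result)

    with audio_classifier.AudioClassifier.create_from_options(options) as classifier:
      # The first block fixes the stream's sample rate (sent at PRESTREAM);
      # later blocks must match it, and timestamps must increase.
      for i, block in enumerate(np.split(np.zeros(48000), 3)):
        classifier.classify_async(
            audio_data_module.AudioData.create_from_array(block, 16000),
            timestamp_ms=i * 1000)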
@@ -95,6 +95,16 @@ py_library(
    ],
)

+py_library(
+    name = "classification_result",
+    srcs = ["classification_result.py"],
+    deps = [
+        ":category",
+        "//mediapipe/tasks/cc/components/containers/proto:classifications_py_pb2",
+        "//mediapipe/tasks/python/core:optional_dependencies",
+    ],
+)

py_library(
    name = "embeddings",
    srcs = ["embeddings.py"],
@@ -20,7 +20,7 @@ import numpy as np


@dataclasses.dataclass
-class AudioFormat:
+class AudioDataFormat:
  """Audio format metadata.

  Attributes:
@@ -35,8 +35,10 @@ class AudioData(object):
  """MediaPipe Tasks' audio container."""

  def __init__(
-      self, buffer_length: int,
-      audio_format: AudioFormat = AudioFormat()) -> None:
+      self,
+      buffer_length: int,
+      audio_format: AudioDataFormat = AudioDataFormat()
+  ) -> None:
    """Initializes the `AudioData` object.

    Args:
@@ -113,14 +115,14 @@ class AudioData(object):
    """
    obj = cls(
        buffer_length=src.shape[0],
-        audio_format=AudioFormat(
+        audio_format=AudioDataFormat(
            num_channels=1 if len(src.shape) == 1 else src.shape[1],
            sample_rate=sample_rate))
    obj.load_from_array(src)
    return obj

  @property
-  def audio_format(self) -> AudioFormat:
+  def audio_format(self) -> AudioDataFormat:
    """Gets the audio format of the audio."""
    return self._audio_format
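A small sketch of the renamed container in use (values are illustrative):

    from mediapipe.tasks.python.components.containers import audio_data as audio_data_module

    # Mono 16 kHz metadata under the new AudioDataFormat name.
    fmt = audio_data_module.AudioDataFormat(num_channels=1, sample_rate=16000)
    data = audio_data_module.AudioData(buffer_length=15600, audio_format=fmt)
    print(data.audio_format.sample_rate)  # 16000, via the renamed property type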
mediapipe/tasks/python/components/containers/classification_result.py (new file)
@@ -0,0 +1,92 @@
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classifications data class."""

import dataclasses
from typing import List, Optional

from mediapipe.tasks.cc.components.containers.proto import classifications_pb2
from mediapipe.tasks.python.components.containers import category as category_module
from mediapipe.tasks.python.core.optional_dependencies import doc_controls

_ClassificationsProto = classifications_pb2.Classifications
_ClassificationResultProto = classifications_pb2.ClassificationResult


@dataclasses.dataclass
class Classifications:
  """Represents the classification results for a given classifier head.

  Attributes:
    categories: The array of predicted categories, usually sorted by descending
      scores (e.g. from high to low probability).
    head_index: The index of the classifier head these categories refer to.
      This is useful for multi-head models.
    head_name: The name of the classifier head, which is the corresponding
      tensor metadata name.
  """

  categories: List[category_module.Category]
  head_index: int
  head_name: Optional[str] = None

  @classmethod
  @doc_controls.do_not_generate_docs
  def create_from_pb2(cls, pb2_obj: _ClassificationsProto) -> 'Classifications':
    """Creates a `Classifications` object from the given protobuf object."""
    categories = []
    for entry in pb2_obj.classification_list.classification:
      categories.append(
          category_module.Category(
              index=entry.index,
              score=entry.score,
              display_name=entry.display_name,
              category_name=entry.label))

    return Classifications(
        categories=categories,
        head_index=pb2_obj.head_index,
        head_name=pb2_obj.head_name)


@dataclasses.dataclass
class ClassificationResult:
  """Contains the classification results of a model.

  Attributes:
    classifications: A list of `Classifications` objects, each for a head of
      the model.
    timestamp_ms: The optional timestamp (in milliseconds) of the start of the
      chunk of data corresponding to these results. This is only used for
      classification on time series (e.g. audio classification). In these use
      cases, the amount of data to process might exceed the maximum size that
      the model can process: to solve this, the input data is split into
      multiple chunks starting at different timestamps.
  """

  classifications: List[Classifications]
  timestamp_ms: Optional[int] = None

  @classmethod
  @doc_controls.do_not_generate_docs
  def create_from_pb2(
      cls, pb2_obj: _ClassificationResultProto) -> 'ClassificationResult':
    """Creates a `ClassificationResult` object from the given protobuf object.
    """
    return ClassificationResult(
        classifications=[
            Classifications.create_from_pb2(classification)
            for classification in pb2_obj.classifications
        ],
        timestamp_ms=pb2_obj.timestamp_ms)
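A short sketch of the proto-to-dataclass conversion these helpers provide (the proto contents are fabricated for illustration):

    from mediapipe.tasks.cc.components.containers.proto import classifications_pb2
    from mediapipe.tasks.python.components.containers import classification_result as classification_result_module

    proto = classifications_pb2.ClassificationResult()
    proto.timestamp_ms = 0
    head = proto.classifications.add()
    head.head_index = 0
    head.head_name = 'scores'
    entry = head.classification_list.classification.add()
    entry.index = 0
    entry.score = 0.9
    entry.label = 'Speech'

    result = classification_result_module.ClassificationResult.create_from_pb2(proto)
    assert result.classifications[0].categories[0].category_name == 'Speech'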
mediapipe/tasks/python/test/audio/BUILD (new file, 37 lines)
@@ -0,0 +1,37 @@
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Placeholder for internal Python strict test compatibility macro.

package(default_visibility = ["//mediapipe/tasks:internal"])

licenses(["notice"])

py_test(
    name = "audio_classifier_test",
    srcs = ["audio_classifier_test.py"],
    data = [
        "//mediapipe/tasks/testdata/audio:test_audio_clips",
        "//mediapipe/tasks/testdata/audio:test_models",
    ],
    deps = [
        "//mediapipe/tasks/python/audio:audio_classifier",
        "//mediapipe/tasks/python/audio/core:audio_task_running_mode",
        "//mediapipe/tasks/python/components/containers:audio_data",
        "//mediapipe/tasks/python/components/containers:classification_result",
        "//mediapipe/tasks/python/components/processors:classifier_options",
        "//mediapipe/tasks/python/core:base_options",
        "//mediapipe/tasks/python/test:test_utils",
    ],
)
mediapipe/tasks/python/test/audio/__init__.py (new file, 13 lines)
@@ -0,0 +1,13 @@
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
mediapipe/tasks/python/test/audio/audio_classifier_test.py (new file, 381 lines)
@@ -0,0 +1,381 @@
# Copyright 2022 The MediaPipe Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for audio classifier."""

import os
from typing import List, Tuple
from unittest import mock

from absl.testing import absltest
from absl.testing import parameterized

import numpy as np
from scipy.io import wavfile

from mediapipe.tasks.python.audio import audio_classifier
from mediapipe.tasks.python.audio.core import audio_task_running_mode
from mediapipe.tasks.python.components.containers import audio_data as audio_data_module
from mediapipe.tasks.python.components.containers import classification_result as classification_result_module
from mediapipe.tasks.python.components.processors import classifier_options
from mediapipe.tasks.python.core import base_options as base_options_module
from mediapipe.tasks.python.test import test_utils

_AudioClassifier = audio_classifier.AudioClassifier
_AudioClassifierOptions = audio_classifier.AudioClassifierOptions
_AudioClassifierResult = classification_result_module.ClassificationResult
_AudioData = audio_data_module.AudioData
_BaseOptions = base_options_module.BaseOptions
_ClassifierOptions = classifier_options.ClassifierOptions
_RUNNING_MODE = audio_task_running_mode.AudioTaskRunningMode

_YAMNET_MODEL_FILE = 'yamnet_audio_classifier_with_metadata.tflite'
_YAMNET_MODEL_SAMPLE_RATE = 16000
_TWO_HEADS_MODEL_FILE = 'two_heads.tflite'
_SPEECH_WAV_16K_MONO = 'speech_16000_hz_mono.wav'
_SPEECH_WAV_48K_MONO = 'speech_48000_hz_mono.wav'
_TEST_DATA_DIR = 'mediapipe/tasks/testdata/audio'
_TWO_HEADS_WAV_16K_MONO = 'two_heads_16000_hz_mono.wav'
_TWO_HEADS_WAV_44K_MONO = 'two_heads_44100_hz_mono.wav'
_YAMNET_NUM_OF_SAMPLES = 15600
_MILLSECONDS_PER_SECOND = 1000


class AudioClassifierTest(parameterized.TestCase):

  def setUp(self):
    super().setUp()
    self.yamnet_model_path = test_utils.get_test_data_path(
        os.path.join(_TEST_DATA_DIR, _YAMNET_MODEL_FILE))
    self.two_heads_model_path = test_utils.get_test_data_path(
        os.path.join(_TEST_DATA_DIR, _TWO_HEADS_MODEL_FILE))

  def _read_wav_file(self, file_name) -> _AudioData:
    sample_rate, buffer = wavfile.read(
        test_utils.get_test_data_path(os.path.join(_TEST_DATA_DIR, file_name)))
    return _AudioData.create_from_array(
        buffer.astype(float) / np.iinfo(np.int16).max, sample_rate)

  def _read_wav_file_as_stream(self, file_name) -> List[Tuple[_AudioData, int]]:
    sample_rate, buffer = wavfile.read(
        test_utils.get_test_data_path(os.path.join(_TEST_DATA_DIR, file_name)))
    audio_data_list = []
    start = 0
    step_size = _YAMNET_NUM_OF_SAMPLES * sample_rate / _YAMNET_MODEL_SAMPLE_RATE
    while start < len(buffer):
      end = min(start + (int)(step_size), len(buffer))
      audio_data_list.append((_AudioData.create_from_array(
          buffer[start:end].astype(float) / np.iinfo(np.int16).max,
          sample_rate), (int)(start / sample_rate * _MILLSECONDS_PER_SECOND)))
      start = end
    return audio_data_list
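The 975 ms timestamps asserted in the checks below follow directly from this chunking; a small arithmetic sketch:

    # YAMNet consumes 15600 samples at 16 kHz, i.e. 15600 / 16000 = 0.975 s,
    # so each chunk covers 975 ms regardless of the source sample rate.
    yamnet_samples, model_rate = 15600, 16000
    for wav_rate in (16000, 48000):
      step = yamnet_samples * wav_rate / model_rate      # samples per chunk
      print(wav_rate, int(step), step / wav_rate * 1000)  # always 975.0 ms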
|
||||
# TODO: Compares the exact score values to capture unexpected
|
||||
# changes in the inference pipeline.
|
||||
def _check_yamnet_result(
|
||||
self,
|
||||
classification_result_list: List[_AudioClassifierResult],
|
||||
expected_num_categories=521):
|
||||
self.assertLen(classification_result_list, 5)
|
||||
for idx, timestamp in enumerate([0, 975, 1950, 2925]):
|
||||
classification_result = classification_result_list[idx]
|
||||
self.assertEqual(classification_result.timestamp_ms, timestamp)
|
||||
self.assertLen(classification_result.classifications, 1)
|
||||
classifcation = classification_result.classifications[0]
|
||||
self.assertEqual(classifcation.head_index, 0)
|
||||
self.assertEqual(classifcation.head_name, 'scores')
|
||||
self.assertLen(classifcation.categories, expected_num_categories)
|
||||
audio_category = classifcation.categories[0]
|
||||
self.assertEqual(audio_category.index, 0)
|
||||
self.assertEqual(audio_category.category_name, 'Speech')
|
||||
self.assertGreater(audio_category.score, 0.9)
|
||||
|
  # TODO: Compare the exact score values to capture unexpected
  # changes in the inference pipeline.
  def _check_two_heads_result(
      self,
      classification_result_list: List[_AudioClassifierResult],
      first_head_expected_num_categories=521,
      second_head_expected_num_categories=5):
    self.assertGreaterEqual(len(classification_result_list), 1)
    self.assertLessEqual(len(classification_result_list), 2)
    # Checks the first result.
    classification_result = classification_result_list[0]
    self.assertEqual(classification_result.timestamp_ms, 0)
    self.assertLen(classification_result.classifications, 2)
    # Checks the first head.
    yamnet_classification = classification_result.classifications[0]
    self.assertEqual(yamnet_classification.head_index, 0)
    self.assertEqual(yamnet_classification.head_name, 'yamnet_classification')
    self.assertLen(yamnet_classification.categories,
                   first_head_expected_num_categories)
    yamnet_category = yamnet_classification.categories[0]
    self.assertEqual(yamnet_category.index, 508)
    self.assertEqual(yamnet_category.category_name, 'Environmental noise')
    self.assertGreater(yamnet_category.score, 0.5)
    # Checks the second head.
    bird_classification = classification_result.classifications[1]
    self.assertEqual(bird_classification.head_index, 1)
    self.assertEqual(bird_classification.head_name, 'bird_classification')
    self.assertLen(bird_classification.categories,
                   second_head_expected_num_categories)
    bird_category = bird_classification.categories[0]
    self.assertEqual(bird_category.index, 4)
    self.assertEqual(bird_category.category_name, 'Chestnut-crowned Antpitta')
    self.assertGreater(bird_category.score, 0.93)
    # Checks the second result, if present.
    if len(classification_result_list) == 2:
      classification_result = classification_result_list[1]
      self.assertEqual(classification_result.timestamp_ms, 975)
      self.assertLen(classification_result.classifications, 2)
      # Checks the first head.
      yamnet_classification = classification_result.classifications[0]
      self.assertEqual(yamnet_classification.head_index, 0)
      self.assertEqual(yamnet_classification.head_name,
                       'yamnet_classification')
      self.assertLen(yamnet_classification.categories,
                     first_head_expected_num_categories)
      yamnet_category = yamnet_classification.categories[0]
      self.assertEqual(yamnet_category.index, 494)
      self.assertEqual(yamnet_category.category_name, 'Silence')
      self.assertGreater(yamnet_category.score, 0.9)
      # Checks the second head.
      bird_classification = classification_result.classifications[1]
      self.assertEqual(bird_classification.head_index, 1)
      self.assertEqual(bird_classification.head_name, 'bird_classification')
      self.assertLen(bird_classification.categories,
                     second_head_expected_num_categories)
      bird_category = bird_classification.categories[0]
      self.assertEqual(bird_category.index, 1)
      self.assertEqual(bird_category.category_name, 'White-breasted Wood-Wren')
      self.assertGreater(bird_category.score, 0.99)

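The checks above and the tests below all consume audio through a _read_wav_file helper defined earlier in this file, outside this hunk. For orientation, a minimal sketch of what such a helper can look like, assuming scipy.io.wavfile, numpy (as np), and a test_utils.get_test_data_path path helper are imported near the top of the file; the int16 normalization is an assumption about the test assets:

  # Sketch only -- the real helper lives earlier in this file. Assumes the
  # test WAVs are 16-bit PCM mono clips.
  def _read_wav_file(self, file_name) -> _AudioData:
    # Resolve the test asset and decode it into (sample_rate, int16 samples).
    sample_rate, buffer = wavfile.read(
        test_utils.get_test_data_path(file_name))
    # Normalize samples to floats in [-1, 1] before wrapping them.
    return _AudioData.create_from_array(
        buffer.astype(float) / np.iinfo(np.int16).max, sample_rate)
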
  def test_create_from_file_succeeds_with_valid_model_path(self):
    # Creates with default option and valid model file successfully.
    with _AudioClassifier.create_from_model_path(
        self.yamnet_model_path) as classifier:
      self.assertIsInstance(classifier, _AudioClassifier)

  def test_create_from_options_succeeds_with_valid_model_path(self):
    # Creates with options containing model file successfully.
    with _AudioClassifier.create_from_options(
        _AudioClassifierOptions(
            base_options=_BaseOptions(
                model_asset_path=self.yamnet_model_path))) as classifier:
      self.assertIsInstance(classifier, _AudioClassifier)

  def test_create_from_options_fails_with_invalid_model_path(self):
    # Invalid empty model path.
    with self.assertRaisesRegex(
        ValueError,
        r"ExternalFile must specify at least one of 'file_content', "
        r"'file_name', 'file_pointer_meta' or 'file_descriptor_meta'."):
      base_options = _BaseOptions(model_asset_path='')
      options = _AudioClassifierOptions(base_options=base_options)
      _AudioClassifier.create_from_options(options)

  def test_create_from_options_succeeds_with_valid_model_content(self):
    # Creates with options containing model content successfully.
    with open(self.yamnet_model_path, 'rb') as f:
      base_options = _BaseOptions(model_asset_buffer=f.read())
      options = _AudioClassifierOptions(base_options=base_options)
      classifier = _AudioClassifier.create_from_options(options)
      self.assertIsInstance(classifier, _AudioClassifier)

  @parameterized.parameters((_SPEECH_WAV_16K_MONO), (_SPEECH_WAV_48K_MONO))
  def test_classify_with_yamnet_model(self, audio_file):
    with _AudioClassifier.create_from_model_path(
        self.yamnet_model_path) as classifier:
      classification_result_list = classifier.classify(
          self._read_wav_file(audio_file))
      self._check_yamnet_result(classification_result_list)

  def test_classify_with_yamnet_model_and_inputs_at_different_sample_rates(
      self):
    with _AudioClassifier.create_from_model_path(
        self.yamnet_model_path) as classifier:
      for audio_file in [_SPEECH_WAV_16K_MONO, _SPEECH_WAV_48K_MONO]:
        classification_result_list = classifier.classify(
            self._read_wav_file(audio_file))
        self._check_yamnet_result(classification_result_list)

  def test_max_result_options(self):
    with _AudioClassifier.create_from_options(
        _AudioClassifierOptions(
            base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
            classifier_options=_ClassifierOptions(
                max_results=1))) as classifier:
      for audio_file in [_SPEECH_WAV_16K_MONO, _SPEECH_WAV_48K_MONO]:
        classification_result_list = classifier.classify(
            self._read_wav_file(audio_file))
        self._check_yamnet_result(
            classification_result_list, expected_num_categories=1)

  def test_score_threshold_options(self):
    with _AudioClassifier.create_from_options(
        _AudioClassifierOptions(
            base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
            classifier_options=_ClassifierOptions(
                score_threshold=0.9))) as classifier:
      for audio_file in [_SPEECH_WAV_16K_MONO, _SPEECH_WAV_48K_MONO]:
        classification_result_list = classifier.classify(
            self._read_wav_file(audio_file))
        self._check_yamnet_result(
            classification_result_list, expected_num_categories=1)

  def test_allow_list_option(self):
    with _AudioClassifier.create_from_options(
        _AudioClassifierOptions(
            base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
            classifier_options=_ClassifierOptions(
                category_allowlist=['Speech']))) as classifier:
      for audio_file in [_SPEECH_WAV_16K_MONO, _SPEECH_WAV_48K_MONO]:
        classification_result_list = classifier.classify(
            self._read_wav_file(audio_file))
        self._check_yamnet_result(
            classification_result_list, expected_num_categories=1)

  def test_combined_allowlist_and_denylist(self):
    # Fails with combined allowlist and denylist.
    with self.assertRaisesRegex(
        ValueError,
        r'`category_allowlist` and `category_denylist` are mutually '
        r'exclusive options.'):
      options = _AudioClassifierOptions(
          base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
          classifier_options=_ClassifierOptions(
              category_allowlist=['foo'], category_denylist=['bar']))
      with _AudioClassifier.create_from_options(options) as unused_classifier:
        pass

  @parameterized.parameters((_TWO_HEADS_WAV_16K_MONO),
                            (_TWO_HEADS_WAV_44K_MONO))
  def test_classify_with_two_heads_model_and_inputs_at_different_sample_rates(
      self, audio_file):
    with _AudioClassifier.create_from_model_path(
        self.two_heads_model_path) as classifier:
      classification_result_list = classifier.classify(
          self._read_wav_file(audio_file))
      self._check_two_heads_result(classification_result_list)

  def test_classify_with_two_heads_model(self):
    with _AudioClassifier.create_from_model_path(
        self.two_heads_model_path) as classifier:
      for audio_file in [_TWO_HEADS_WAV_16K_MONO, _TWO_HEADS_WAV_44K_MONO]:
        classification_result_list = classifier.classify(
            self._read_wav_file(audio_file))
        self._check_two_heads_result(classification_result_list)

  def test_classify_with_two_heads_model_with_max_results(self):
    with _AudioClassifier.create_from_options(
        _AudioClassifierOptions(
            base_options=_BaseOptions(
                model_asset_path=self.two_heads_model_path),
            classifier_options=_ClassifierOptions(
                max_results=1))) as classifier:
      for audio_file in [_TWO_HEADS_WAV_16K_MONO, _TWO_HEADS_WAV_44K_MONO]:
        classification_result_list = classifier.classify(
            self._read_wav_file(audio_file))
        self._check_two_heads_result(classification_result_list, 1, 1)

  def test_missing_sample_rate_in_audio_clips_mode(self):
    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_CLIPS)
    with self.assertRaisesRegex(ValueError,
                                r'Must provide the audio sample rate'):
      with _AudioClassifier.create_from_options(options) as classifier:
        classifier.classify(_AudioData(buffer_length=100))

  def test_missing_sample_rate_in_audio_stream_mode(self):
    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_STREAM,
        result_callback=mock.MagicMock())
    with self.assertRaisesRegex(ValueError,
                                r'provide the audio sample rate in audio data'):
      with _AudioClassifier.create_from_options(options) as classifier:
        # Stream mode only accepts classify_async; plain classify would fail
        # earlier with a running-mode error before the sample-rate check.
        classifier.classify_async(_AudioData(buffer_length=100), 0)

  def test_missing_result_callback(self):
    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_STREAM)
    with self.assertRaisesRegex(ValueError,
                                r'result callback must be provided'):
      with _AudioClassifier.create_from_options(options) as unused_classifier:
        pass

  def test_illegal_result_callback(self):
    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_CLIPS,
        result_callback=mock.MagicMock())
    with self.assertRaisesRegex(ValueError,
                                r'result callback should not be provided'):
      with _AudioClassifier.create_from_options(options) as unused_classifier:
        pass

  def test_calling_classify_in_audio_stream_mode(self):
    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_STREAM,
        result_callback=mock.MagicMock())
    with _AudioClassifier.create_from_options(options) as classifier:
      with self.assertRaisesRegex(
          ValueError, r'not initialized with the audio clips mode'):
        classifier.classify(self._read_wav_file(_SPEECH_WAV_16K_MONO))

  def test_calling_classify_async_in_audio_clips_mode(self):
    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_CLIPS)
    with _AudioClassifier.create_from_options(options) as classifier:
      with self.assertRaisesRegex(
          ValueError, r'not initialized with the audio stream mode'):
        classifier.classify_async(self._read_wav_file(_SPEECH_WAV_16K_MONO), 0)

  def test_classify_async_calls_with_illegal_timestamp(self):
    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_STREAM,
        result_callback=mock.MagicMock())
    with _AudioClassifier.create_from_options(options) as classifier:
      classifier.classify_async(self._read_wav_file(_SPEECH_WAV_16K_MONO), 100)
      with self.assertRaisesRegex(
          ValueError, r'Input timestamp must be monotonically increasing'):
        classifier.classify_async(self._read_wav_file(_SPEECH_WAV_16K_MONO), 0)

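test_classify_async below additionally relies on a _read_wav_file_as_stream helper, also defined earlier in the file outside this hunk, which cuts a clip into timestamped chunks. A plausible sketch under the same import assumptions as the _read_wav_file sketch above; the two-chunk granularity is an assumption, only the (AudioData, timestamp_ms) pair shape is dictated by the test:

  # Sketch only -- the real helper lives earlier in this file. Splits the WAV
  # into roughly equal halves and pairs each chunk with its start time in ms.
  def _read_wav_file_as_stream(self, file_name):
    sample_rate, buffer = wavfile.read(
        test_utils.get_test_data_path(file_name))
    chunk_size = len(buffer) // 2  # assumes a non-empty clip
    audio_data_list = []
    for start in range(0, len(buffer), chunk_size):
      chunk = buffer[start:start + chunk_size].astype(float) / np.iinfo(
          np.int16).max
      timestamp_ms = int(start / sample_rate * 1000)
      audio_data_list.append(
          (_AudioData.create_from_array(chunk, sample_rate), timestamp_ms))
    return audio_data_list
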
  @parameterized.parameters((_SPEECH_WAV_16K_MONO), (_SPEECH_WAV_48K_MONO))
  def test_classify_async(self, audio_file):
    classification_result_list = []

    def save_result(result: _AudioClassifierResult, timestamp_ms: int):
      result.timestamp_ms = timestamp_ms
      classification_result_list.append(result)

    options = _AudioClassifierOptions(
        base_options=_BaseOptions(model_asset_path=self.yamnet_model_path),
        running_mode=_RUNNING_MODE.AUDIO_STREAM,
        classifier_options=_ClassifierOptions(max_results=1),
        result_callback=save_result)
    classifier = _AudioClassifier.create_from_options(options)
    audio_data_list = self._read_wav_file_as_stream(audio_file)
    for audio_data, timestamp_ms in audio_data_list:
      classifier.classify_async(audio_data, timestamp_ms)
    classifier.close()
    self._check_yamnet_result(
        classification_result_list, expected_num_categories=1)


if __name__ == '__main__':
  absltest.main()

@@ -11,3 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MediaPipe Tasks Text API."""

import mediapipe.tasks.python.text.text_classifier

TextClassifier = text_classifier.TextClassifier
TextClassifierOptions = text_classifier.TextClassifierOptions

# Remove unnecessary modules to avoid duplication in API docs.
del mediapipe
del text_classifier

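A note on the bare text_classifier name above: because this file is the __init__.py of the mediapipe.tasks.python.text package, importing the submodule also binds it as an attribute of the package, so text_classifier is usable (and deletable) as a module-level name here. For orientation, a minimal usage sketch of the re-exported API; the model path is a placeholder, and the create_from_model_path/classify/context-manager surface is assumed to mirror the audio classifier API exercised above:

from mediapipe.tasks.python import text

# Sketch only: 'text_classifier.tflite' is a placeholder path, and the method
# names are assumed to follow the common MediaPipe Tasks pattern.
with text.TextClassifier.create_from_model_path(
    'text_classifier.tflite') as classifier:
  result = classifier.classify('an inspiring and visually stunning film')
  top_category = result.classifications[0].categories[0]
  print(top_category.category_name, top_category.score)
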
mediapipe/tasks/web/audio/BUILD
Normal file
@@ -0,0 +1,11 @@
# This contains the MediaPipe Audio Tasks.

load("//mediapipe/framework/port:build_config.bzl", "mediapipe_ts_library")

mediapipe_ts_library(
    name = "audio_lib",
    srcs = ["index.ts"],
    deps = [
        "//mediapipe/tasks/web/audio/audio_classifier",
    ],
)

mediapipe/tasks/web/audio/index.ts
Normal file
@@ -0,0 +1,20 @@
/**
 * Copyright 2022 The MediaPipe Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Audio Classifier
export * from '../../../tasks/web/audio/audio_classifier/audio_classifier_options';
export * from '../../../tasks/web/audio/audio_classifier/audio_classifier_result';
export * from '../../../tasks/web/audio/audio_classifier/audio_classifier';

mediapipe/tasks/web/text/BUILD
Normal file
@@ -0,0 +1,11 @@
# This contains the MediaPipe Text Tasks.

load("//mediapipe/framework/port:build_config.bzl", "mediapipe_ts_library")

mediapipe_ts_library(
    name = "text_lib",
    srcs = ["index.ts"],
    deps = [
        "//mediapipe/tasks/web/text/text_classifier",
    ],
)

mediapipe/tasks/web/text/index.ts
Normal file
@@ -0,0 +1,20 @@
/**
 * Copyright 2022 The MediaPipe Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Text Classifier
export * from '../../../tasks/web/text/text_classifier/text_classifier_options';
export * from '../../../tasks/web/text/text_classifier/text_classifier_result';
export * from '../../../tasks/web/text/text_classifier/text_classifier';

mediapipe/tasks/web/vision/BUILD
Normal file
@@ -0,0 +1,13 @@
# This contains the MediaPipe Vision Tasks.

load("//mediapipe/framework/port:build_config.bzl", "mediapipe_ts_library")

mediapipe_ts_library(
    name = "vision_lib",
    srcs = ["index.ts"],
    deps = [
        "//mediapipe/tasks/web/vision/gesture_recognizer",
        "//mediapipe/tasks/web/vision/image_classifier",
        "//mediapipe/tasks/web/vision/object_detector",
    ],
)

mediapipe/tasks/web/vision/index.ts
Normal file
@@ -0,0 +1,30 @@
/**
 * Copyright 2022 The MediaPipe Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Image Classifier
export * from '../../../tasks/web/vision/image_classifier/image_classifier_options';
export * from '../../../tasks/web/vision/image_classifier/image_classifier_result';
export * from '../../../tasks/web/vision/image_classifier/image_classifier';

// Gesture Recognizer
export * from '../../../tasks/web/vision/gesture_recognizer/gesture_recognizer_options';
export * from '../../../tasks/web/vision/gesture_recognizer/gesture_recognizer_result';
export * from '../../../tasks/web/vision/gesture_recognizer/gesture_recognizer';

// Object Detector
export * from '../../../tasks/web/vision/object_detector/object_detector_options';
export * from '../../../tasks/web/vision/object_detector/object_detector_result';
export * from '../../../tasks/web/vision/object_detector/object_detector';