Resolved issues and added a common header to hold all the necessary structures for the vision tasks

This commit is contained in:
Kinar 2023-11-07 14:23:15 -08:00
parent 197358dfee
commit c442d6117e
13 changed files with 185 additions and 108 deletions

View File

@ -66,7 +66,7 @@ void CppConvertToEmbeddingResult(
}
}
void ConvertToCppEmbedding(
void CppConvertToCppEmbedding(
const Embedding& in, // C struct as input
mediapipe::tasks::components::containers::Embedding* out) {
// Handle float embeddings
@ -85,7 +85,7 @@ void ConvertToCppEmbedding(
// Copy head_name if it is present.
if (in.head_name) {
out->head_name = std::make_optional(std::string(in.head_name));
out->head_name = std::string(in.head_name);
}
}
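For orientation, a minimal sketch of driving the renamed converter from C++. The C `Embedding` field names (`float_embedding`, `values_count`, `head_name`), the include paths, and the `ConvertExample` wrapper are assumptions inferred from this hunk rather than quoted from the headers.

// Hedged sketch: field names on the C `Embedding` struct and the converter
// header paths are assumptions; check the embedding_result headers.
#include <string>

#include "mediapipe/tasks/c/components/containers/embedding_result.h"            // assumed path
#include "mediapipe/tasks/c/components/containers/embedding_result_converter.h"  // assumed path
#include "mediapipe/tasks/cc/components/containers/embedding_result.h"

void ConvertExample() {
  float values[3] = {0.1f, 0.2f, 0.3f};
  char head_name[] = "feature";

  Embedding c_embedding = {};              // C struct from the C API
  c_embedding.float_embedding = values;    // assumed field name
  c_embedding.values_count = 3;            // assumed field name
  c_embedding.head_name = head_name;       // null-checked by the converter

  mediapipe::tasks::components::containers::Embedding cpp_embedding;
  mediapipe::tasks::c::components::containers::CppConvertToCppEmbedding(
      c_embedding, &cpp_embedding);
  // On the C++ side, head_name is std::optional<std::string>; the plain
  // std::string assignment in this hunk relies on optional's implicit
  // converting assignment, so dropping std::make_optional is behavior-neutral.
}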

View File

@ -29,7 +29,7 @@ void CppConvertToEmbeddingResult(
const mediapipe::tasks::components::containers::EmbeddingResult& in,
EmbeddingResult* out);
void ConvertToCppEmbedding(
void CppConvertToCppEmbedding(
const Embedding& in,
mediapipe::tasks::components::containers::Embedding* out);

View File

@ -29,9 +29,8 @@ namespace mediapipe::tasks::c::text::text_embedder {
namespace {
using ::mediapipe::tasks::c::components::containers::ConvertToCppEmbedding;
using ::mediapipe::tasks::c::components::containers::CppCloseEmbeddingResult;
using ::mediapipe::tasks::c::components::containers::CppConvertToCppEmbedding;
using ::mediapipe::tasks::c::components::containers::
CppConvertToEmbeddingResult;
using ::mediapipe::tasks::c::components::processors::
@ -97,9 +96,9 @@ int CppTextEmbedderClose(void* embedder, char** error_msg) {
int CppTextEmbedderCosineSimilarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg) {
CppEmbedding cpp_u;
ConvertToCppEmbedding(u, &cpp_u);
CppConvertToCppEmbedding(u, &cpp_u);
CppEmbedding cpp_v;
ConvertToCppEmbedding(v, &cpp_v);
CppConvertToCppEmbedding(v, &cpp_v);
auto status_or_similarity =
mediapipe::tasks::text::text_embedder::TextEmbedder::CosineSimilarity(
cpp_u, cpp_v);
@ -137,8 +136,8 @@ int text_embedder_close(void* embedder, char** error_ms) {
embedder, error_ms);
}
int cosine_similarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg) {
int text_embedder_cosine_similarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg) {
return mediapipe::tasks::c::text::text_embedder::
CppTextEmbedderCosineSimilarity(u, v, similarity, error_msg);
}

View File

@ -72,8 +72,10 @@ MP_EXPORT int text_embedder_close(void* embedder, char** error_msg);
// 0.
//
// [1]: https://en.wikipedia.org/wiki/Cosine_similarity
MP_EXPORT int cosine_similarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg = nullptr);
MP_EXPORT int text_embedder_cosine_similarity(const Embedding& u,
const Embedding& v,
double* similarity,
char** error_msg);
#ifdef __cplusplus
} // extern C
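For context, a hedged usage sketch of the renamed entry point. The `CompareTexts` wrapper, the input strings, and the header path are illustrative; the embedder handle is assumed to come from the text embedder creation API, which is not part of this diff.

#include <cstdio>

#include "mediapipe/tasks/c/text/text_embedder/text_embedder.h"  // assumed path

// `embedder` is assumed to be a handle obtained from the text embedder
// creation API; error handling is omitted for brevity.
void CompareTexts(void* embedder) {
  TextEmbedderResult first;
  TextEmbedderResult second;
  text_embedder_embed(embedder, "a cat sat on the mat", &first,
                      /* error_msg */ nullptr);
  text_embedder_embed(embedder, "a feline rested on the rug", &second,
                      /* error_msg */ nullptr);

  double similarity = 0.0;
  // The default argument on error_msg is gone, so it must be passed
  // explicitly; nullptr is still accepted.
  text_embedder_cosine_similarity(first.embeddings[0], second.embeddings[0],
                                  &similarity, /* error_msg */ nullptr);
  std::printf("cosine similarity: %f\n", similarity);
}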

View File

@ -33,10 +33,10 @@ constexpr char kTestDataDirectory[] = "/mediapipe/tasks/testdata/text/";
constexpr char kTestBertModelPath[] =
"mobilebert_embedding_with_metadata.tflite";
constexpr char kTestString0[] =
"When you go to this restaurant, they hold the pancake upside-down "
"before they hand it to you. It's a great gimmick.";
"When you go to this restaurant, they hold the pancake upside-down "
"before they hand it to you. It's a great gimmick.";
constexpr char kTestString1[] =
"Let's make a plan to steal the declaration of independence.";
"Let's make a plan to steal the declaration of independence.";
constexpr float kPrecision = 1e-3;
std::string GetFullPath(absl::string_view file_name) {
@ -81,14 +81,16 @@ TEST(TextEmbedderTest, SucceedsWithCosineSimilarity) {
// Extract both embeddings.
TextEmbedderResult result0;
text_embedder_embed(embedder, kTestString0, &result0, /* error_msg */ nullptr);
text_embedder_embed(embedder, kTestString0, &result0,
/* error_msg */ nullptr);
TextEmbedderResult result1;
text_embedder_embed(embedder, kTestString1, &result1, /* error_msg */ nullptr);
text_embedder_embed(embedder, kTestString1, &result1,
/* error_msg */ nullptr);
// Check cosine similarity.
double similarity;
cosine_similarity(result0.embeddings[0], result1.embeddings[0],
&similarity);
text_embedder_cosine_similarity(result0.embeddings[0], result1.embeddings[0],
&similarity, nullptr);
double expected_similarity = 0.98077;
EXPECT_LE(abs(similarity - expected_similarity), kPrecision);
text_embedder_close(embedder, /* error_msg */ nullptr);

View File

@ -0,0 +1,22 @@
# Copyright 2023 The MediaPipe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
package(default_visibility = ["//mediapipe/tasks:internal"])
licenses(["notice"])
cc_library(
name = "common",
hdrs = ["common.h"],
)

View File

@ -0,0 +1,69 @@
/* Copyright 2023 The MediaPipe Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef MEDIAPIPE_TASKS_C_VISION_CORE_COMMON_H_
#define MEDIAPIPE_TASKS_C_VISION_CORE_COMMON_H_
#include <cstdint>
#ifdef __cplusplus
extern "C" {
#endif
// Supported image formats.
enum ImageFormat {
UNKNOWN = 0,
SRGB = 1,
SRGBA = 2,
GRAY8 = 3,
SBGRA = 11 // compatible with Flutter `bgra8888` format.
};
// Supported processing modes.
enum RunningMode {
IMAGE = 1,
VIDEO = 2,
LIVE_STREAM = 3,
};
// Structure to hold an image frame.
struct ImageFrame {
enum ImageFormat format;
const uint8_t* image_buffer;
int width;
int height;
};
// TODO: Add GPU buffer declaration and processing logic for it.
struct GpuBuffer {
int width;
int height;
};
// The container for an image; realizes the `OneOf` concept.
struct MpImage {
enum { IMAGE_FRAME, GPU_BUFFER } type;
union {
struct ImageFrame image_frame;
struct GpuBuffer gpu_buffer;
};
};
#ifdef __cplusplus
} // extern C
#endif
#endif // MEDIAPIPE_TASKS_C_VISION_CORE_COMMON_H_
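To show the shared structs in use, a minimal sketch of wrapping a caller-owned SRGB buffer in an `MpImage`. The `MakeSrgbImage` helper and the pixel buffer are illustrative; the header path follows the include guard and the BUILD target above. From C++ the anonymous enum constants are reachable as `MpImage::IMAGE_FRAME`, while plain `IMAGE_FRAME` is the C spelling.

#include <cstdint>
#include <vector>

#include "mediapipe/tasks/c/vision/core/common.h"

// Wraps caller-owned SRGB pixel data; MpImage only stores a pointer, so the
// vector must outlive any use of the returned struct.
MpImage MakeSrgbImage(const std::vector<uint8_t>& pixels, int width,
                      int height) {
  MpImage image;
  image.type = MpImage::IMAGE_FRAME;  // plain IMAGE_FRAME when compiled as C
  image.image_frame = {/* format= */ SRGB,
                       /* image_buffer= */ pixels.data(),
                       /* width= */ width,
                       /* height= */ height};
  return image;
}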

View File

@ -30,6 +30,7 @@ cc_library(
"//mediapipe/tasks/c/components/processors:classifier_options_converter",
"//mediapipe/tasks/c/core:base_options",
"//mediapipe/tasks/c/core:base_options_converter",
"//mediapipe/tasks/c/vision/core:common",
"//mediapipe/tasks/cc/vision/core:running_mode",
"//mediapipe/tasks/cc/vision/image_classifier",
"//mediapipe/tasks/cc/vision/utils:image_utils",

View File

@ -16,11 +16,10 @@ limitations under the License.
#ifndef MEDIAPIPE_TASKS_C_VISION_IMAGE_CLASSIFIER_IMAGE_CLASSIFIER_H_
#define MEDIAPIPE_TASKS_C_VISION_IMAGE_CLASSIFIER_IMAGE_CLASSIFIER_H_
#include <cstdint>
#include "mediapipe/tasks/c/components/containers/classification_result.h"
#include "mediapipe/tasks/c/components/processors/classifier_options.h"
#include "mediapipe/tasks/c/core/base_options.h"
#include "mediapipe/tasks/c/vision/core/common.h"
#ifndef MP_EXPORT
#define MP_EXPORT __attribute__((visibility("default")))
@ -32,46 +31,7 @@ extern "C" {
typedef ClassificationResult ImageClassifierResult;
// Supported image formats.
enum ImageFormat {
UNKNOWN = 0,
SRGB = 1,
SRGBA = 2,
GRAY8 = 3,
SBGRA = 11 // compatible with Flutter `bgra8888` format.
};
// Supported processing modes.
enum RunningMode {
IMAGE = 1,
VIDEO = 2,
LIVE_STREAM = 3,
};
// Structure to hold image frame.
struct ImageFrame {
enum ImageFormat format;
const uint8_t* image_buffer;
int width;
int height;
};
// TODO: Add GPU buffer declaration and proccessing logic for it.
struct GpuBuffer {
int width;
int height;
};
// The object to contain an image, realizes `OneOf` concept.
struct MpImage {
enum { IMAGE_FRAME, GPU_BUFFER } type;
union {
struct ImageFrame image_frame;
struct GpuBuffer gpu_buffer;
};
};
// The options for configuring a Mediapipe image classifier task.
// The options for configuring a MediaPipe image classifier task.
struct ImageClassifierOptions {
// Base options for configuring MediaPipe Tasks, such as specifying the model
// file with metadata, accelerator options, op resolver, etc.
@ -122,12 +82,39 @@ MP_EXPORT int image_classifier_classify_image(void* classifier,
ImageClassifierResult* result,
char** error_msg);
// Performs image classification on the provided video frame.
// Only use this method when the ImageClassifier is created with the video
// running mode.
// The image can be of any size with format RGB or RGBA. It's required to
// provide the video frame's timestamp (in milliseconds). The input timestamps
// must be monotonically increasing.
// If an error occurs, returns an error code and sets the error parameter to
// an error message (if `error_msg` is not `nullptr`). You must free the memory
// allocated for the error message.
MP_EXPORT int image_classifier_classify_for_video(void* classifier,
const MpImage* image,
int64_t timestamp_ms,
ImageClassifierResult* result,
char** error_msg);
// Sends live image data to image classification, and the results will be
// available via the "result_callback" provided in the ImageClassifierOptions.
// Only use this method when the ImageClassifier is created with the live
// stream running mode.
// The image can be of any size with format RGB or RGBA. It's required to
// provide a timestamp (in milliseconds) to indicate when the input image is
// sent to the image classifier. The input timestamps must be monotonically
// increasing.
// The "result_callback" provides:
// - The classification results as an ImageClassifierResult object.
// - The const reference to the corresponding input image that the image
// classifier runs on. Note that the const reference to the image will no
// longer be valid when the callback returns. To access the image data
// outside of the callback, callers need to make a copy of the image.
// - The input timestamp in milliseconds.
// If an error occurs, returns an error code and sets the error parameter to
// an error message (if `error_msg` is not `nullptr`). You must free the memory
// allocated for the error message.
MP_EXPORT int image_classifier_classify_async(void* classifier,
const MpImage* image,
int64_t timestamp_ms,
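Putting the video-mode documentation above into a hedged sketch: the `ClassifyVideoFrame` wrapper and the frame buffer are illustrative, the classifier handle is assumed to have been created with the video running mode, and a zero return is assumed to indicate success.

#include <cstdint>

#include "mediapipe/tasks/c/vision/core/common.h"
#include "mediapipe/tasks/c/vision/image_classifier/image_classifier.h"

// Classifies one decoded SRGB frame; timestamp_ms must be monotonically
// increasing across successive calls, as required above.
int ClassifyVideoFrame(void* classifier, const uint8_t* rgb_pixels, int width,
                       int height, int64_t timestamp_ms,
                       ImageClassifierResult* result) {
  MpImage frame;
  frame.type = MpImage::IMAGE_FRAME;
  frame.image_frame = {/* format= */ SRGB,
                       /* image_buffer= */ rgb_pixels,
                       /* width= */ width,
                       /* height= */ height};
  return image_classifier_classify_for_video(classifier, &frame, timestamp_ms,
                                              result, /* error_msg */ nullptr);
}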

View File

@ -30,6 +30,8 @@ cc_library(
"//mediapipe/tasks/c/components/processors:embedder_options_converter",
"//mediapipe/tasks/c/core:base_options",
"//mediapipe/tasks/c/core:base_options_converter",
"//mediapipe/tasks/c/vision/core:common",
"//mediapipe/tasks/cc/vision/core:running_mode",
"//mediapipe/tasks/cc/vision/image_embedder",
"//mediapipe/tasks/cc/vision/utils:image_utils",
"@com_google_absl//absl/log:absl_log",

View File

@ -36,8 +36,8 @@ namespace mediapipe::tasks::c::vision::image_embedder {
namespace {
using ::mediapipe::tasks::c::components::containers::ConvertToCppEmbedding;
using ::mediapipe::tasks::c::components::containers::CppCloseEmbeddingResult;
using ::mediapipe::tasks::c::components::containers::CppConvertToCppEmbedding;
using ::mediapipe::tasks::c::components::containers::
CppConvertToEmbeddingResult;
using ::mediapipe::tasks::c::components::processors::
@ -235,9 +235,9 @@ int CppImageEmbedderClose(void* embedder, char** error_msg) {
int CppImageEmbedderCosineSimilarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg) {
CppEmbedding cpp_u;
ConvertToCppEmbedding(u, &cpp_u);
CppConvertToCppEmbedding(u, &cpp_u);
CppEmbedding cpp_v;
ConvertToCppEmbedding(v, &cpp_v);
CppConvertToCppEmbedding(v, &cpp_v);
auto status_or_similarity =
mediapipe::tasks::vision::image_embedder::ImageEmbedder::CosineSimilarity(
cpp_u, cpp_v);
@ -291,8 +291,8 @@ int image_embedder_close(void* embedder, char** error_msg) {
embedder, error_msg);
}
int cosine_similarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg) {
int image_embedder_cosine_similarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg) {
return mediapipe::tasks::c::vision::image_embedder::
CppImageEmbedderCosineSimilarity(u, v, similarity, error_msg);
}

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "mediapipe/tasks/c/components/containers/embedding_result.h"
#include "mediapipe/tasks/c/components/processors/embedder_options.h"
#include "mediapipe/tasks/c/core/base_options.h"
#include "mediapipe/tasks/c/vision/core/common.h"
#ifndef MP_EXPORT
#define MP_EXPORT __attribute__((visibility("default")))
@ -32,45 +33,6 @@ extern "C" {
typedef EmbeddingResult ImageEmbedderResult;
// Supported image formats.
enum ImageFormat {
UNKNOWN = 0,
SRGB = 1,
SRGBA = 2,
GRAY8 = 3,
SBGRA = 11 // compatible with Flutter `bgra8888` format.
};
// Supported processing modes.
enum RunningMode {
IMAGE = 1,
VIDEO = 2,
LIVE_STREAM = 3,
};
// Structure to hold image frame.
struct ImageFrame {
enum ImageFormat format;
const uint8_t* image_buffer;
int width;
int height;
};
// TODO: Add GPU buffer declaration and proccessing logic for it.
struct GpuBuffer {
int width;
int height;
};
// The object to contain an image, realizes `OneOf` concept.
struct MpImage {
enum { IMAGE_FRAME, GPU_BUFFER } type;
union {
struct ImageFrame image_frame;
struct GpuBuffer gpu_buffer;
};
};
// The options for configuring a MediaPipe image embedder task.
struct ImageEmbedderOptions {
// Base options for configuring MediaPipe Tasks, such as specifying the model
@ -121,12 +83,40 @@ MP_EXPORT int image_embedder_embed_image(void* embedder, const MpImage* image,
ImageEmbedderResult* result,
char** error_msg);
// Performs embedding extraction on the provided video frame.
// Only use this method when the ImageEmbedder is created with the video
// running mode.
// The image can be of any size with format RGB or RGBA. It's required to
// provide the video frame's timestamp (in milliseconds). The input timestamps
// must be monotonically increasing.
// If an error occurs, returns an error code and sets the error parameter to
// an error message (if `error_msg` is not `nullptr`). You must free the memory
// allocated for the error message.
MP_EXPORT int image_embedder_embed_for_video(void* embedder,
const MpImage* image,
int64_t timestamp_ms,
ImageEmbedderResult* result,
char** error_msg);
// Sends live image data to the embedder, and the results will be available via
// the "result_callback" provided in the ImageEmbedderOptions.
// Only use this method when the ImageEmbedder is created with the live
// stream running mode.
// The image can be of any size with format RGB or RGBA. It's required to
// provide a timestamp (in milliseconds) to indicate when the input image is
// sent to the image embedder. The input timestamps must be monotonically
// increasing.
// The "result_callback" provides
// - The embedding results as a
// components::containers::proto::EmbeddingResult object.
// - The const reference to the corresponding input image that the image
// embedder runs on. Note that the const reference to the image will no
// longer be valid when the callback returns. To access the image data
// outside of the callback, callers need to make a copy of the image.
// - The input timestamp in milliseconds.
// If an error occurs, returns an error code and sets the error parameter to
// an error message (if `error_msg` is not `nullptr`). You must free the memory
// allocated for the error message.
MP_EXPORT int image_embedder_embed_async(void* embedder, const MpImage* image,
int64_t timestamp_ms,
char** error_msg);
@ -147,8 +137,10 @@ MP_EXPORT int image_embedder_close(void* embedder, char** error_msg);
// 0.
//
// [1]: https://en.wikipedia.org/wiki/Cosine_similarity
MP_EXPORT int cosine_similarity(const Embedding& u, const Embedding& v,
double* similarity, char** error_msg);
MP_EXPORT int image_embedder_cosine_similarity(const Embedding& u,
const Embedding& v,
double* similarity,
char** error_msg);
#ifdef __cplusplus
} // extern C
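For completeness, a hedged sketch of the renamed similarity call with explicit error handling. The `PrintSimilarity` wrapper and the header path are illustrative, a zero return is assumed to indicate success, and the error string is assumed to be releasable with free(), per the note above about freeing the message; the two embeddings are assumed to come from the embed calls in this header.

#include <cstdio>
#include <cstdlib>

#include "mediapipe/tasks/c/vision/image_embedder/image_embedder.h"  // assumed path

// `u` and `v` are assumed to be embeddings produced by image_embedder_embed_*;
// prints the similarity or the reported error.
void PrintSimilarity(const Embedding& u, const Embedding& v) {
  double similarity = 0.0;
  char* error = nullptr;
  if (image_embedder_cosine_similarity(u, v, &similarity, &error) == 0) {
    std::printf("cosine similarity: %f\n", similarity);
  } else if (error != nullptr) {
    std::fprintf(stderr, "cosine similarity failed: %s\n", error);
    std::free(error);  // assumed malloc-compatible allocation
  }
}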

View File

@ -143,8 +143,9 @@ TEST(ImageEmbedderTest, SucceedsWithCosineSimilarity) {
CheckMobileNetV3Result(crop_result, false);
// Check cosine similarity.
double similarity;
cosine_similarity(image_result.embeddings[0], crop_result.embeddings[0],
&similarity, /* error_msg */ nullptr);
image_embedder_cosine_similarity(image_result.embeddings[0],
crop_result.embeddings[0], &similarity,
/* error_msg */ nullptr);
double expected_similarity = 0.925519;
EXPECT_LE(abs(similarity - expected_similarity), kPrecision);
image_embedder_close_result(&image_result);