From a44c8109217ac7672abcdc4cb3ae7ddb93145e4b Mon Sep 17 00:00:00 2001
From: MediaPipe Team <mediapipe-team@google.com>
Date: Fri, 18 Aug 2023 15:44:04 -0700
Subject: [PATCH] Update PackMediaSequenceCalculator to support adding
 clip/media/id to the MediaSequence.

Because the media ID is usually a video ID that is provided to the graph as a side packet, the calculator expects it as an input side packet instead of an input stream.

PiperOrigin-RevId: 558266967
---
 mediapipe/calculators/tensorflow/BUILD        |  1 +
 .../pack_media_sequence_calculator.cc         | 40 +++++---
 .../pack_media_sequence_calculator_test.cc    | 92 ++++++++++++++++++-
 3 files changed, 120 insertions(+), 13 deletions(-)
diff --git a/mediapipe/calculators/tensorflow/BUILD b/mediapipe/calculators/tensorflow/BUILD
index 374478457..78da0934c 100644
--- a/mediapipe/calculators/tensorflow/BUILD
+++ b/mediapipe/calculators/tensorflow/BUILD
@@ -929,6 +929,7 @@ cc_test(
         "//mediapipe/calculators/image:opencv_image_encoder_calculator_cc_proto",
         "//mediapipe/framework:calculator_framework",
         "//mediapipe/framework:calculator_runner",
+        "//mediapipe/framework:packet",
         "//mediapipe/framework:timestamp",
         "//mediapipe/framework/formats:classification_cc_proto",
         "//mediapipe/framework/formats:detection_cc_proto",
diff --git a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc
index 75878b74a..9185e22a5 100644
--- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc
+++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -45,6 +46,7 @@ const char kForwardFlowEncodedTag[] = "FORWARD_FLOW_ENCODED";
 const char kBBoxTag[] = "BBOX";
 const char kKeypointsTag[] = "KEYPOINTS";
 const char kSegmentationMaskTag[] = "CLASS_SEGMENTATION";
+const char kClipMediaIdTag[] = "CLIP_MEDIA_ID";
 
 namespace tf = ::tensorflow;
 namespace mpms = mediapipe::mediasequence;
@@ -56,17 +58,21 @@ namespace mpms = mediapipe::mediasequence;
 // context features can be supplied verbatim in the calculator's options. The
 // SequenceExample will conform to the description in media_sequence.h.
 //
-// The supported input stream tags are "IMAGE", which stores the encoded
-// images from the OpenCVImageEncoderCalculator, "IMAGE_LABEL", which stores
-// image labels from vector<Classification>, "FORWARD_FLOW_ENCODED", which
-// stores the encoded optical flow from the same calculator, "BBOX" which stores
-// bounding boxes from vector<Detections>, and streams with the
-// "FLOAT_FEATURE_${NAME}" pattern, which stores the values from vector<float>'s
-// associated with the name ${NAME}. "KEYPOINTS" stores a map of 2D keypoints
-// from flat_hash_map<string, vector<pair<float, float>>>. "IMAGE_${NAME}",
-// "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store prefixed versions of
-// each stream, which allows for multiple image streams to be included. However,
-// the default names are suppored by more tools.
+// The supported input stream tags are:
+// * "IMAGE", which stores the encoded images from the
+//   OpenCVImageEncoderCalculator,
+// * "IMAGE_LABEL", which stores image labels from vector<Classification>,
+// * "FORWARD_FLOW_ENCODED", which stores the encoded optical flow from the same
+//   calculator,
+// * "BBOX" which stores bounding boxes from vector<Detections>,
+// * streams with the "FLOAT_FEATURE_${NAME}" pattern, which stores the values
+//   from vector<float>'s associated with the name ${NAME},
+// * "KEYPOINTS" stores a map of 2D keypoints from flat_hash_map<string,
+//   vector<pair<float, float>>>,
+// * "CLIP_MEDIA_ID", an input side packet holding the clip's media ID string.
+// "IMAGE_${NAME}", "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store
+// prefixed versions of each stream, which allows for multiple image streams to
+// be included. However, the default names are supported by more tools.
 //
 // Example config:
 // node {
@@ -102,6 +108,9 @@ class PackMediaSequenceCalculator : public CalculatorBase {
   static absl::Status GetContract(CalculatorContract* cc) {
     RET_CHECK(cc->InputSidePackets().HasTag(kSequenceExampleTag));
     cc->InputSidePackets().Tag(kSequenceExampleTag).Set<tf::SequenceExample>();
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag)) {
+      cc->InputSidePackets().Tag(kClipMediaIdTag).Set<std::string>();
+    }
 
     if (cc->Inputs().HasTag(kForwardFlowEncodedTag)) {
       cc->Inputs()
@@ -190,6 +199,11 @@ class PackMediaSequenceCalculator : public CalculatorBase {
         cc->InputSidePackets()
             .Tag(kSequenceExampleTag)
             .Get<tf::SequenceExample>());
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag) &&
+        !cc->InputSidePackets().Tag(kClipMediaIdTag).IsEmpty()) {
+      clip_media_id_ =
+          cc->InputSidePackets().Tag(kClipMediaIdTag).Get<std::string>();
+    }
 
     const auto& context_features =
         cc->Options<PackMediaSequenceCalculatorOptions>().context_feature_map();
@@ -592,10 +606,14 @@ class PackMediaSequenceCalculator : public CalculatorBase {
         }
       }
     }
+    if (clip_media_id_.has_value()) {
+      mpms::SetClipMediaId(*clip_media_id_, sequence_.get());
+    }
     return absl::OkStatus();
   }
 
   std::unique_ptr<tf::SequenceExample> sequence_;
+  std::optional<std::string> clip_media_id_ = std::nullopt;
   std::map<std::string, bool> features_present_;
   bool replace_keypoints_;
 };
diff --git a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc
index 5c0ad8ac5..fa3e0bdea 100644
--- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc
+++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc
@@ -25,6 +25,7 @@
 #include "mediapipe/framework/formats/detection.pb.h"
 #include "mediapipe/framework/formats/location.h"
 #include "mediapipe/framework/formats/location_opencv.h"
+#include "mediapipe/framework/packet.h"
 #include "mediapipe/framework/port/opencv_imgcodecs_inc.h"
 #include "mediapipe/framework/port/status_matchers.h"
 #include "mediapipe/framework/timestamp.h"
@@ -63,6 +64,7 @@ constexpr char kImageLabelOtherTag[] = "IMAGE_LABEL_OTHER";
 constexpr char kImagePrefixTag[] = "IMAGE_PREFIX";
 constexpr char kSequenceExampleTag[] = "SEQUENCE_EXAMPLE";
 constexpr char kImageTag[] = "IMAGE";
+constexpr char kClipMediaIdTag[] = "CLIP_MEDIA_ID";
 
 class PackMediaSequenceCalculatorTest : public ::testing::Test {
  protected:
@@ -70,10 +72,14 @@ class PackMediaSequenceCalculatorTest : public ::testing::Test {
                        const tf::Features& features,
                        const bool output_only_if_all_present,
                        const bool replace_instead_of_append,
-                       const bool output_as_zero_timestamp = false) {
+                       const bool output_as_zero_timestamp = false,
+                       const std::vector<std::string>& input_side_packets = {
+                           "SEQUENCE_EXAMPLE:input_sequence"}) {
     CalculatorGraphConfig::Node config;
     config.set_calculator("PackMediaSequenceCalculator");
-    config.add_input_side_packet("SEQUENCE_EXAMPLE:input_sequence");
+    for (const std::string& side_packet : input_side_packets) {
+      config.add_input_side_packet(side_packet);
+    }
     config.add_output_stream("SEQUENCE_EXAMPLE:output_sequence");
     for (const std::string& stream : input_streams) {
       config.add_input_stream(stream);
@@ -833,6 +839,88 @@ TEST_F(PackMediaSequenceCalculatorTest, PacksTwoMaskDetections) {
               testing::ElementsAreArray(::std::vector<std::string>({"mask"})));
 }
 
+TEST_F(PackMediaSequenceCalculatorTest, AddClipMediaId) {  // CLIP_MEDIA_ID side packet value must appear in the output sequence.
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});  // declare both side packets on the node
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();  // no media ID is set on it in this test
+  const std::string test_video_id = "test_video_id";
+
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {  // feed one packet per timestep to each float-feature stream
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);  // two floats, each equal to 2 << i
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id);  // the media ID under test
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+
+  MP_ASSERT_OK(runner_->Run());
+
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());  // exactly one output packet is expected
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));  // output carries the side-packet ID
+}
+
+TEST_F(PackMediaSequenceCalculatorTest, ReplaceClipMediaId) {  // Side-packet ID must win over an ID already on the input sequence.
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});  // declare both side packets on the node
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();
+  const std::string existing_video_id = "existing_video_id";
+  mpms::SetClipMediaId(existing_video_id, input_sequence.get());  // pre-set a different ID on the input
+  const std::string test_video_id = "test_video_id";
+
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {  // feed one packet per timestep to each float-feature stream
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);  // two floats, each equal to 2 << i
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id).At(Timestamp(0));  // unlike AddClipMediaId, this packet is stamped at Timestamp(0)
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+
+  MP_ASSERT_OK(runner_->Run());
+
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());  // exactly one output packet is expected
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));  // side-packet ID, not existing_video_id
+}
+
 TEST_F(PackMediaSequenceCalculatorTest, MissingStreamOK) {
   SetUpCalculator(
       {"FORWARD_FLOW_ENCODED:flow", "FLOAT_FEATURE_I3D_FLOW:feature"}, {},