Update PackMediaSequenceCalculator to support adding clip/media/id to the MediaSequence.

As the media ID is usually a video ID that is provided to the graph as a side packet, the calculator expects it to be provided as an input side packet instead of an input stream. PiperOrigin-RevId: 558266967
2023-08-18 15:44:04 -07:00 · 2023-08-18 15:44:04 -07:00 · a44c810921
commit a44c810921
parent fda0d19337
3 changed files with 120 additions and 13 deletions
--- a/mediapipe/calculators/tensorflow/BUILD
+++ b/mediapipe/calculators/tensorflow/BUILD
@ -929,6 +929,7 @@ cc_test(
        "//mediapipe/calculators/image:opencv_image_encoder_calculator_cc_proto",
        "//mediapipe/framework:calculator_framework",
        "//mediapipe/framework:calculator_runner",
+        "//mediapipe/framework:packet",
        "//mediapipe/framework:timestamp",
        "//mediapipe/framework/formats:classification_cc_proto",
        "//mediapipe/framework/formats:detection_cc_proto",
--- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc
+++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc
@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <optional>
 #include <string>
 #include <vector>

@ -45,6 +46,7 @@ const char kForwardFlowEncodedTag[] = "FORWARD_FLOW_ENCODED";
 const char kBBoxTag[] = "BBOX";
 const char kKeypointsTag[] = "KEYPOINTS";
 const char kSegmentationMaskTag[] = "CLASS_SEGMENTATION";
+const char kClipMediaIdTag[] = "CLIP_MEDIA_ID";

 namespace tf = ::tensorflow;
 namespace mpms = mediapipe::mediasequence;
@ -56,17 +58,21 @@ namespace mpms = mediapipe::mediasequence;
 // context features can be supplied verbatim in the calculator's options. The
 // SequenceExample will conform to the description in media_sequence.h.
 //
-// The supported input stream tags are "IMAGE", which stores the encoded
-// images from the OpenCVImageEncoderCalculator, "IMAGE_LABEL", which stores
-// image labels from vector<Classification>, "FORWARD_FLOW_ENCODED", which
-// stores the encoded optical flow from the same calculator, "BBOX" which stores
-// bounding boxes from vector<Detections>, and streams with the
-// "FLOAT_FEATURE_${NAME}" pattern, which stores the values from vector<float>'s
-// associated with the name ${NAME}. "KEYPOINTS" stores a map of 2D keypoints
-// from flat_hash_map<string, vector<pair<float, float>>>. "IMAGE_${NAME}",
-// "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store prefixed versions of
-// each stream, which allows for multiple image streams to be included. However,
-// the default names are suppored by more tools.
+// The supported input stream tags are:
+// * "IMAGE", which stores the encoded images from the
+//   OpenCVImageEncoderCalculator,
+// * "IMAGE_LABEL", which stores image labels from vector<Classification>,
+// * "FORWARD_FLOW_ENCODED", which stores the encoded optical flow from the same
+//   calculator,
+// * "BBOX" which stores bounding boxes from vector<Detections>,
+// * streams with the "FLOAT_FEATURE_${NAME}" pattern, which stores the values
+//   from vector<float>'s associated with the name ${NAME},
+// * "KEYPOINTS" stores a map of 2D keypoints from flat_hash_map<string,
+//   vector<pair<float, float>>>,
+// * "CLIP_MEDIA_ID", an input side packet storing the clip's media ID string.
+// "IMAGE_${NAME}", "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store
+// prefixed versions of each stream, which allows for multiple image streams to
+// be included. However, the default names are supported by more tools.
 //
 // Example config:
 // node {
@ -102,6 +108,9 @@ class PackMediaSequenceCalculator : public CalculatorBase {
  static absl::Status GetContract(CalculatorContract* cc) {
    RET_CHECK(cc->InputSidePackets().HasTag(kSequenceExampleTag));
    cc->InputSidePackets().Tag(kSequenceExampleTag).Set<tf::SequenceExample>();
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag)) {
+      cc->InputSidePackets().Tag(kClipMediaIdTag).Set<std::string>();
+    }

    if (cc->Inputs().HasTag(kForwardFlowEncodedTag)) {
      cc->Inputs()
@ -190,6 +199,11 @@ class PackMediaSequenceCalculator : public CalculatorBase {
        cc->InputSidePackets()
            .Tag(kSequenceExampleTag)
            .Get<tf::SequenceExample>());
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag) &&
+        !cc->InputSidePackets().Tag(kClipMediaIdTag).IsEmpty()) {
+      clip_media_id_ =
+          cc->InputSidePackets().Tag(kClipMediaIdTag).Get<std::string>();
+    }

    const auto& context_features =
        cc->Options<PackMediaSequenceCalculatorOptions>().context_feature_map();
@ -592,10 +606,14 @@ class PackMediaSequenceCalculator : public CalculatorBase {
        }
      }
    }
+    if (clip_media_id_.has_value()) {
+      mpms::SetClipMediaId(*clip_media_id_, sequence_.get());
+    }
    return absl::OkStatus();
  }

  std::unique_ptr<tf::SequenceExample> sequence_;
+  std::optional<std::string> clip_media_id_ = std::nullopt;
  std::map<std::string, bool> features_present_;
  bool replace_keypoints_;
 };
--- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc
+++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc
@ -25,6 +25,7 @@
 #include "mediapipe/framework/formats/detection.pb.h"
 #include "mediapipe/framework/formats/location.h"
 #include "mediapipe/framework/formats/location_opencv.h"
+#include "mediapipe/framework/packet.h"
 #include "mediapipe/framework/port/opencv_imgcodecs_inc.h"
 #include "mediapipe/framework/port/status_matchers.h"
 #include "mediapipe/framework/timestamp.h"
@ -63,6 +64,7 @@ constexpr char kImageLabelOtherTag[] = "IMAGE_LABEL_OTHER";
 constexpr char kImagePrefixTag[] = "IMAGE_PREFIX";
 constexpr char kSequenceExampleTag[] = "SEQUENCE_EXAMPLE";
 constexpr char kImageTag[] = "IMAGE";
+constexpr char kClipMediaIdTag[] = "CLIP_MEDIA_ID";

 class PackMediaSequenceCalculatorTest : public ::testing::Test {
 protected:
@ -70,10 +72,14 @@ class PackMediaSequenceCalculatorTest : public ::testing::Test {
                       const tf::Features& features,
                       const bool output_only_if_all_present,
                       const bool replace_instead_of_append,
-                       const bool output_as_zero_timestamp = false) {
+                       const bool output_as_zero_timestamp = false,
+                       const std::vector<std::string>& input_side_packets = {
+                           "SEQUENCE_EXAMPLE:input_sequence"}) {
    CalculatorGraphConfig::Node config;
    config.set_calculator("PackMediaSequenceCalculator");
-    config.add_input_side_packet("SEQUENCE_EXAMPLE:input_sequence");
+    for (const std::string& side_packet : input_side_packets) {
+      config.add_input_side_packet(side_packet);
+    }
    config.add_output_stream("SEQUENCE_EXAMPLE:output_sequence");
    for (const std::string& stream : input_streams) {
      config.add_input_stream(stream);
@ -833,6 +839,88 @@ TEST_F(PackMediaSequenceCalculatorTest, PacksTwoMaskDetections) {
              testing::ElementsAreArray(::std::vector<std::string>({"mask"})));
 }

+TEST_F(PackMediaSequenceCalculatorTest, AddClipMediaId) {
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();
+  const std::string test_video_id = "test_video_id";
+  // Feed two timesteps of float features into both input streams.
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+  // Supply the media ID and a default-constructed sequence as side packets.
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id);
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+  // Run the calculator over the queued inputs.
+  MP_ASSERT_OK(runner_->Run());
+  // Exactly one SequenceExample packet is expected on the output.
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+  // The output sequence must report the media ID given as a side packet.
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));
+}
+
+TEST_F(PackMediaSequenceCalculatorTest, ReplaceClipMediaId) {
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();
+  const std::string existing_video_id = "existing_video_id";
+  mpms::SetClipMediaId(existing_video_id, input_sequence.get());
+  const std::string test_video_id = "test_video_id";
+  // Feed two timesteps of float features into both input streams.
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+  // NOTE(review): unlike AddClipMediaId, the ID side packet has a timestamp.
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id).At(Timestamp(0));
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+  // Run the calculator over the queued inputs.
+  MP_ASSERT_OK(runner_->Run());
+  // Exactly one SequenceExample packet is expected on the output.
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+  // The side-packet ID must replace the ID preset on the input sequence.
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));
+}
+
 TEST_F(PackMediaSequenceCalculatorTest, MissingStreamOK) {
  SetUpCalculator(
      {"FORWARD_FLOW_ENCODED:flow", "FLOAT_FEATURE_I3D_FLOW:feature"}, {},