Update PackMediaSequenceCalculator to support adding clip/media/id to the MediaSequence.

Because the media ID is usually a video ID that is provided to the graph as a side packet, the calculator expects it to be provided as an input side packet rather than as an input stream.

PiperOrigin-RevId: 558266967
MediaPipe Team 2023-08-18 15:44:04 -07:00 committed by Copybara-Service
parent fda0d19337
commit a44c810921
3 changed files with 120 additions and 13 deletions
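
For context, a graph would now wire the media ID in as an input side packet rather than an input stream. A minimal node sketch is below; only the SEQUENCE_EXAMPLE and CLIP_MEDIA_ID tags come from this change, while the packet/stream names (input_sequence, video_id, encoded_images) and the IMAGE input are illustrative.

node {
  calculator: "PackMediaSequenceCalculator"
  input_side_packet: "SEQUENCE_EXAMPLE:input_sequence"
  input_side_packet: "CLIP_MEDIA_ID:video_id"
  input_stream: "IMAGE:encoded_images"
  output_stream: "SEQUENCE_EXAMPLE:output_sequence"
}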


@@ -929,6 +929,7 @@ cc_test(
         "//mediapipe/calculators/image:opencv_image_encoder_calculator_cc_proto",
         "//mediapipe/framework:calculator_framework",
         "//mediapipe/framework:calculator_runner",
+        "//mediapipe/framework:packet",
         "//mediapipe/framework:timestamp",
         "//mediapipe/framework/formats:classification_cc_proto",
         "//mediapipe/framework/formats:detection_cc_proto",


@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <optional>
 #include <string>
 #include <vector>
@@ -45,6 +46,7 @@ const char kForwardFlowEncodedTag[] = "FORWARD_FLOW_ENCODED";
 const char kBBoxTag[] = "BBOX";
 const char kKeypointsTag[] = "KEYPOINTS";
 const char kSegmentationMaskTag[] = "CLASS_SEGMENTATION";
+const char kClipMediaIdTag[] = "CLIP_MEDIA_ID";

 namespace tf = ::tensorflow;
 namespace mpms = mediapipe::mediasequence;
@@ -56,17 +58,21 @@ namespace mpms = mediapipe::mediasequence;
 // context features can be supplied verbatim in the calculator's options. The
 // SequenceExample will conform to the description in media_sequence.h.
 //
-// The supported input stream tags are "IMAGE", which stores the encoded
-// images from the OpenCVImageEncoderCalculator, "IMAGE_LABEL", which stores
-// image labels from vector<Classification>, "FORWARD_FLOW_ENCODED", which
-// stores the encoded optical flow from the same calculator, "BBOX" which stores
-// bounding boxes from vector<Detections>, and streams with the
-// "FLOAT_FEATURE_${NAME}" pattern, which stores the values from vector<float>'s
-// associated with the name ${NAME}. "KEYPOINTS" stores a map of 2D keypoints
-// from flat_hash_map<string, vector<pair<float, float>>>. "IMAGE_${NAME}",
-// "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store prefixed versions of
-// each stream, which allows for multiple image streams to be included. However,
-// the default names are suppored by more tools.
+// The supported input stream tags are:
+// * "IMAGE", which stores the encoded images from the
+//   OpenCVImageEncoderCalculator,
+// * "IMAGE_LABEL", which stores image labels from vector<Classification>,
+// * "FORWARD_FLOW_ENCODED", which stores the encoded optical flow from the same
+//   calculator,
+// * "BBOX" which stores bounding boxes from vector<Detections>,
+// * streams with the "FLOAT_FEATURE_${NAME}" pattern, which stores the values
+//   from vector<float>'s associated with the name ${NAME},
+// * "KEYPOINTS" stores a map of 2D keypoints from flat_hash_map<string,
+//   vector<pair<float, float>>>,
+// * "CLIP_MEDIA_ID", which stores the clip's media ID as a string.
+// "IMAGE_${NAME}", "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store
+// prefixed versions of each stream, which allows for multiple image streams to
+// be included. However, the default names are suppored by more tools.
 //
 // Example config:
 // node {
@@ -102,6 +108,9 @@ class PackMediaSequenceCalculator : public CalculatorBase {
   static absl::Status GetContract(CalculatorContract* cc) {
     RET_CHECK(cc->InputSidePackets().HasTag(kSequenceExampleTag));
     cc->InputSidePackets().Tag(kSequenceExampleTag).Set<tf::SequenceExample>();
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag)) {
+      cc->InputSidePackets().Tag(kClipMediaIdTag).Set<std::string>();
+    }

     if (cc->Inputs().HasTag(kForwardFlowEncodedTag)) {
       cc->Inputs()
@@ -190,6 +199,11 @@ class PackMediaSequenceCalculator : public CalculatorBase {
         cc->InputSidePackets()
             .Tag(kSequenceExampleTag)
             .Get<tf::SequenceExample>());
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag) &&
+        !cc->InputSidePackets().Tag(kClipMediaIdTag).IsEmpty()) {
+      clip_media_id_ =
+          cc->InputSidePackets().Tag(kClipMediaIdTag).Get<std::string>();
+    }

     const auto& context_features =
         cc->Options<PackMediaSequenceCalculatorOptions>().context_feature_map();
@@ -592,10 +606,14 @@ class PackMediaSequenceCalculator : public CalculatorBase {
         }
       }
     }
+    if (clip_media_id_.has_value()) {
+      mpms::SetClipMediaId(*clip_media_id_, sequence_.get());
+    }
     return absl::OkStatus();
   }

   std::unique_ptr<tf::SequenceExample> sequence_;
+  std::optional<std::string> clip_media_id_ = std::nullopt;
   std::map<std::string, bool> features_present_;
   bool replace_keypoints_;
 };
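
For reference, downstream code can read the packed ID back out of the emitted SequenceExample with the media_sequence helpers; a minimal sketch follows, assuming the usual media_sequence.h include path, and the helper name ExtractMediaId is hypothetical (only mpms::GetClipMediaId/SetClipMediaId appear in this change).

#include <string>

#include "mediapipe/util/sequence/media_sequence.h"  // assumed header location
#include "tensorflow/core/example/example.pb.h"

namespace mpms = mediapipe::mediasequence;

// Hypothetical helper: reads the clip/media/id context feature back off the
// SequenceExample emitted on the calculator's SEQUENCE_EXAMPLE output stream.
std::string ExtractMediaId(const tensorflow::SequenceExample& sequence) {
  return mpms::GetClipMediaId(sequence);
}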


@@ -25,6 +25,7 @@
 #include "mediapipe/framework/formats/detection.pb.h"
 #include "mediapipe/framework/formats/location.h"
 #include "mediapipe/framework/formats/location_opencv.h"
+#include "mediapipe/framework/packet.h"
 #include "mediapipe/framework/port/opencv_imgcodecs_inc.h"
 #include "mediapipe/framework/port/status_matchers.h"
 #include "mediapipe/framework/timestamp.h"
@@ -63,6 +64,7 @@ constexpr char kImageLabelOtherTag[] = "IMAGE_LABEL_OTHER";
 constexpr char kImagePrefixTag[] = "IMAGE_PREFIX";
 constexpr char kSequenceExampleTag[] = "SEQUENCE_EXAMPLE";
 constexpr char kImageTag[] = "IMAGE";
+constexpr char kClipMediaIdTag[] = "CLIP_MEDIA_ID";

 class PackMediaSequenceCalculatorTest : public ::testing::Test {
  protected:
@@ -70,10 +72,14 @@ class PackMediaSequenceCalculatorTest : public ::testing::Test {
                         const tf::Features& features,
                         const bool output_only_if_all_present,
                         const bool replace_instead_of_append,
-                        const bool output_as_zero_timestamp = false) {
+                        const bool output_as_zero_timestamp = false,
+                        const std::vector<std::string>& input_side_packets = {
+                            "SEQUENCE_EXAMPLE:input_sequence"}) {
     CalculatorGraphConfig::Node config;
     config.set_calculator("PackMediaSequenceCalculator");
-    config.add_input_side_packet("SEQUENCE_EXAMPLE:input_sequence");
+    for (const std::string& side_packet : input_side_packets) {
+      config.add_input_side_packet(side_packet);
+    }
     config.add_output_stream("SEQUENCE_EXAMPLE:output_sequence");
     for (const std::string& stream : input_streams) {
       config.add_input_stream(stream);
@@ -833,6 +839,88 @@ TEST_F(PackMediaSequenceCalculatorTest, PacksTwoMaskDetections) {
               testing::ElementsAreArray(::std::vector<std::string>({"mask"})));
 }

+TEST_F(PackMediaSequenceCalculatorTest, AddClipMediaId) {
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();
+  const std::string test_video_id = "test_video_id";
+
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id);
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+
+  MP_ASSERT_OK(runner_->Run());
+
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));
+}
+
+TEST_F(PackMediaSequenceCalculatorTest, ReplaceClipMediaId) {
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();
+  const std::string existing_video_id = "existing_video_id";
+  mpms::SetClipMediaId(existing_video_id, input_sequence.get());
+  const std::string test_video_id = "test_video_id";
+
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id).At(Timestamp(0));
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+
+  MP_ASSERT_OK(runner_->Run());
+
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));
+}
+
 TEST_F(PackMediaSequenceCalculatorTest, MissingStreamOK) {
   SetUpCalculator(
       {"FORWARD_FLOW_ENCODED:flow", "FLOAT_FEATURE_I3D_FLOW:feature"}, {},