From a44c8109217ac7672abcdc4cb3ae7ddb93145e4b Mon Sep 17 00:00:00 2001 From: MediaPipe Team Date: Fri, 18 Aug 2023 15:44:04 -0700 Subject: [PATCH] Update PackMediaSequenceCalculator to support adding clip/media/id to the MediaSequence. As the media ID is usually a video ID which is provided to the graph as a side packet, in this graph it expects it to be provided as an input side packet instead of an input stream. PiperOrigin-RevId: 558266967 --- mediapipe/calculators/tensorflow/BUILD | 1 + .../pack_media_sequence_calculator.cc | 40 +++++--- .../pack_media_sequence_calculator_test.cc | 92 ++++++++++++++++++- 3 files changed, 120 insertions(+), 13 deletions(-) diff --git a/mediapipe/calculators/tensorflow/BUILD b/mediapipe/calculators/tensorflow/BUILD index 374478457..78da0934c 100644 --- a/mediapipe/calculators/tensorflow/BUILD +++ b/mediapipe/calculators/tensorflow/BUILD @@ -929,6 +929,7 @@ cc_test( "//mediapipe/calculators/image:opencv_image_encoder_calculator_cc_proto", "//mediapipe/framework:calculator_framework", "//mediapipe/framework:calculator_runner", + "//mediapipe/framework:packet", "//mediapipe/framework:timestamp", "//mediapipe/framework/formats:classification_cc_proto", "//mediapipe/framework/formats:detection_cc_proto", diff --git a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc index 75878b74a..9185e22a5 100644 --- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc +++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include @@ -45,6 +46,7 @@ const char kForwardFlowEncodedTag[] = "FORWARD_FLOW_ENCODED"; const char kBBoxTag[] = "BBOX"; const char kKeypointsTag[] = "KEYPOINTS"; const char kSegmentationMaskTag[] = "CLASS_SEGMENTATION"; +const char kClipMediaIdTag[] = "CLIP_MEDIA_ID"; namespace tf = ::tensorflow; namespace mpms = mediapipe::mediasequence; @@ -56,17 +58,21 @@ namespace mpms = mediapipe::mediasequence; // context features can be supplied verbatim in the calculator's options. The // SequenceExample will conform to the description in media_sequence.h. // -// The supported input stream tags are "IMAGE", which stores the encoded -// images from the OpenCVImageEncoderCalculator, "IMAGE_LABEL", which stores -// image labels from vector, "FORWARD_FLOW_ENCODED", which -// stores the encoded optical flow from the same calculator, "BBOX" which stores -// bounding boxes from vector, and streams with the -// "FLOAT_FEATURE_${NAME}" pattern, which stores the values from vector's -// associated with the name ${NAME}. "KEYPOINTS" stores a map of 2D keypoints -// from flat_hash_map>>. "IMAGE_${NAME}", -// "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store prefixed versions of -// each stream, which allows for multiple image streams to be included. However, -// the default names are suppored by more tools. +// The supported input stream tags are: +// * "IMAGE", which stores the encoded images from the +// OpenCVImageEncoderCalculator, +// * "IMAGE_LABEL", which stores image labels from vector, +// * "FORWARD_FLOW_ENCODED", which stores the encoded optical flow from the same +// calculator, +// * "BBOX" which stores bounding boxes from vector, +// * streams with the "FLOAT_FEATURE_${NAME}" pattern, which stores the values +// from vector's associated with the name ${NAME}, +// * "KEYPOINTS" stores a map of 2D keypoints from flat_hash_map>>, +// * "CLIP_MEDIA_ID", which stores the clip's media ID as a string. 
+// "IMAGE_${NAME}", "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store +// prefixed versions of each stream, which allows for multiple image streams to +// be included. However, the default names are supported by more tools. // // Example config: // node { @@ -102,6 +108,9 @@ class PackMediaSequenceCalculator : public CalculatorBase { static absl::Status GetContract(CalculatorContract* cc) { RET_CHECK(cc->InputSidePackets().HasTag(kSequenceExampleTag)); cc->InputSidePackets().Tag(kSequenceExampleTag).Set(); + if (cc->InputSidePackets().HasTag(kClipMediaIdTag)) { + cc->InputSidePackets().Tag(kClipMediaIdTag).Set(); + } if (cc->Inputs().HasTag(kForwardFlowEncodedTag)) { cc->Inputs() @@ -190,6 +199,11 @@ class PackMediaSequenceCalculator : public CalculatorBase { cc->InputSidePackets() .Tag(kSequenceExampleTag) .Get()); + if (cc->InputSidePackets().HasTag(kClipMediaIdTag) && + !cc->InputSidePackets().Tag(kClipMediaIdTag).IsEmpty()) { + clip_media_id_ = + cc->InputSidePackets().Tag(kClipMediaIdTag).Get(); + } const auto& context_features = cc->Options().context_feature_map(); @@ -592,10 +606,14 @@ class PackMediaSequenceCalculator : public CalculatorBase { } } } + if (clip_media_id_.has_value()) { + mpms::SetClipMediaId(*clip_media_id_, sequence_.get()); + } return absl::OkStatus(); } std::unique_ptr sequence_; + std::optional clip_media_id_ = std::nullopt; std::map features_present_; bool replace_keypoints_; }; diff --git a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc index 5c0ad8ac5..fa3e0bdea 100644 --- a/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc +++ b/mediapipe/calculators/tensorflow/pack_media_sequence_calculator_test.cc @@ -25,6 +25,7 @@ #include "mediapipe/framework/formats/detection.pb.h" #include "mediapipe/framework/formats/location.h" #include "mediapipe/framework/formats/location_opencv.h" +#include 
"mediapipe/framework/packet.h" #include "mediapipe/framework/port/opencv_imgcodecs_inc.h" #include "mediapipe/framework/port/status_matchers.h" #include "mediapipe/framework/timestamp.h" @@ -63,6 +64,7 @@ constexpr char kImageLabelOtherTag[] = "IMAGE_LABEL_OTHER"; constexpr char kImagePrefixTag[] = "IMAGE_PREFIX"; constexpr char kSequenceExampleTag[] = "SEQUENCE_EXAMPLE"; constexpr char kImageTag[] = "IMAGE"; +constexpr char kClipMediaIdTag[] = "CLIP_MEDIA_ID"; class PackMediaSequenceCalculatorTest : public ::testing::Test { protected: @@ -70,10 +72,14 @@ class PackMediaSequenceCalculatorTest : public ::testing::Test { const tf::Features& features, const bool output_only_if_all_present, const bool replace_instead_of_append, - const bool output_as_zero_timestamp = false) { + const bool output_as_zero_timestamp = false, + const std::vector& input_side_packets = { + "SEQUENCE_EXAMPLE:input_sequence"}) { CalculatorGraphConfig::Node config; config.set_calculator("PackMediaSequenceCalculator"); - config.add_input_side_packet("SEQUENCE_EXAMPLE:input_sequence"); + for (const std::string& side_packet : input_side_packets) { + config.add_input_side_packet(side_packet); + } config.add_output_stream("SEQUENCE_EXAMPLE:output_sequence"); for (const std::string& stream : input_streams) { config.add_input_stream(stream); @@ -833,6 +839,88 @@ TEST_F(PackMediaSequenceCalculatorTest, PacksTwoMaskDetections) { testing::ElementsAreArray(::std::vector({"mask"}))); } +TEST_F(PackMediaSequenceCalculatorTest, AddClipMediaId) { + SetUpCalculator( + /*input_streams=*/{"FLOAT_FEATURE_TEST:test", + "FLOAT_FEATURE_OTHER:test2"}, + /*features=*/{}, + /*output_only_if_all_present=*/false, + /*replace_instead_of_append=*/true, + /*output_as_zero_timestamp=*/false, /*input_side_packets=*/ + {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"}); + auto input_sequence = absl::make_unique(); + const std::string test_video_id = "test_video_id"; + + int num_timesteps = 2; + for (int i = 0; i < 
num_timesteps; ++i) { + auto vf_ptr = ::absl::make_unique>(2, 2 << i); + runner_->MutableInputs() + ->Tag(kFloatFeatureTestTag) + .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i))); + vf_ptr = ::absl::make_unique>(2, 2 << i); + runner_->MutableInputs() + ->Tag(kFloatFeatureOtherTag) + .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i))); + } + + runner_->MutableSidePackets()->Tag(kClipMediaIdTag) = + MakePacket(test_video_id); + runner_->MutableSidePackets()->Tag(kSequenceExampleTag) = + Adopt(input_sequence.release()); + + MP_ASSERT_OK(runner_->Run()); + + const std::vector& output_packets = + runner_->Outputs().Tag(kSequenceExampleTag).packets; + ASSERT_EQ(1, output_packets.size()); + const tf::SequenceExample& output_sequence = + output_packets[0].Get(); + + ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence)); +} + +TEST_F(PackMediaSequenceCalculatorTest, ReplaceClipMediaId) { + SetUpCalculator( + /*input_streams=*/{"FLOAT_FEATURE_TEST:test", + "FLOAT_FEATURE_OTHER:test2"}, + /*features=*/{}, + /*output_only_if_all_present=*/false, + /*replace_instead_of_append=*/true, + /*output_as_zero_timestamp=*/false, /*input_side_packets=*/ + {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"}); + auto input_sequence = absl::make_unique(); + const std::string existing_video_id = "existing_video_id"; + mpms::SetClipMediaId(existing_video_id, input_sequence.get()); + const std::string test_video_id = "test_video_id"; + + int num_timesteps = 2; + for (int i = 0; i < num_timesteps; ++i) { + auto vf_ptr = ::absl::make_unique>(2, 2 << i); + runner_->MutableInputs() + ->Tag(kFloatFeatureTestTag) + .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i))); + vf_ptr = ::absl::make_unique>(2, 2 << i); + runner_->MutableInputs() + ->Tag(kFloatFeatureOtherTag) + .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i))); + } + + runner_->MutableSidePackets()->Tag(kClipMediaIdTag) = + MakePacket(test_video_id).At(Timestamp(0)); + 
runner_->MutableSidePackets()->Tag(kSequenceExampleTag) = + Adopt(input_sequence.release()); + + MP_ASSERT_OK(runner_->Run()); + + const std::vector& output_packets = + runner_->Outputs().Tag(kSequenceExampleTag).packets; + ASSERT_EQ(1, output_packets.size()); + const tf::SequenceExample& output_sequence = + output_packets[0].Get(); + + ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence)); +} + TEST_F(PackMediaSequenceCalculatorTest, MissingStreamOK) { SetUpCalculator( {"FORWARD_FLOW_ENCODED:flow", "FLOAT_FEATURE_I3D_FLOW:feature"}, {},