Update PackMediaSequenceCalculator to support adding clip/media/id to the MediaSequence.

Because the media ID is usually a video ID that is provided to the graph as a side packet, the calculator expects it to be provided as an input side packet rather than as an input stream.

PiperOrigin-RevId: 558266967
MediaPipe Team 2023-08-18 15:44:04 -07:00 committed by Copybara-Service
parent fda0d19337
commit a44c810921
3 changed files with 120 additions and 13 deletions
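
For context, a graph would now wire the media ID in as an input side packet rather than an input stream. A minimal node sketch is below; only the SEQUENCE_EXAMPLE and CLIP_MEDIA_ID tags come from this change, while the packet/stream names (input_sequence, video_id, encoded_images) and the IMAGE input are illustrative.

node {
  calculator: "PackMediaSequenceCalculator"
  input_side_packet: "SEQUENCE_EXAMPLE:input_sequence"
  input_side_packet: "CLIP_MEDIA_ID:video_id"
  input_stream: "IMAGE:encoded_images"
  output_stream: "SEQUENCE_EXAMPLE:output_sequence"
}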


@@ -929,6 +929,7 @@ cc_test(
         "//mediapipe/calculators/image:opencv_image_encoder_calculator_cc_proto",
         "//mediapipe/framework:calculator_framework",
         "//mediapipe/framework:calculator_runner",
+        "//mediapipe/framework:packet",
         "//mediapipe/framework:timestamp",
         "//mediapipe/framework/formats:classification_cc_proto",
         "//mediapipe/framework/formats:detection_cc_proto",


@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <optional>
 #include <string>
 #include <vector>
@@ -45,6 +46,7 @@ const char kForwardFlowEncodedTag[] = "FORWARD_FLOW_ENCODED";
 const char kBBoxTag[] = "BBOX";
 const char kKeypointsTag[] = "KEYPOINTS";
 const char kSegmentationMaskTag[] = "CLASS_SEGMENTATION";
+const char kClipMediaIdTag[] = "CLIP_MEDIA_ID";

 namespace tf = ::tensorflow;
 namespace mpms = mediapipe::mediasequence;
@@ -56,17 +58,21 @@ namespace mpms = mediapipe::mediasequence;
 // context features can be supplied verbatim in the calculator's options. The
 // SequenceExample will conform to the description in media_sequence.h.
 //
-// The supported input stream tags are "IMAGE", which stores the encoded
-// images from the OpenCVImageEncoderCalculator, "IMAGE_LABEL", which stores
-// image labels from vector<Classification>, "FORWARD_FLOW_ENCODED", which
-// stores the encoded optical flow from the same calculator, "BBOX" which stores
-// bounding boxes from vector<Detections>, and streams with the
-// "FLOAT_FEATURE_${NAME}" pattern, which stores the values from vector<float>'s
-// associated with the name ${NAME}. "KEYPOINTS" stores a map of 2D keypoints
-// from flat_hash_map<string, vector<pair<float, float>>>. "IMAGE_${NAME}",
-// "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store prefixed versions of
-// each stream, which allows for multiple image streams to be included. However,
-// the default names are suppored by more tools.
+// The supported input stream tags are:
+// * "IMAGE", which stores the encoded images from the
+//   OpenCVImageEncoderCalculator,
+// * "IMAGE_LABEL", which stores image labels from vector<Classification>,
+// * "FORWARD_FLOW_ENCODED", which stores the encoded optical flow from the same
+//   calculator,
+// * "BBOX" which stores bounding boxes from vector<Detections>,
+// * streams with the "FLOAT_FEATURE_${NAME}" pattern, which stores the values
+//   from vector<float>'s associated with the name ${NAME},
+// * "KEYPOINTS" stores a map of 2D keypoints from flat_hash_map<string,
+//   vector<pair<float, float>>>,
+// * "CLIP_MEDIA_ID", which stores the clip's media ID as a string.
+// "IMAGE_${NAME}", "BBOX_${NAME}", and "KEYPOINTS_${NAME}" will also store
+// prefixed versions of each stream, which allows for multiple image streams to
+// be included. However, the default names are suppored by more tools.
 //
 // Example config:
 // node {
@@ -102,6 +108,9 @@ class PackMediaSequenceCalculator : public CalculatorBase {
   static absl::Status GetContract(CalculatorContract* cc) {
     RET_CHECK(cc->InputSidePackets().HasTag(kSequenceExampleTag));
     cc->InputSidePackets().Tag(kSequenceExampleTag).Set<tf::SequenceExample>();
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag)) {
+      cc->InputSidePackets().Tag(kClipMediaIdTag).Set<std::string>();
+    }

     if (cc->Inputs().HasTag(kForwardFlowEncodedTag)) {
       cc->Inputs()
@@ -190,6 +199,11 @@ class PackMediaSequenceCalculator : public CalculatorBase {
         cc->InputSidePackets()
             .Tag(kSequenceExampleTag)
             .Get<tf::SequenceExample>());
+    if (cc->InputSidePackets().HasTag(kClipMediaIdTag) &&
+        !cc->InputSidePackets().Tag(kClipMediaIdTag).IsEmpty()) {
+      clip_media_id_ =
+          cc->InputSidePackets().Tag(kClipMediaIdTag).Get<std::string>();
+    }

     const auto& context_features =
         cc->Options<PackMediaSequenceCalculatorOptions>().context_feature_map();
@@ -592,10 +606,14 @@ class PackMediaSequenceCalculator : public CalculatorBase {
         }
       }
     }
+    if (clip_media_id_.has_value()) {
+      mpms::SetClipMediaId(*clip_media_id_, sequence_.get());
+    }
     return absl::OkStatus();
   }

   std::unique_ptr<tf::SequenceExample> sequence_;
+  std::optional<std::string> clip_media_id_ = std::nullopt;
   std::map<std::string, bool> features_present_;
   bool replace_keypoints_;
 };
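
For reference, downstream code can read the packed ID back out of the emitted SequenceExample with the media_sequence helpers; a minimal sketch follows, assuming the usual media_sequence.h include path, and the helper name ExtractMediaId is hypothetical (only mpms::GetClipMediaId/SetClipMediaId appear in this change).

#include <string>

#include "mediapipe/util/sequence/media_sequence.h"  // assumed header location
#include "tensorflow/core/example/example.pb.h"

namespace mpms = mediapipe::mediasequence;

// Hypothetical helper: reads the clip/media/id context feature back off the
// SequenceExample emitted on the calculator's SEQUENCE_EXAMPLE output stream.
std::string ExtractMediaId(const tensorflow::SequenceExample& sequence) {
  return mpms::GetClipMediaId(sequence);
}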


@@ -25,6 +25,7 @@
 #include "mediapipe/framework/formats/detection.pb.h"
 #include "mediapipe/framework/formats/location.h"
 #include "mediapipe/framework/formats/location_opencv.h"
+#include "mediapipe/framework/packet.h"
 #include "mediapipe/framework/port/opencv_imgcodecs_inc.h"
 #include "mediapipe/framework/port/status_matchers.h"
 #include "mediapipe/framework/timestamp.h"
@@ -63,6 +64,7 @@ constexpr char kImageLabelOtherTag[] = "IMAGE_LABEL_OTHER";
 constexpr char kImagePrefixTag[] = "IMAGE_PREFIX";
 constexpr char kSequenceExampleTag[] = "SEQUENCE_EXAMPLE";
 constexpr char kImageTag[] = "IMAGE";
+constexpr char kClipMediaIdTag[] = "CLIP_MEDIA_ID";

 class PackMediaSequenceCalculatorTest : public ::testing::Test {
  protected:
@@ -70,10 +72,14 @@ class PackMediaSequenceCalculatorTest : public ::testing::Test {
                         const tf::Features& features,
                         const bool output_only_if_all_present,
                         const bool replace_instead_of_append,
-                        const bool output_as_zero_timestamp = false) {
+                        const bool output_as_zero_timestamp = false,
+                        const std::vector<std::string>& input_side_packets = {
+                            "SEQUENCE_EXAMPLE:input_sequence"}) {
     CalculatorGraphConfig::Node config;
     config.set_calculator("PackMediaSequenceCalculator");
-    config.add_input_side_packet("SEQUENCE_EXAMPLE:input_sequence");
+    for (const std::string& side_packet : input_side_packets) {
+      config.add_input_side_packet(side_packet);
+    }
     config.add_output_stream("SEQUENCE_EXAMPLE:output_sequence");
     for (const std::string& stream : input_streams) {
       config.add_input_stream(stream);
@@ -833,6 +839,88 @@ TEST_F(PackMediaSequenceCalculatorTest, PacksTwoMaskDetections) {
               testing::ElementsAreArray(::std::vector<std::string>({"mask"})));
 }

+TEST_F(PackMediaSequenceCalculatorTest, AddClipMediaId) {
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();
+  const std::string test_video_id = "test_video_id";
+
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id);
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+
+  MP_ASSERT_OK(runner_->Run());
+
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));
+}
+
+TEST_F(PackMediaSequenceCalculatorTest, ReplaceClipMediaId) {
+  SetUpCalculator(
+      /*input_streams=*/{"FLOAT_FEATURE_TEST:test",
+                         "FLOAT_FEATURE_OTHER:test2"},
+      /*features=*/{},
+      /*output_only_if_all_present=*/false,
+      /*replace_instead_of_append=*/true,
+      /*output_as_zero_timestamp=*/false, /*input_side_packets=*/
+      {"SEQUENCE_EXAMPLE:input_sequence", "CLIP_MEDIA_ID:video_id"});
+  auto input_sequence = absl::make_unique<tf::SequenceExample>();
+  const std::string existing_video_id = "existing_video_id";
+  mpms::SetClipMediaId(existing_video_id, input_sequence.get());
+  const std::string test_video_id = "test_video_id";
+
+  int num_timesteps = 2;
+  for (int i = 0; i < num_timesteps; ++i) {
+    auto vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureTestTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+    vf_ptr = ::absl::make_unique<std::vector<float>>(2, 2 << i);
+    runner_->MutableInputs()
+        ->Tag(kFloatFeatureOtherTag)
+        .packets.push_back(Adopt(vf_ptr.release()).At(Timestamp(i)));
+  }
+
+  runner_->MutableSidePackets()->Tag(kClipMediaIdTag) =
+      MakePacket<std::string>(test_video_id).At(Timestamp(0));
+  runner_->MutableSidePackets()->Tag(kSequenceExampleTag) =
+      Adopt(input_sequence.release());
+
+  MP_ASSERT_OK(runner_->Run());
+
+  const std::vector<Packet>& output_packets =
+      runner_->Outputs().Tag(kSequenceExampleTag).packets;
+  ASSERT_EQ(1, output_packets.size());
+  const tf::SequenceExample& output_sequence =
+      output_packets[0].Get<tf::SequenceExample>();
+
+  ASSERT_EQ(test_video_id, mpms::GetClipMediaId(output_sequence));
+}
+
 TEST_F(PackMediaSequenceCalculatorTest, MissingStreamOK) {
   SetUpCalculator(
       {"FORWARD_FLOW_ENCODED:flow", "FLOAT_FEATURE_I3D_FLOW:feature"}, {},