// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mediapipe/util/sequence/media_sequence.h"

#include <algorithm>
#include <string>

#include "absl/strings/str_split.h"
#include "mediapipe/framework/port/opencv_imgcodecs_inc.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/util/sequence/media_sequence_util.h"

namespace mediapipe {
namespace mediasequence {

namespace {

// Decodes the image header to get metadata as strings and ints.
bool ImageMetadata(const std::string& image_str, std::string* format_string,
                   int* width, int* height, int* channels) {
  // Determine the image encoding by matching known header bytes.
  if (image_str[0] == static_cast<char>(0x89) && image_str[1] == 'P' &&
      image_str[2] == 'N' && image_str[3] == 'G') {
    *format_string = "PNG";
  } else if (image_str[0] == static_cast<char>(0xFF) &&
             image_str[1] == static_cast<char>(0xD8) &&
             image_str[image_str.size() - 2] == static_cast<char>(0xFF) &&
             image_str[image_str.size() - 1] == static_cast<char>(0xD9)) {
    *format_string = "JPEG";
  } else {
    *format_string = "UNKNOWN";
  }
  auto buf = reinterpret_cast<void*>(const_cast<char*>(image_str.data()));
  cv::Mat img = cv::imdecode(cv::Mat(/*rows=*/image_str.size(),
                                     /*cols=*/1, CV_8UC1, buf),
                             -1 /*cv::ImreadModes::IMREAD_UNCHANGED*/);
  if (img.data == nullptr) {
    return false;
  }
  *width = img.cols;
  *height = img.rows;
  *channels = img.channels();
  return true;
}

// Finds the nearest timestamp in a FeatureList of timestamps. The FeatureList
// must contain int64 values and only the first value at each step is used.
int NearestIndex(int64 timestamp,
                 const tensorflow::FeatureList& int64_feature_list) {
  int64 closest_distance = std::numeric_limits<int64>::max();
  int index = -1;
  for (int i = 0; i < int64_feature_list.feature_size(); ++i) {
    int64 current_value = int64_feature_list.feature(i).int64_list().value(0);
    int64 current_distance = std::abs(current_value - timestamp);
    if (current_distance < closest_distance) {
      index = i;
      closest_distance = current_distance;
    }
  }
  return index;
}

// Finds the numerical sampling rate in Hz between two timestamps given in
// microseconds.
float TimestampsToRate(int64 first_timestamp, int64 second_timestamp) {
  int64 timestamp_diff = second_timestamp - first_timestamp;
  // Convert from microseconds to seconds.
  float rate = 1.0 / (static_cast<float>(timestamp_diff) / 1000000);
  return rate;
}
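// For example, with image timestamps recorded every 40000 microseconds,
// TimestampsToRate(0, 40000) returns 25.0 Hz, and NearestIndex(90000, ...)
// over a FeatureList holding the timestamps {0, 40000, 80000, 120000}
// returns 2, the index of the frame at 80000 with the smallest absolute time
// difference.
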
// Sets the values of "clip/number_of_frames", "clip/keyframe/index",
// "segment/start/index" and "segment/end/index" by finding the closest
// timestamps in the "image/timestamp" FeatureList if image timestamps are
// present.
absl::Status ReconcileAnnotationIndicesByImageTimestamps(
    tensorflow::SequenceExample* sequence) {
  if (GetImageTimestampSize(*sequence) == 0) {
    return absl::OkStatus();
  }
  int index;

  // clip/segment/index
  if (HasSegmentStartTimestamp(*sequence)) {
    int segment_size = GetSegmentStartTimestampSize(*sequence);
    RET_CHECK_EQ(GetSegmentEndTimestampSize(*sequence), segment_size)
        << "Expected an equal number of segment timestamps, but found "
        << "start: " << segment_size
        << ", end: " << GetSegmentEndTimestampSize(*sequence);
    std::vector<int64> start_indices;
    start_indices.reserve(segment_size);
    for (const int64& timestamp : GetSegmentStartTimestamp(*sequence)) {
      index = NearestIndex(timestamp,
                           GetFeatureList(*sequence, kImageTimestampKey));
      start_indices.push_back(index);
    }
    SetSegmentStartIndex(start_indices, sequence);

    std::vector<int64> end_indices;
    end_indices.reserve(segment_size);
    for (const int64& timestamp : GetSegmentEndTimestamp(*sequence)) {
      index = NearestIndex(timestamp,
                           GetFeatureList(*sequence, kImageTimestampKey));
      end_indices.push_back(index);
    }
    SetSegmentEndIndex(end_indices, sequence);
  }
  return absl::OkStatus();
}

// Sets the values of "image/format", "image/channels", "image/height",
// "image/width", and "image/frame_rate" based on image metadata and
// timestamps.
absl::Status ReconcileMetadataImages(const std::string& prefix,
                                     tensorflow::SequenceExample* sequence) {
  if (GetImageEncodedSize(prefix, *sequence) == 0) {
    return absl::OkStatus();
  }
  std::string format;
  int height, width, channels;
  RET_CHECK(ImageMetadata(GetImageEncodedAt(prefix, *sequence, 0), &format,
                          &width, &height, &channels))
      << "Failure to decode image metadata of image: "
      << GetImageEncodedAt(prefix, *sequence, 0);
  SetImageFormat(prefix, format, sequence);
  SetImageHeight(prefix, height, sequence);
  SetImageWidth(prefix, width, sequence);
  SetImageChannels(prefix, channels, sequence);

  if (GetImageTimestampSize(prefix, *sequence) > 1) {
    float rate = TimestampsToRate(GetImageTimestampAt(prefix, *sequence, 0),
                                  GetImageTimestampAt(prefix, *sequence, 1));
    SetImageFrameRate(prefix, rate, sequence);
  }
  return absl::OkStatus();
}
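// As a concrete illustration of the prefixed setters above: with an empty
// prefix the metadata lands in keys such as "image/format" and
// "image/frame_rate", while a prefix such as kForwardFlowPrefix stores the
// same fields under prefixed keys (e.g. "FORWARD_FLOW/image/format",
// following the merge_prefix convention).
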
// Sets the values of "feature/${TAG}/dimensions", and
// "feature/${TAG}/frame_rate" for each float list feature TAG. If the
// dimensions are already present as a context feature, this method verifies
// the number of elements in the feature. Otherwise, it writes the dimensions
// as a 1D vector with the number of elements.
absl::Status ReconcileMetadataFeatureFloats(
    tensorflow::SequenceExample* sequence) {
  // Loop through all keys and see if they contain "/feature/floats".
  // If so, check the dimensions and set the rate.
  for (const auto& key_value : sequence->feature_lists().feature_list()) {
    const std::string& key = key_value.first;
    if (absl::StrContains(key, kFeatureFloatsKey)) {
      const auto prefix = key.substr(0, key.find(kFeatureFloatsKey) - 1);
      int number_of_elements = GetFeatureFloatsAt(prefix, *sequence, 0).size();
      if (HasFeatureDimensions(prefix, *sequence)) {
        int64 product = 1;
        for (int64 value : GetFeatureDimensions(prefix, *sequence)) {
          product *= value;
        }
        RET_CHECK_EQ(number_of_elements, product)
            << "The number of elements in float feature_list " << prefix
            << "/feature/floats does not match the dimensions: "
            << number_of_elements;
      } else {
        SetFeatureDimensions(prefix, {number_of_elements}, sequence);
      }

      if (GetFeatureTimestampSize(prefix, *sequence) > 1) {
        float rate =
            TimestampsToRate(GetFeatureTimestampAt(prefix, *sequence, 0),
                             GetFeatureTimestampAt(prefix, *sequence, 1));
        SetFeatureRate(prefix, rate, sequence);
      }
    }
  }
  return absl::OkStatus();
}

// Goes through all bounding box annotations and moves each annotation to the
// nearest image frame with a timestamp. If timestamps are not present, does
// nothing. If two or more annotations are closest to the same frame, then
// only the closest annotation is saved. This matches the behavior of
// downsampling image streams in time.
absl::Status ReconcileMetadataBoxAnnotations(
    const std::string& prefix, tensorflow::SequenceExample* sequence) {
  int num_bboxes = GetBBoxTimestampSize(prefix, *sequence);
  int num_frames = GetImageTimestampSize(*sequence);
  if (num_bboxes && num_frames) {
    // If no one has indicated which frames are annotated, assume annotations
    // are dense.
    if (GetBBoxIsAnnotatedSize(prefix, *sequence) == 0) {
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxIsAnnotated(prefix, true, sequence);
      }
    }
    RET_CHECK_EQ(num_bboxes, GetBBoxIsAnnotatedSize(prefix, *sequence))
        << "Expected number of BBox timestamps and annotation marks to match.";
    // Update the number of regions recorded for each timestamp.
    if (GetBBoxSize(prefix, *sequence) > 0) {
      std::string xmin_key = merge_prefix(prefix, kRegionBBoxXMinKey);
      auto* bbox_feature_list = MutableFeatureList(xmin_key, sequence);
      RET_CHECK_EQ(num_bboxes, bbox_feature_list->feature_size())
          << "Expected number of BBox timestamps and boxes to match.";
      ClearBBoxNumRegions(prefix, sequence);
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxNumRegions(
            prefix, bbox_feature_list->feature(i).float_list().value_size(),
            sequence);
      }
    }
    if (GetPointSize(prefix, *sequence) > 0) {
      std::string x_key = merge_prefix(prefix, kRegionPointXKey);
      auto* region_feature_list = MutableFeatureList(x_key, sequence);
      RET_CHECK_EQ(num_bboxes, region_feature_list->feature_size())
          << "Expected number of BBox timestamps and boxes to match.";
      ClearBBoxNumRegions(prefix, sequence);
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxNumRegions(
            prefix, region_feature_list->feature(i).float_list().value_size(),
            sequence);
      }
    }
    if (Get3dPointSize(prefix, *sequence) > 0) {
      std::string x_key = merge_prefix(prefix, kRegion3dPointXKey);
      auto* region_feature_list = MutableFeatureList(x_key, sequence);
      RET_CHECK_EQ(num_bboxes, region_feature_list->feature_size())
          << "Expected number of BBox timestamps and boxes to match.";
      ClearBBoxNumRegions(prefix, sequence);
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxNumRegions(
            prefix, region_feature_list->feature(i).float_list().value_size(),
            sequence);
      }
    }
    // Collect which timestamps currently match to which indices in timestamps.
    // Skip empty timestamps.
    // Requires sorted indices.
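    // The matching below advances an index into the sorted box timestamps and
    // an index into the sorted image timestamps in a single pass: each
    // annotated box is assigned to the image frame with the nearest timestamp,
    // and if several boxes map to the same frame, only the closest one in time
    // is kept.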
    ::std::vector<int64> box_timestamps(num_bboxes);
    int bbox_index = 0;
    std::string timestamp_key = merge_prefix(prefix, kRegionTimestampKey);
    for (auto& feature : GetFeatureList(*sequence, timestamp_key).feature()) {
      box_timestamps[bbox_index] = feature.int64_list().value(0);
      ++bbox_index;
    }
    ::std::vector<bool> box_is_annotated(num_bboxes);
    bbox_index = 0;
    std::string is_annotated_key = merge_prefix(prefix, kRegionIsAnnotatedKey);
    for (auto& feature :
         GetFeatureList(*sequence, is_annotated_key).feature()) {
      box_is_annotated[bbox_index] = feature.int64_list().value(0);
      ++bbox_index;
    }
    ::std::vector<int64> image_timestamps(num_frames);
    int frame_index = 0;
    for (auto& feature :
         GetFeatureList(*sequence, kImageTimestampKey).feature()) {
      image_timestamps[frame_index] = feature.int64_list().value(0);
      ++frame_index;
    }
    // Collect which bbox timestamps are closest to which image indices.
    ::std::vector<int> bbox_index_if_annotated(num_frames, -1);
    int box_index = 0;
    int image_index = 0;
    while (box_index < num_bboxes) {
      // Leave unannotated boxes at -1.
      if (!box_is_annotated[box_index]) {
        box_index += 1;
        // Annotated boxes should update their closest index.
      } else if (image_index >= num_frames - 1 ||
                 llabs(image_timestamps[image_index] -
                       box_timestamps[box_index]) <
                     llabs(image_timestamps[image_index + 1] -
                           box_timestamps[box_index])) {
        // Only overwrite with a new value if no value is present or this is
        // closer in time.
        if (bbox_index_if_annotated[image_index] == -1 ||
            llabs(image_timestamps[image_index] -
                  box_timestamps[bbox_index_if_annotated[image_index]]) >
                llabs(image_timestamps[image_index] -
                      box_timestamps[box_index])) {
          bbox_index_if_annotated[image_index] = box_index;
        }
        box_index += 1;
      } else {
        image_index += 1;
      }
    }
    // Only update the unmodified bbox timestamps if they don't already exist,
    // to prevent overwriting them with modified values.
    if (!GetUnmodifiedBBoxTimestampSize(prefix, *sequence)) {
      for (int i = 0; i < num_frames; ++i) {
        const int bbox_index = bbox_index_if_annotated[i];
        if (bbox_index >= 0 &&
            GetBBoxIsAnnotatedAt(prefix, *sequence, bbox_index)) {
          AddUnmodifiedBBoxTimestamp(prefix, box_timestamps[bbox_index],
                                     sequence);
        }
      }
    }

    // Store the new feature_lists in a temporary sequence.
    std::string expected_prefix = merge_prefix(prefix, "region/");
    ::tensorflow::SequenceExample tmp_seq;
    for (const auto& key_value : sequence->feature_lists().feature_list()) {
      const std::string& key = key_value.first;
      if (::absl::StartsWith(key, expected_prefix)) {
        // Create a new set of values and swap them in.
        tmp_seq.Clear();
        auto* old_feature_list = MutableFeatureList(key, sequence);
        auto* new_feature_list = MutableFeatureList(key, &tmp_seq);
        if (key != merge_prefix(prefix, kUnmodifiedRegionTimestampKey)) {
          RET_CHECK_EQ(num_bboxes, old_feature_list->feature().size())
              << "Expected number of BBox timestamps to match number of "
                 "entries "
              << "in " << key;
          for (int i = 0; i < num_frames; ++i) {
            if (bbox_index_if_annotated[i] >= 0) {
              if (key == merge_prefix(prefix, kRegionTimestampKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(image_timestamps[i]);
              } else {
                *new_feature_list->add_feature() =
                    old_feature_list->feature(bbox_index_if_annotated[i]);
              }
            } else {
              // Add either a default value or an empty Feature.
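              // is_annotated defaults to 0, num_regions defaults to 0, and the
              // region timestamp is aligned to the image timestamp; every
              // other region key receives an empty Feature for this frame.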
              if (key == merge_prefix(prefix, kRegionIsAnnotatedKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(0);
              } else if (key == merge_prefix(prefix, kRegionNumRegionsKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(0);
              } else if (key == merge_prefix(prefix, kRegionTimestampKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(image_timestamps[i]);
              } else {
                new_feature_list->add_feature();  // Adds an empty Feature.
              }
            }
          }
          *old_feature_list = *new_feature_list;
        }
      }
    }
  }
  return absl::OkStatus();
}

absl::Status ReconcileMetadataRegionAnnotations(
    tensorflow::SequenceExample* sequence) {
  // Copy keys for fixed iteration order while updating feature_lists.
  std::vector<const std::string*> key_ptrs;
  for (const auto& key_value : sequence->feature_lists().feature_list()) {
    key_ptrs.push_back(&key_value.first);
  }
  for (const std::string* key_ptr : key_ptrs) {
    const std::string& key = *key_ptr;
    if (::absl::StrContains(key, kRegionTimestampKey)) {
      std::string prefix =
          key.substr(0, key.size() - sizeof(kRegionTimestampKey));
      if (key == kRegionTimestampKey) {
        prefix = "";
      }
      RET_CHECK_OK(ReconcileMetadataBoxAnnotations(prefix, sequence));
    }
  }
  return absl::OkStatus();
}

}  // namespace

int GetBBoxSize(const std::string& prefix,
                const tensorflow::SequenceExample& sequence) {
  return GetBBoxXMinSize(prefix, sequence);
}

std::vector<::mediapipe::Location> GetBBoxAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  std::vector<::mediapipe::Location> bboxes;
  const auto& xmins = GetBBoxXMinAt(prefix, sequence, index);
  const auto& ymins = GetBBoxYMinAt(prefix, sequence, index);
  const auto& xmaxs = GetBBoxXMaxAt(prefix, sequence, index);
  const auto& ymaxs = GetBBoxYMaxAt(prefix, sequence, index);
  bboxes.reserve(xmins.size());
  for (int i = 0; i < xmins.size(); ++i) {
    bboxes.push_back(::mediapipe::Location::CreateRelativeBBoxLocation(
        xmins[i], ymins[i], xmaxs[i] - xmins[i], ymaxs[i] - ymins[i]));
  }
  return bboxes;
}

void AddBBox(const std::string& prefix,
             const std::vector<::mediapipe::Location>& bboxes,
             tensorflow::SequenceExample* sequence) {
  ::std::vector<float> xmins;
  ::std::vector<float> ymins;
  ::std::vector<float> xmaxs;
  ::std::vector<float> ymaxs;
  for (auto& bbox : bboxes) {
    const auto& rect = bbox.GetRelativeBBox();
    xmins.push_back(rect.xmin());
    ymins.push_back(rect.ymin());
    xmaxs.push_back(rect.xmax());
    ymaxs.push_back(rect.ymax());
  }
  AddBBoxXMin(prefix, xmins, sequence);
  AddBBoxYMin(prefix, ymins, sequence);
  AddBBoxXMax(prefix, xmaxs, sequence);
  AddBBoxYMax(prefix, ymaxs, sequence);
}

void ClearBBox(const std::string& prefix,
               tensorflow::SequenceExample* sequence) {
  ClearBBoxXMin(prefix, sequence);
  ClearBBoxYMin(prefix, sequence);
  ClearBBoxXMax(prefix, sequence);
  ClearBBoxYMax(prefix, sequence);
}

int GetPointSize(const std::string& prefix,
                 const tensorflow::SequenceExample& sequence) {
  return GetBBoxPointXSize(prefix, sequence);
}

std::vector<::std::pair<float, float>> GetPointAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  const auto& ys = GetBBoxPointYAt(prefix, sequence, index);
  const auto& xs = GetBBoxPointXAt(prefix, sequence, index);
  std::vector<::std::pair<float, float>> points(ys.size());
  for (int i = 0; i < xs.size(); ++i) {
    points[i].first = ys[i];
    points[i].second = xs[i];
  }
  return points;
}
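// Usage note: GetPointAt returns and AddPoint consumes (y, x) pairs, as the
// loops above and below show. For example,
// AddPoint(prefix, {{0.25f, 0.75f}}, sequence) records a single point with
// y = 0.25 and x = 0.75 at one timestep.
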
void AddPoint(const std::string& prefix,
              const std::vector<::std::pair<float, float>>& points,
              tensorflow::SequenceExample* sequence) {
  ::std::vector<float> xs;
  ::std::vector<float> ys;
  for (auto& point : points) {
    ys.push_back(point.first);
    xs.push_back(point.second);
  }
  AddBBoxPointY(prefix, ys, sequence);
  AddBBoxPointX(prefix, xs, sequence);
}

void ClearPoint(const std::string& prefix,
                tensorflow::SequenceExample* sequence) {
  ClearBBoxPointY(prefix, sequence);
  ClearBBoxPointX(prefix, sequence);
}

int Get3dPointSize(const std::string& prefix,
                   const tensorflow::SequenceExample& sequence) {
  return GetBBox3dPointXSize(prefix, sequence);
}

std::vector<::std::tuple<float, float, float>> Get3dPointAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  const auto& xs = GetBBox3dPointXAt(prefix, sequence, index);
  const auto& ys = GetBBox3dPointYAt(prefix, sequence, index);
  const auto& zs = GetBBox3dPointZAt(prefix, sequence, index);
  std::vector<::std::tuple<float, float, float>> points(ys.size());
  for (int i = 0; i < xs.size(); ++i) {
    points[i] = std::make_tuple(xs[i], ys[i], zs[i]);
  }
  return points;
}

void Add3dPoint(const std::string& prefix,
                const std::vector<::std::tuple<float, float, float>>& points,
                tensorflow::SequenceExample* sequence) {
  ::std::vector<float> xs;
  ::std::vector<float> ys;
  ::std::vector<float> zs;
  for (auto& point : points) {
    xs.push_back(std::get<0>(point));
    ys.push_back(std::get<1>(point));
    zs.push_back(std::get<2>(point));
  }
  AddBBox3dPointX(prefix, xs, sequence);
  AddBBox3dPointY(prefix, ys, sequence);
  AddBBox3dPointZ(prefix, zs, sequence);
}

void Clear3dPoint(const std::string& prefix,
                  tensorflow::SequenceExample* sequence) {
  ClearBBox3dPointX(prefix, sequence);
  ClearBBox3dPointY(prefix, sequence);
  ClearBBox3dPointZ(prefix, sequence);
}

std::unique_ptr<mediapipe::Matrix> GetAudioFromFeatureAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  const auto& flat_data = GetFeatureFloatsAt(prefix, sequence, index);
  CHECK(HasFeatureNumChannels(prefix, sequence))
      << "GetAudioAt requires num_channels context to be specified as key: "
      << merge_prefix(prefix, kFeatureNumChannelsKey);
  int num_channels = GetFeatureNumChannels(prefix, sequence);
  CHECK_EQ(flat_data.size() % num_channels, 0)
      << "The data size is not a multiple of the number of channels: "
      << flat_data.size() << " % " << num_channels << " = "
      << flat_data.size() % num_channels << " for sequence index " << index;
  auto output = absl::make_unique<mediapipe::Matrix>(
      num_channels, flat_data.size() / num_channels);
  std::copy(flat_data.begin(), flat_data.end(), output->data());
  return output;
}

void AddAudioAsFeature(const std::string& prefix,
                       const mediapipe::Matrix& audio,
                       tensorflow::SequenceExample* sequence) {
  auto* value_list =
      MutableFeatureList(merge_prefix(prefix, kFeatureFloatsKey), sequence)
          ->add_feature()
          ->mutable_float_list()
          ->mutable_value();
  mediapipe::proto_ns::RepeatedField<float>(
      audio.data(), audio.data() + audio.rows() * audio.cols())
      .Swap(value_list);
}

absl::Status ReconcileMetadata(bool reconcile_bbox_annotations,
                               bool reconcile_region_annotations,
                               tensorflow::SequenceExample* sequence) {
  RET_CHECK_OK(ReconcileAnnotationIndicesByImageTimestamps(sequence));
  RET_CHECK_OK(ReconcileMetadataImages("", sequence));
  RET_CHECK_OK(ReconcileMetadataImages(kForwardFlowPrefix, sequence));
  RET_CHECK_OK(ReconcileMetadataImages(kClassSegmentationPrefix, sequence));
  RET_CHECK_OK(ReconcileMetadataImages(kInstanceSegmentationPrefix, sequence));
  RET_CHECK_OK(ReconcileMetadataFeatureFloats(sequence));
  if (reconcile_bbox_annotations) {
    RET_CHECK_OK(ReconcileMetadataBoxAnnotations("", sequence));
  }
  if (reconcile_region_annotations) {
    RET_CHECK_OK(ReconcileMetadataRegionAnnotations(sequence));
  }
  // Audio is always reconciled in the framework.
  return absl::OkStatus();
}

}  // namespace mediasequence
}  // namespace mediapipe
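
// Illustrative usage sketch (assumes a SequenceExample populated elsewhere
// with encoded frames, timestamps, and region annotations):
//
//   tensorflow::SequenceExample sequence = ...;  // filled elsewhere
//   absl::Status status = mediapipe::mediasequence::ReconcileMetadata(
//       /*reconcile_bbox_annotations=*/true,
//       /*reconcile_region_annotations=*/true, &sequence);
//
// This fills in image metadata, frame rates, and feature dimensions, and
// aligns region annotations to image timestamps as implemented above.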