// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mediapipe/util/sequence/media_sequence.h"

#include <cmath>
#include <limits>

#include "absl/strings/str_split.h"
#include "mediapipe/framework/port/opencv_imgcodecs_inc.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/util/sequence/media_sequence_util.h"

namespace mediapipe {
namespace mediasequence {

namespace {

// Decodes the image header to get metadata as strings and ints.
bool ImageMetadata(const std::string& image_str, std::string* format_string,
                   int* width, int* height, int* channels) {
  // Determine the image encoding by matching known header bytes.
  if (image_str[0] == static_cast<char>(0x89) && image_str[1] == 'P' &&
      image_str[2] == 'N' && image_str[3] == 'G') {
    *format_string = "PNG";
  } else if (image_str[0] == static_cast<char>(0xFF) &&
             image_str[1] == static_cast<char>(0xD8) &&
             image_str[image_str.size() - 2] == static_cast<char>(0xFF) &&
             image_str[image_str.size() - 1] == static_cast<char>(0xD9)) {
    *format_string = "JPEG";
  } else {
    *format_string = "UNKNOWN";
  }
  auto buf = reinterpret_cast<void*>(const_cast<char*>(image_str.data()));
  cv::Mat img = cv::imdecode(cv::Mat(/*rows=*/image_str.size(),
                                     /*cols=*/1, CV_8UC1, buf),
                             -1 /*cv::ImreadModes::IMREAD_UNCHANGED*/);
  if (img.data == nullptr) {
    return false;
  }
  *width = img.cols;
  *height = img.rows;
  *channels = img.channels();
  return true;
}
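
// Example (minimal sketch, assuming `encoded` holds the bytes of one encoded
// frame): the format comes from the magic bytes (PNG: 0x89 'P' 'N' 'G';
// JPEG: leading 0xFF 0xD8 and trailing 0xFF 0xD9) and the dimensions from
// cv::imdecode.
//   std::string format;
//   int width, height, channels;
//   if (ImageMetadata(encoded, &format, &width, &height, &channels)) {
//     // e.g. format == "JPEG", width/height/channels describe the frame.
//   }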

// Finds the nearest timestamp in a FeatureList of timestamps. The FeatureList
// must contain int64 values and only the first value at each step is used.
int NearestIndex(int64 timestamp,
                 const tensorflow::FeatureList& int64_feature_list) {
  int64 closest_distance = std::numeric_limits<int64>::max();
  int index = -1;
  for (int i = 0; i < int64_feature_list.feature_size(); ++i) {
    int64 current_value = int64_feature_list.feature(i).int64_list().value(0);
    int64 current_distance = std::abs(current_value - timestamp);
    if (current_distance < closest_distance) {
      index = i;
      closest_distance = current_distance;
    }
  }
  return index;
}
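
// Example: with timestamps {0, 40000, 80000} stored in the FeatureList,
// NearestIndex(41000, list) returns 1, since |41000 - 40000| is the smallest
// distance; an empty FeatureList returns -1.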

// Find the numerical sampling rate between two values in seconds if the input
// timestamps are in microseconds.
float TimestampsToRate(int64 first_timestamp, int64 second_timestamp) {
  int64 timestamp_diff = second_timestamp - first_timestamp;
  // Convert from microseconds to seconds.
  float rate = 1.0 / (static_cast<float>(timestamp_diff) / 1000000);
  return rate;
}
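
// Example: frames spaced 33333us apart give
// TimestampsToRate(0, 33333) ~= 30.0 frames per second.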

// Sets the values of "clip/number_of_frames", "clip/keyframe/index",
// "segment/start/index" and "segment/end/index" by finding the closest
// timestamps in the "image/timestamp" FeatureList if image timestamps are
// present.
absl::Status ReconcileAnnotationIndicesByImageTimestamps(
    tensorflow::SequenceExample* sequence) {
  if (GetImageTimestampSize(*sequence) == 0) {
    return absl::OkStatus();
  }
  int index;

  // clip/segment/index
  if (HasSegmentStartTimestamp(*sequence)) {
    int segment_size = GetSegmentStartTimestampSize(*sequence);
    RET_CHECK_EQ(GetSegmentEndTimestampSize(*sequence), segment_size)
        << "Expected an equal number of segment timestamps, but found "
        << "start: " << segment_size
        << ", end: " << GetSegmentEndTimestampSize(*sequence);

    std::vector<int64> start_indices;
    start_indices.reserve(segment_size);
    for (const int64& timestamp : GetSegmentStartTimestamp(*sequence)) {
      index = NearestIndex(timestamp,
                           GetFeatureList(*sequence, kImageTimestampKey));
      start_indices.push_back(index);
    }
    SetSegmentStartIndex(start_indices, sequence);

    std::vector<int64> end_indices;
    end_indices.reserve(segment_size);
    for (const int64& timestamp : GetSegmentEndTimestamp(*sequence)) {
      index = NearestIndex(timestamp,
                           GetFeatureList(*sequence, kImageTimestampKey));
      end_indices.push_back(index);
    }
    SetSegmentEndIndex(end_indices, sequence);
  }
  return absl::OkStatus();
}
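
// Example: if "image/timestamp" holds {0, 40000, 80000} and a segment spans
// 35000..85000us, the reconciled "segment/start/index" is 1 and
// "segment/end/index" is 2.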

// Sets the values of "image/format", "image/channels", "image/height",
// "image/width", and "image/frame_rate" based on image metadata and
// timestamps.
absl::Status ReconcileMetadataImages(const std::string& prefix,
                                     tensorflow::SequenceExample* sequence) {
  if (GetImageEncodedSize(prefix, *sequence) == 0) {
    return absl::OkStatus();
  }
  std::string format;
  int height, width, channels;
  RET_CHECK(ImageMetadata(GetImageEncodedAt(prefix, *sequence, 0), &format,
                          &width, &height, &channels))
      << "Failure to decode image metadata of image: "
      << GetImageEncodedAt(prefix, *sequence, 0);
  SetImageFormat(prefix, format, sequence);
  SetImageHeight(prefix, height, sequence);
  SetImageWidth(prefix, width, sequence);
  SetImageChannels(prefix, channels, sequence);

  if (GetImageTimestampSize(prefix, *sequence) > 1) {
    float rate = TimestampsToRate(GetImageTimestampAt(prefix, *sequence, 0),
                                  GetImageTimestampAt(prefix, *sequence, 1));
    SetImageFrameRate(prefix, rate, sequence);
  }
  return absl::OkStatus();
}
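
// Note: the frame rate is derived from the first two timestamps only, so a
// stream sampled at a 33333us spacing is recorded as ~30.0 regardless of any
// jitter in later frames.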

// Sets the values of "feature/${TAG}/dimensions", and
// "feature/${TAG}/frame_rate" for each float list feature TAG. If the
// dimensions are already present as a context feature, this method verifies
// the number of elements in the feature. Otherwise, it will write the
// dimensions as a 1D vector with the number of elements.
absl::Status ReconcileMetadataFeatureFloats(
    tensorflow::SequenceExample* sequence) {
  // Loop through all keys and see if they contain "/feature/floats".
  // If so, check dimensions and set rate.
  for (const auto& key_value : sequence->feature_lists().feature_list()) {
    const std::string& key = key_value.first;
    if (absl::StrContains(key, kFeatureFloatsKey)) {
      const auto prefix = key.substr(0, key.find(kFeatureFloatsKey) - 1);
      int number_of_elements = GetFeatureFloatsAt(prefix, *sequence, 0).size();
      if (HasFeatureDimensions(prefix, *sequence)) {
        int64 product = 1;
        for (int64 value : GetFeatureDimensions(prefix, *sequence)) {
          product *= value;
        }
        RET_CHECK_EQ(number_of_elements, product)
            << "The number of elements in float feature_list " << prefix
            << "/feature/floats does not match the dimensions: "
            << number_of_elements;
      } else {
        SetFeatureDimensions(prefix, {number_of_elements}, sequence);
      }

      if (GetFeatureTimestampSize(prefix, *sequence) > 1) {
        float rate =
            TimestampsToRate(GetFeatureTimestampAt(prefix, *sequence, 0),
                             GetFeatureTimestampAt(prefix, *sequence, 1));
        SetFeatureRate(prefix, rate, sequence);
      }
    }
  }
  return absl::OkStatus();
}
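
// Example (minimal sketch): for a feature list keyed "AUDIO/feature/floats"
// whose first step holds 128 values, an existing "AUDIO/feature/dimensions"
// context of {2, 64} passes the check (2 * 64 == 128); with no dimensions
// present, {128} is written as a 1D dimension instead.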

// Go through all bounding box annotations and move the annotation to the
// nearest image frame with a timestamp. If timestamps are not present, does
// nothing. If two or more annotations are closest to the same frame, then only
// the closest annotation is saved. This matches the behavior of downsampling
// image streams in time.
absl::Status ReconcileMetadataBoxAnnotations(
    const std::string& prefix, tensorflow::SequenceExample* sequence) {
  int num_bboxes = GetBBoxTimestampSize(prefix, *sequence);
  int num_frames = GetImageTimestampSize(*sequence);
  if (num_bboxes && num_frames) {
    // If no one has indicated which frames are annotated, assume annotations
    // are dense.
    if (GetBBoxIsAnnotatedSize(prefix, *sequence) == 0) {
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxIsAnnotated(prefix, true, sequence);
      }
    }
    RET_CHECK_EQ(num_bboxes, GetBBoxIsAnnotatedSize(prefix, *sequence))
        << "Expected number of BBox timestamps and annotation marks to match.";
    // Update num_bboxes.
    if (GetBBoxSize(prefix, *sequence) > 0) {
      std::string xmin_key = merge_prefix(prefix, kRegionBBoxXMinKey);
      auto* bbox_feature_list = MutableFeatureList(xmin_key, sequence);
      RET_CHECK_EQ(num_bboxes, bbox_feature_list->feature_size())
          << "Expected number of BBox timestamps and boxes to match.";
      ClearBBoxNumRegions(prefix, sequence);
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxNumRegions(
            prefix, bbox_feature_list->feature(i).float_list().value_size(),
            sequence);
      }
    }
    if (GetPointSize(prefix, *sequence) > 0) {
      std::string x_key = merge_prefix(prefix, kRegionPointXKey);
      auto* region_feature_list = MutableFeatureList(x_key, sequence);
      RET_CHECK_EQ(num_bboxes, region_feature_list->feature_size())
          << "Expected number of BBox timestamps and boxes to match.";
      ClearBBoxNumRegions(prefix, sequence);
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxNumRegions(
            prefix, region_feature_list->feature(i).float_list().value_size(),
            sequence);
      }
    }
    if (Get3dPointSize(prefix, *sequence) > 0) {
      std::string x_key = merge_prefix(prefix, kRegion3dPointXKey);
      auto* region_feature_list = MutableFeatureList(x_key, sequence);
      RET_CHECK_EQ(num_bboxes, region_feature_list->feature_size())
          << "Expected number of BBox timestamps and boxes to match.";
      ClearBBoxNumRegions(prefix, sequence);
      for (int i = 0; i < num_bboxes; ++i) {
        AddBBoxNumRegions(
            prefix, region_feature_list->feature(i).float_list().value_size(),
            sequence);
      }
    }
    // Collect which timestamps currently match to which indices in timestamps.
    // Skip empty timestamps.
    // Requires sorted indices.
    ::std::vector<int64> box_timestamps(num_bboxes);
    int bbox_index = 0;
    std::string timestamp_key = merge_prefix(prefix, kRegionTimestampKey);
    for (auto& feature : GetFeatureList(*sequence, timestamp_key).feature()) {
      box_timestamps[bbox_index] = feature.int64_list().value(0);
      ++bbox_index;
    }
    ::std::vector<int32> box_is_annotated(num_bboxes);
    bbox_index = 0;
    std::string is_annotated_key = merge_prefix(prefix, kRegionIsAnnotatedKey);
    for (auto& feature :
         GetFeatureList(*sequence, is_annotated_key).feature()) {
      box_is_annotated[bbox_index] = feature.int64_list().value(0);
      ++bbox_index;
    }
    ::std::vector<int64> image_timestamps(num_frames);
    int frame_index = 0;
    for (auto& feature :
         GetFeatureList(*sequence, kImageTimestampKey).feature()) {
      image_timestamps[frame_index] = feature.int64_list().value(0);
      ++frame_index;
    }
    // Collect which bbox timestamps are closest to which image indices.
    ::std::vector<int> bbox_index_if_annotated(num_frames, -1);
    int box_index = 0;
    int image_index = 0;
    while (box_index < num_bboxes) {
      // Leave unannotated boxes at -1.
      if (!box_is_annotated[box_index]) {
        box_index += 1;
        // Annotated boxes should update their closest index.
      } else if (image_index >= num_frames - 1 ||
                 llabs(image_timestamps[image_index] -
                       box_timestamps[box_index]) <
                     llabs(image_timestamps[image_index + 1] -
                           box_timestamps[box_index])) {
        // Only overwrite with a new value if no value is present or this is
        // closer in time.
        if (bbox_index_if_annotated[image_index] == -1 ||
            llabs(image_timestamps[image_index] -
                  box_timestamps[bbox_index_if_annotated[image_index]]) >
                llabs(image_timestamps[image_index] -
                      box_timestamps[box_index])) {
          bbox_index_if_annotated[image_index] = box_index;
        }
        box_index += 1;
      } else {
        image_index += 1;
      }
    }
    // Only update unmodified bbox timestamp if it doesn't exist to prevent
    // overwriting with modified values.
    if (!GetUnmodifiedBBoxTimestampSize(prefix, *sequence)) {
      for (int i = 0; i < num_frames; ++i) {
        const int bbox_index = bbox_index_if_annotated[i];
        if (bbox_index >= 0 &&
            GetBBoxIsAnnotatedAt(prefix, *sequence, bbox_index)) {
          AddUnmodifiedBBoxTimestamp(prefix, box_timestamps[bbox_index],
                                     sequence);
        }
      }
    }
    // Store some new feature_lists in a temporary sequence.
    std::string expected_prefix = merge_prefix(prefix, "region/");
    ::tensorflow::SequenceExample tmp_seq;
    for (const auto& key_value : sequence->feature_lists().feature_list()) {
      const std::string& key = key_value.first;
      if (::absl::StartsWith(key, expected_prefix)) {
        // Create a new set of values and swap them in.
        tmp_seq.Clear();
        auto* old_feature_list = MutableFeatureList(key, sequence);
        auto* new_feature_list = MutableFeatureList(key, &tmp_seq);
        if (key != merge_prefix(prefix, kUnmodifiedRegionTimestampKey)) {
          RET_CHECK_EQ(num_bboxes, old_feature_list->feature().size())
              << "Expected number of BBox timestamps to match number of "
                 "entries "
              << "in " << key;
          for (int i = 0; i < num_frames; ++i) {
            if (bbox_index_if_annotated[i] >= 0) {
              if (key == merge_prefix(prefix, kRegionTimestampKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(image_timestamps[i]);
              } else {
                *new_feature_list->add_feature() =
                    old_feature_list->feature(bbox_index_if_annotated[i]);
              }
            } else {
              // Add either a default value or an empty.
              if (key == merge_prefix(prefix, kRegionIsAnnotatedKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(0);
              } else if (key == merge_prefix(prefix, kRegionNumRegionsKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(0);
              } else if (key == merge_prefix(prefix, kRegionTimestampKey)) {
                new_feature_list->add_feature()
                    ->mutable_int64_list()
                    ->add_value(image_timestamps[i]);
              } else {
                new_feature_list->add_feature();  // Adds an empty.
              }
            }
          }
          *old_feature_list = *new_feature_list;
        }
      }
    }
  }
  return absl::OkStatus();
}
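
// Example of the two-pointer walk above: with image timestamps {0, 40000} and
// annotated box timestamps {30000, 39000}, both boxes map to frame 1; only
// box 1 (39000us, the closer one) is kept there, and frame 0 is marked
// unannotated with default/empty entries, matching the downsampling behavior
// described in the function comment.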

absl::Status ReconcileMetadataRegionAnnotations(
    tensorflow::SequenceExample* sequence) {
  // Copy keys for fixed iteration order while updating feature_lists.
  std::vector<const std::string*> key_ptrs;
  for (const auto& key_value : sequence->feature_lists().feature_list()) {
    key_ptrs.push_back(&key_value.first);
  }
  for (const std::string* key_ptr : key_ptrs) {
    const std::string& key = *key_ptr;
    if (::absl::StrContains(key, kRegionTimestampKey)) {
      std::string prefix =
          key.substr(0, key.size() - sizeof(kRegionTimestampKey));
      if (key == kRegionTimestampKey) {
        prefix = "";
      }
      RET_CHECK_OK(ReconcileMetadataBoxAnnotations(prefix, sequence));
    }
  }
  return absl::OkStatus();
}
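
// Example of the prefix extraction (a sketch, assuming kRegionTimestampKey is
// the char-array constant "region/timestamp"): sizeof includes the trailing
// NUL, so for key "PERSON/region/timestamp" the substr drops the suffix plus
// its '/' separator, leaving prefix "PERSON".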

}  // namespace

int GetBBoxSize(const std::string& prefix,
                const tensorflow::SequenceExample& sequence) {
  return GetBBoxXMinSize(prefix, sequence);
}

std::vector<::mediapipe::Location> GetBBoxAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  std::vector<::mediapipe::Location> bboxes;
  const auto& xmins = GetBBoxXMinAt(prefix, sequence, index);
  const auto& ymins = GetBBoxYMinAt(prefix, sequence, index);
  const auto& xmaxs = GetBBoxXMaxAt(prefix, sequence, index);
  const auto& ymaxs = GetBBoxYMaxAt(prefix, sequence, index);
  bboxes.reserve(xmins.size());
  for (int i = 0; i < xmins.size(); ++i) {
    bboxes.push_back(::mediapipe::Location::CreateRelativeBBoxLocation(
        xmins[i], ymins[i], xmaxs[i] - xmins[i], ymaxs[i] - ymins[i]));
  }
  return bboxes;
}
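
// Example: stored corners xmin=0.1, ymin=0.2, xmax=0.5, ymax=0.8 become a
// relative bbox anchored at (0.1, 0.2) with width 0.4 and height 0.6, since
// CreateRelativeBBoxLocation takes (xmin, ymin, width, height).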

void AddBBox(const std::string& prefix,
             const std::vector<::mediapipe::Location>& bboxes,
             tensorflow::SequenceExample* sequence) {
  ::std::vector<float> xmins;
  ::std::vector<float> ymins;
  ::std::vector<float> xmaxs;
  ::std::vector<float> ymaxs;
  for (auto& bbox : bboxes) {
    const auto& rect = bbox.GetRelativeBBox();
    xmins.push_back(rect.xmin());
    ymins.push_back(rect.ymin());
    xmaxs.push_back(rect.xmax());
    ymaxs.push_back(rect.ymax());
  }
  AddBBoxXMin(prefix, xmins, sequence);
  AddBBoxYMin(prefix, ymins, sequence);
  AddBBoxXMax(prefix, xmaxs, sequence);
  AddBBoxYMax(prefix, ymaxs, sequence);
}

void ClearBBox(const std::string& prefix,
               tensorflow::SequenceExample* sequence) {
  ClearBBoxXMin(prefix, sequence);
  ClearBBoxYMin(prefix, sequence);
  ClearBBoxXMax(prefix, sequence);
  ClearBBoxYMax(prefix, sequence);
}

int GetPointSize(const std::string& prefix,
                 const tensorflow::SequenceExample& sequence) {
  return GetBBoxPointXSize(prefix, sequence);
}

std::vector<::std::pair<float, float>> GetPointAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  const auto& ys = GetBBoxPointYAt(prefix, sequence, index);
  const auto& xs = GetBBoxPointXAt(prefix, sequence, index);
  std::vector<::std::pair<float, float>> points(ys.size());
  for (int i = 0; i < xs.size(); ++i) {
    points[i].first = ys[i];
    points[i].second = xs[i];
  }
  return points;
}
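
// Note: points round-trip through AddPoint/GetPointAt as (y, x) pairs:
// point.first is the y coordinate and point.second is the x coordinate.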

void AddPoint(const std::string& prefix,
              const std::vector<::std::pair<float, float>>& points,
              tensorflow::SequenceExample* sequence) {
  ::std::vector<float> xs;
  ::std::vector<float> ys;
  for (auto& point : points) {
    ys.push_back(point.first);
    xs.push_back(point.second);
  }
  AddBBoxPointY(prefix, ys, sequence);
  AddBBoxPointX(prefix, xs, sequence);
}

void ClearPoint(const std::string& prefix,
                tensorflow::SequenceExample* sequence) {
  ClearBBoxPointY(prefix, sequence);
  ClearBBoxPointX(prefix, sequence);
}

int Get3dPointSize(const std::string& prefix,
                   const tensorflow::SequenceExample& sequence) {
  return GetBBox3dPointXSize(prefix, sequence);
}

std::vector<::std::tuple<float, float, float>> Get3dPointAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  const auto& xs = GetBBox3dPointXAt(prefix, sequence, index);
  const auto& ys = GetBBox3dPointYAt(prefix, sequence, index);
  const auto& zs = GetBBox3dPointZAt(prefix, sequence, index);
  std::vector<::std::tuple<float, float, float>> points(ys.size());
  for (int i = 0; i < xs.size(); ++i) {
    points[i] = std::make_tuple(xs[i], ys[i], zs[i]);
  }
  return points;
}

void Add3dPoint(const std::string& prefix,
                const std::vector<::std::tuple<float, float, float>>& points,
                tensorflow::SequenceExample* sequence) {
  ::std::vector<float> xs;
  ::std::vector<float> ys;
  ::std::vector<float> zs;
  for (auto& point : points) {
    xs.push_back(std::get<0>(point));
    ys.push_back(std::get<1>(point));
    zs.push_back(std::get<2>(point));
  }
  AddBBox3dPointX(prefix, xs, sequence);
  AddBBox3dPointY(prefix, ys, sequence);
  AddBBox3dPointZ(prefix, zs, sequence);
}

void Clear3dPoint(const std::string& prefix,
                  tensorflow::SequenceExample* sequence) {
  ClearBBox3dPointX(prefix, sequence);
  ClearBBox3dPointY(prefix, sequence);
  ClearBBox3dPointZ(prefix, sequence);
}

std::unique_ptr<mediapipe::Matrix> GetAudioFromFeatureAt(
    const std::string& prefix, const tensorflow::SequenceExample& sequence,
    int index) {
  const auto& flat_data = GetFeatureFloatsAt(prefix, sequence, index);
  CHECK(HasFeatureNumChannels(prefix, sequence))
      << "GetAudioAt requires num_channels context to be specified as key: "
      << merge_prefix(prefix, kFeatureNumChannelsKey);
  int num_channels = GetFeatureNumChannels(prefix, sequence);
  CHECK_EQ(flat_data.size() % num_channels, 0)
      << "The data size is not a multiple of the number of channels: "
      << flat_data.size() << " % " << num_channels << " = "
      << flat_data.size() % num_channels << " for sequence index " << index;
  auto output = absl::make_unique<mediapipe::Matrix>(
      num_channels, flat_data.size() / num_channels);
  std::copy(flat_data.begin(), flat_data.end(), output->data());
  return output;
}
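
// Example (a sketch, assuming mediapipe::Matrix uses Eigen's default
// column-major layout): a flat feature of 6 floats with num_channels == 2
// yields a 2 x 3 Matrix, where each consecutive pair of floats fills one
// column, i.e. one time sample across channels. AddAudioAsFeature below
// writes the same flat layout, so the two functions round-trip.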

void AddAudioAsFeature(const std::string& prefix,
                       const mediapipe::Matrix& audio,
                       tensorflow::SequenceExample* sequence) {
  auto* value_list =
      MutableFeatureList(merge_prefix(prefix, kFeatureFloatsKey), sequence)
          ->add_feature()
          ->mutable_float_list()
          ->mutable_value();
  mediapipe::proto_ns::RepeatedField<float>(
      audio.data(), audio.data() + audio.rows() * audio.cols())
      .Swap(value_list);
}

absl::Status ReconcileMetadata(bool reconcile_bbox_annotations,
                               bool reconcile_region_annotations,
                               tensorflow::SequenceExample* sequence) {
  RET_CHECK_OK(ReconcileAnnotationIndicesByImageTimestamps(sequence));
  RET_CHECK_OK(ReconcileMetadataImages("", sequence));
  RET_CHECK_OK(ReconcileMetadataImages(kForwardFlowPrefix, sequence));
  RET_CHECK_OK(ReconcileMetadataImages(kClassSegmentationPrefix, sequence));
  RET_CHECK_OK(ReconcileMetadataImages(kInstanceSegmentationPrefix, sequence));
  RET_CHECK_OK(ReconcileMetadataFeatureFloats(sequence));
  if (reconcile_bbox_annotations) {
    RET_CHECK_OK(ReconcileMetadataBoxAnnotations("", sequence));
  }
  if (reconcile_region_annotations) {
    RET_CHECK_OK(ReconcileMetadataRegionAnnotations(sequence));
  }
  // Audio is always reconciled in the framework.
  return absl::OkStatus();
}
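
// Example (minimal sketch): typical use after populating a SequenceExample.
//   tensorflow::SequenceExample sequence;
//   ...  // fill in encoded frames, timestamps, boxes, and features.
//   absl::Status status =
//       ReconcileMetadata(/*reconcile_bbox_annotations=*/true,
//                         /*reconcile_region_annotations=*/true, &sequence);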

}  // namespace mediasequence
}  // namespace mediapipe