// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <fstream>
#include <memory>
#include <string>
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "mediapipe/calculators/video/motion_analysis_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/image_frame.h"
#include "mediapipe/framework/formats/image_frame_opencv.h"
#include "mediapipe/framework/formats/video_stream_header.h"
#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/tracking/camera_motion.h"
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/frame_selection.pb.h"
#include "mediapipe/util/tracking/motion_analysis.h"
#include "mediapipe/util/tracking/motion_estimation.h"
#include "mediapipe/util/tracking/motion_models.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
namespace mediapipe {
constexpr char kDownsampleTag[] = "DOWNSAMPLE";
constexpr char kCsvFileTag[] = "CSV_FILE";
constexpr char kGrayVideoOutTag[] = "GRAY_VIDEO_OUT";
constexpr char kVideoOutTag[] = "VIDEO_OUT";
constexpr char kDenseFgTag[] = "DENSE_FG";
constexpr char kVizTag[] = "VIZ";
constexpr char kSaliencyTag[] = "SALIENCY";
constexpr char kCameraTag[] = "CAMERA";
constexpr char kFlowTag[] = "FLOW";
constexpr char kSelectionTag[] = "SELECTION";
constexpr char kVideoTag[] = "VIDEO";
using mediapipe::AffineAdapter;
using mediapipe::CameraMotion;
using mediapipe::FrameSelectionResult;
using mediapipe::Homography;
using mediapipe::HomographyAdapter;
using mediapipe::LinearSimilarityModel;
using mediapipe::MixtureHomography;
using mediapipe::MixtureRowWeights;
using mediapipe::MotionAnalysis;
using mediapipe::ProjectViaFit;
using mediapipe::RegionFlowComputationOptions;
using mediapipe::RegionFlowFeatureList;
using mediapipe::SalientPointFrame;
using mediapipe::TranslationModel;
const char kOptionsTag[] = "OPTIONS";
// A calculator that performs motion analysis on an incoming video stream.
//
// Input streams (at least one is required):
// VIDEO: The input video stream (ImageFrame, sRGB, sRGBA or GRAY8).
// SELECTION: Optional input stream to perform analysis only on selected
// frames. If present, each packet needs to contain both camera
// motion and features.
//
// Input side packets:
// CSV_FILE: Read motion models as homographies from a CSV file. Expected
// to be defined in the frame domain (un-normalized).
// Should store 9 floats per row; see the example rows below.
// Specify the number of homographies per frame via the option
// meta_models_per_frame. For values > 1, MixtureHomographies
// are created; for value == 1, a single Homography is used.
// DOWNSAMPLE: Optionally specify the downsampling factor via an input side
// packet, overriding the value in the graph options.
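//
// Example CSV contents (a hypothetical sketch for two frames with
// meta_models_per_frame == 1; values are illustrative, one 3x3 homography
// in row-major order per row):
//   1,0,0,0,1,0,0,0,1
//   1.02,0,5.1,0,1.02,3.7,0,0,1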
// Output streams (all are optional):
// FLOW: Sparse feature tracks in form of proto RegionFlowFeatureList.
// CAMERA: Camera motion as proto CameraMotion describing the per frame-
// pair motion. Has VideoHeader from input video.
// SALIENCY: Foreground saliency (objects moving different from the
// background) as proto SalientPointFrame.
// VIZ: Visualization stream as ImageFrame, sRGB, visualizing
// features and saliency (set via
// analysis_options().visualization_options())
// DENSE_FG: Dense foreground stream, describing per-pixel foreground-
// ness as confidence between 0 (background) and 255
// (foreground). Output is ImageFrame (GRAY8).
// VIDEO_OUT: Optional output stream when SELECTION is used. Output is input
// VIDEO at the selected frames. Requires VIDEO to be present.
// GRAY_VIDEO_OUT: Optional output stream for downsampled, grayscale video.
// Requires VIDEO to be present and SELECTION to not be used.
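//
// Example graph node (a minimal sketch; stream names and the options
// extension path are illustrative):
//   node {
//     calculator: "MotionAnalysisCalculator"
//     input_stream: "VIDEO:input_video"
//     output_stream: "FLOW:region_flow"
//     output_stream: "CAMERA:camera_motion"
//     options {
//       [mediapipe.MotionAnalysisCalculatorOptions.ext] {
//         meta_models_per_frame: 1
//       }
//     }
//   }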
class MotionAnalysisCalculator : public CalculatorBase {
// TODO: Activate once leakr approval is ready.
// typedef com::google::android::libraries::micro::proto::Data HomographyData;
public:
~MotionAnalysisCalculator() override = default;
static absl::Status GetContract(CalculatorContract* cc);
absl::Status Open(CalculatorContext* cc) override;
absl::Status Process(CalculatorContext* cc) override;
absl::Status Close(CalculatorContext* cc) override;
private:
// Outputs results to Outputs() if MotionAnalysis has buffered sufficient
// results; otherwise a no-op. Set flush to true to force output of all
// buffered data.
void OutputMotionAnalyzedFrames(bool flush, CalculatorContext* cc);
// Lazy init function to be called on Process.
absl::Status InitOnProcess(InputStream* video_stream,
InputStream* selection_stream);
// Parses CSV file contents to homographies.
bool ParseModelCSV(const std::string& contents,
std::deque<Homography>* homographies);
// Turns list of 9-tuple floating values into set of homographies.
bool HomographiesFromValues(const std::vector<float>& homog_values,
std::deque<Homography>* homographies);
// Appends CameraMotions and features from homographies.
// Set append_identity to true to add an identity transform to the beginning
// of each list *in addition* to the motions derived from homographies.
void AppendCameraMotionsFromHomographies(
const std::deque<Homography>& homographies, bool append_identity,
std::deque<CameraMotion>* camera_motions,
std::deque<RegionFlowFeatureList>* features);
// Helper function to subtract current metadata motion from features. Used
// for hybrid estimation case.
void SubtractMetaMotion(const CameraMotion& meta_motion,
RegionFlowFeatureList* features);
// Inverse of above function to add back meta motion and replace
// feature location with originals after estimation.
void AddMetaMotion(const CameraMotion& meta_motion,
const RegionFlowFeatureList& meta_features,
RegionFlowFeatureList* features, CameraMotion* motion);
MotionAnalysisCalculatorOptions options_;
int frame_width_ = -1;
int frame_height_ = -1;
int frame_idx_ = 0;
// Buffers incoming video frame packets (if visualization output is requested)
std::vector<Packet> packet_buffer_;
// Buffers incoming timestamps until MotionAnalysis is ready to output via
// above OutputMotionAnalyzedFrames.
std::vector<Timestamp> timestamp_buffer_;
// Input indicators for each stream.
bool selection_input_ = false;
bool video_input_ = false;
// Output indicators for each stream.
bool region_flow_feature_output_ = false;
bool camera_motion_output_ = false;
bool saliency_output_ = false;
bool visualize_output_ = false;
bool dense_foreground_output_ = false;
bool video_output_ = false;
bool grayscale_output_ = false;
bool csv_file_input_ = false;
// Indicates if saliency should be computed.
bool with_saliency_ = false;
// Set if hybrid meta analysis - see proto for details.
bool hybrid_meta_analysis_ = false;
// Concatenated motions for each selected frame. Used in case
// hybrid estimation is requested to fallback to valid models.
std::deque<CameraMotion> selected_motions_;
// Normalized homographies from CSV file or metadata.
std::deque<Homography> meta_homographies_;
std::deque<CameraMotion> meta_motions_;
std::deque<RegionFlowFeatureList> meta_features_;
// Offset into the above meta_motions_ and meta_features_ when using
// hybrid meta analysis.
int hybrid_meta_offset_ = 0;
std::unique_ptr<MotionAnalysis> motion_analysis_;
std::unique_ptr<MixtureRowWeights> row_weights_;
};
REGISTER_CALCULATOR(MotionAnalysisCalculator);
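// A hedged C++ driver sketch for the example node above (illustrative only;
// "input_video" is an assumed stream name, and config/frame/time_usec are
// placeholders supplied by the caller):
//
//   CalculatorGraph graph;
//   MP_RETURN_IF_ERROR(graph.Initialize(config));
//   MP_RETURN_IF_ERROR(graph.StartRun({}));
//   MP_RETURN_IF_ERROR(graph.AddPacketToInputStream(
//       "input_video", Adopt(frame.release()).At(Timestamp(time_usec))));
//   MP_RETURN_IF_ERROR(graph.CloseInputStream("input_video"));
//   MP_RETURN_IF_ERROR(graph.WaitUntilDone());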
absl::Status MotionAnalysisCalculator::GetContract(CalculatorContract* cc) {
if (cc->Inputs().HasTag(kVideoTag)) {
cc->Inputs().Tag(kVideoTag).Set<ImageFrame>();
}
// Optional input stream from frame selection calculator.
if (cc->Inputs().HasTag(kSelectionTag)) {
cc->Inputs().Tag(kSelectionTag).Set<FrameSelectionResult>();
}
RET_CHECK(cc->Inputs().HasTag(kVideoTag) ||
cc->Inputs().HasTag(kSelectionTag))
<< "Either VIDEO, SELECTION must be specified.";
if (cc->Outputs().HasTag(kFlowTag)) {
cc->Outputs().Tag(kFlowTag).Set<RegionFlowFeatureList>();
}
if (cc->Outputs().HasTag(kCameraTag)) {
cc->Outputs().Tag(kCameraTag).Set<CameraMotion>();
}
if (cc->Outputs().HasTag(kSaliencyTag)) {
cc->Outputs().Tag(kSaliencyTag).Set<SalientPointFrame>();
}
if (cc->Outputs().HasTag(kVizTag)) {
cc->Outputs().Tag(kVizTag).Set<ImageFrame>();
}
if (cc->Outputs().HasTag(kDenseFgTag)) {
cc->Outputs().Tag(kDenseFgTag).Set<ImageFrame>();
}
if (cc->Outputs().HasTag(kVideoOutTag)) {
cc->Outputs().Tag(kVideoOutTag).Set<ImageFrame>();
}
if (cc->Outputs().HasTag(kGrayVideoOutTag)) {
// We only output grayscale video if we're actually performing full region-
// flow analysis on the video.
RET_CHECK(cc->Inputs().HasTag(kVideoTag) &&
!cc->Inputs().HasTag(kSelectionTag));
cc->Outputs().Tag(kGrayVideoOutTag).Set<ImageFrame>();
}
if (cc->InputSidePackets().HasTag(kCsvFileTag)) {
cc->InputSidePackets().Tag(kCsvFileTag).Set<std::string>();
}
if (cc->InputSidePackets().HasTag(kDownsampleTag)) {
cc->InputSidePackets().Tag(kDownsampleTag).Set<float>();
}
if (cc->InputSidePackets().HasTag(kOptionsTag)) {
cc->InputSidePackets().Tag(kOptionsTag).Set<CalculatorOptions>();
}
return absl::OkStatus();
}
absl::Status MotionAnalysisCalculator::Open(CalculatorContext* cc) {
options_ =
tool::RetrieveOptions(cc->Options<MotionAnalysisCalculatorOptions>(),
cc->InputSidePackets(), kOptionsTag);
video_input_ = cc->Inputs().HasTag(kVideoTag);
selection_input_ = cc->Inputs().HasTag(kSelectionTag);
region_flow_feature_output_ = cc->Outputs().HasTag(kFlowTag);
camera_motion_output_ = cc->Outputs().HasTag(kCameraTag);
saliency_output_ = cc->Outputs().HasTag(kSaliencyTag);
visualize_output_ = cc->Outputs().HasTag(kVizTag);
dense_foreground_output_ = cc->Outputs().HasTag(kDenseFgTag);
video_output_ = cc->Outputs().HasTag(kVideoOutTag);
grayscale_output_ = cc->Outputs().HasTag(kGrayVideoOutTag);
csv_file_input_ = cc->InputSidePackets().HasTag(kCsvFileTag);
hybrid_meta_analysis_ = options_.meta_analysis() ==
MotionAnalysisCalculatorOptions::META_ANALYSIS_HYBRID;
if (video_output_) {
RET_CHECK(selection_input_) << "VIDEO_OUT requires SELECTION input";
}
if (selection_input_) {
switch (options_.selection_analysis()) {
case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION:
RET_CHECK(!visualize_output_)
<< "Visualization not supported for NO_ANALYSIS_USE_SELECTION";
RET_CHECK(!dense_foreground_output_)
<< "Dense foreground not supported for NO_ANALYSIS_USE_SELECTION";
RET_CHECK(!saliency_output_)
<< "Saliency output not supported for NO_ANALYSIS_USE_SELECTION";
break;
case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE:
case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED:
RET_CHECK(video_input_) << "Need video input for feature tracking.";
break;
case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES:
// Nothing to add here.
break;
}
}
if (visualize_output_ || dense_foreground_output_ || video_output_) {
RET_CHECK(video_input_) << "Video input required.";
}
if (csv_file_input_) {
RET_CHECK(!selection_input_)
<< "Cannot use SELECTION input with CSV input.";
if (!hybrid_meta_analysis_) {
RET_CHECK(!saliency_output_ && !visualize_output_ &&
!dense_foreground_output_ && !grayscale_output_)
<< "CSV file and meta input only supports flow and camera motion "
<< "output when using metadata only.";
}
}
if (csv_file_input_) {
// Read from file and parse.
const std::string filename =
cc->InputSidePackets().Tag(kCsvFileTag).Get<std::string>();
std::string file_contents;
std::ifstream input_file(filename, std::ios::in);
RET_CHECK(input_file.is_open()) << "Could not open CSV file: " << filename;
input_file.seekg(0, std::ios::end);
const int file_length = input_file.tellg();
file_contents.resize(file_length);
input_file.seekg(0, std::ios::beg);
input_file.read(&file_contents[0], file_length);
input_file.close();
RET_CHECK(ParseModelCSV(file_contents, &meta_homographies_))
<< "Could not parse CSV file";
}
// Get video header from video or selection input if present.
const VideoHeader* video_header = nullptr;
if (video_input_ && !cc->Inputs().Tag(kVideoTag).Header().IsEmpty()) {
video_header = &(cc->Inputs().Tag(kVideoTag).Header().Get<VideoHeader>());
} else if (selection_input_ &&
!cc->Inputs().Tag(kSelectionTag).Header().IsEmpty()) {
video_header =
&(cc->Inputs().Tag(kSelectionTag).Header().Get<VideoHeader>());
} else {
LOG(WARNING) << "No input video header found. Downstream calculators "
"expecting video headers are likely to fail.";
}
with_saliency_ = options_.analysis_options().compute_motion_saliency();
// Force computation of saliency if requested as output.
if (cc->Outputs().HasTag(kSaliencyTag)) {
with_saliency_ = true;
if (!options_.analysis_options().compute_motion_saliency()) {
LOG(WARNING) << "Enable saliency computation. Set "
<< "compute_motion_saliency to true to silence this "
<< "warning.";
options_.mutable_analysis_options()->set_compute_motion_saliency(true);
}
}
if (options_.bypass_mode()) {
cc->SetOffset(TimestampDiff(0));
}
if (cc->InputSidePackets().HasTag(kDownsampleTag)) {
options_.mutable_analysis_options()
->mutable_flow_options()
->set_downsample_factor(
cc->InputSidePackets().Tag(kDownsampleTag).Get<float>());
}
// If no video header is provided, just return and initialize on the first
// Process() call.
if (video_header == nullptr) {
return absl::OkStatus();
}
////////////// EARLY RETURN; ONLY HEADER OUTPUT SHOULD GO HERE ///////////////
if (visualize_output_) {
cc->Outputs().Tag(kVizTag).SetHeader(Adopt(new VideoHeader(*video_header)));
}
if (video_output_) {
cc->Outputs()
.Tag(kVideoOutTag)
.SetHeader(Adopt(new VideoHeader(*video_header)));
}
if (cc->Outputs().HasTag(kDenseFgTag)) {
std::unique_ptr<VideoHeader> foreground_header(
new VideoHeader(*video_header));
foreground_header->format = ImageFormat::GRAY8;
cc->Outputs()
.Tag(kDenseFgTag)
.SetHeader(Adopt(foreground_header.release()));
}
if (cc->Outputs().HasTag(kCameraTag)) {
cc->Outputs()
.Tag(kCameraTag)
.SetHeader(Adopt(new VideoHeader(*video_header)));
}
if (cc->Outputs().HasTag(kSaliencyTag)) {
cc->Outputs()
.Tag(kSaliencyTag)
.SetHeader(Adopt(new VideoHeader(*video_header)));
}
return absl::OkStatus();
}
absl::Status MotionAnalysisCalculator::Process(CalculatorContext* cc) {
if (options_.bypass_mode()) {
return absl::OkStatus();
}
InputStream* video_stream =
video_input_ ? &(cc->Inputs().Tag(kVideoTag)) : nullptr;
InputStream* selection_stream =
selection_input_ ? &(cc->Inputs().Tag(kSelectionTag)) : nullptr;
// Checked on Open.
CHECK(video_stream || selection_stream);
// Lazy init.
if (frame_width_ < 0 || frame_height_ < 0) {
MP_RETURN_IF_ERROR(InitOnProcess(video_stream, selection_stream));
}
const Timestamp timestamp = cc->InputTimestamp();
if (csv_file_input_ && !hybrid_meta_analysis_) {
if (camera_motion_output_) {
RET_CHECK(!meta_motions_.empty()) << "Insufficient metadata.";
CameraMotion output_motion = meta_motions_.front();
meta_motions_.pop_front();
output_motion.set_timestamp_usec(timestamp.Value());
cc->Outputs()
.Tag(kCameraTag)
.Add(new CameraMotion(output_motion), timestamp);
}
if (region_flow_feature_output_) {
RET_CHECK(!meta_features_.empty()) << "Insufficient frames in CSV file";
RegionFlowFeatureList output_features = meta_features_.front();
meta_features_.pop_front();
output_features.set_timestamp_usec(timestamp.Value());
cc->Outputs().Tag(kFlowTag).Add(
new RegionFlowFeatureList(output_features), timestamp);
}
++frame_idx_;
return absl::OkStatus();
}
if (motion_analysis_ == nullptr) {
// Lazily create MotionAnalysis; the metadata-only path above returns
// before reaching this point, so it is never needed there.
motion_analysis_.reset(new MotionAnalysis(options_.analysis_options(),
frame_width_, frame_height_));
}
std::unique_ptr<FrameSelectionResult> frame_selection_result;
// Always use frame if selection is not activated.
bool use_frame = !selection_input_;
if (selection_input_) {
CHECK(selection_stream);
// Fill in timestamps we process.
if (!selection_stream->Value().IsEmpty()) {
ASSIGN_OR_RETURN(
frame_selection_result,
selection_stream->Value().ConsumeOrCopy<FrameSelectionResult>());
use_frame = true;
// Make sure both features and camera motion are present.
RET_CHECK(frame_selection_result->has_camera_motion() &&
frame_selection_result->has_features())
<< "Frame selection input error at: " << timestamp
<< " both camera motion and features need to be "
"present in FrameSelectionResult. "
<< frame_selection_result->has_camera_motion() << " , "
<< frame_selection_result->has_features();
}
}
if (selection_input_ && use_frame &&
options_.selection_analysis() ==
MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION) {
// Output concatenated results, nothing to compute here.
if (camera_motion_output_) {
cc->Outputs()
.Tag(kCameraTag)
.Add(frame_selection_result->release_camera_motion(), timestamp);
}
if (region_flow_feature_output_) {
cc->Outputs().Tag(kFlowTag).Add(
frame_selection_result->release_features(), timestamp);
}
if (video_output_) {
cc->Outputs().Tag(kVideoOutTag).AddPacket(video_stream->Value());
}
return absl::OkStatus();
}
if (use_frame) {
if (!selection_input_) {
const cv::Mat input_view =
formats::MatView(&video_stream->Get<ImageFrame>());
if (hybrid_meta_analysis_) {
// Seed with meta homography.
RET_CHECK(hybrid_meta_offset_ < meta_motions_.size())
<< "Not enough metadata received for hybrid meta analysis";
Homography initial_transform =
meta_motions_[hybrid_meta_offset_].homography();
std::function<void(RegionFlowFeatureList*)> subtract_helper = std::bind(
&MotionAnalysisCalculator::SubtractMetaMotion, this,
meta_motions_[hybrid_meta_offset_], std::placeholders::_1);
// Keep original features before modification around.
motion_analysis_->AddFrameGeneric(
input_view, timestamp.Value(), initial_transform, nullptr, nullptr,
&subtract_helper, &meta_features_[hybrid_meta_offset_]);
++hybrid_meta_offset_;
} else {
motion_analysis_->AddFrame(input_view, timestamp.Value());
}
} else {
selected_motions_.push_back(frame_selection_result->camera_motion());
switch (options_.selection_analysis()) {
case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION:
return mediapipe::UnknownErrorBuilder(MEDIAPIPE_LOC)
<< "Should not reach this point!";
case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES:
motion_analysis_->AddFeatures(frame_selection_result->features());
break;
case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE: {
const cv::Mat input_view =
formats::MatView(&video_stream->Get<ImageFrame>());
motion_analysis_->AddFrame(input_view, timestamp.Value());
break;
}
case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED: {
Homography homography;
CameraMotionToHomography(frame_selection_result->camera_motion(),
&homography);
const cv::Mat input_view =
formats::MatView(&video_stream->Get<ImageFrame>());
motion_analysis_->AddFrameGeneric(input_view, timestamp.Value(),
homography, &homography);
break;
}
}
}
timestamp_buffer_.push_back(timestamp);
++frame_idx_;
VLOG_EVERY_N(1, 100) << "Analyzed frame " << frame_idx_;
// Buffer input frames only if visualization is requested.
if (visualize_output_ || video_output_) {
packet_buffer_.push_back(video_stream->Value());
}
// If requested, output the downsampled grayscale frame.
if (grayscale_output_) {
cv::Mat grayscale_mat = motion_analysis_->GetGrayscaleFrameFromResults();
std::unique_ptr<ImageFrame> grayscale_image(new ImageFrame(
ImageFormat::GRAY8, grayscale_mat.cols, grayscale_mat.rows));
cv::Mat image_frame_mat = formats::MatView(grayscale_image.get());
grayscale_mat.copyTo(image_frame_mat);
cc->Outputs()
.Tag(kGrayVideoOutTag)
.Add(grayscale_image.release(), timestamp);
}
// Output other results, if we have any yet.
OutputMotionAnalyzedFrames(false, cc);
}
return absl::OkStatus();
}
absl::Status MotionAnalysisCalculator::Close(CalculatorContext* cc) {
// Guard against empty videos.
if (motion_analysis_) {
OutputMotionAnalyzedFrames(true, cc);
}
if (csv_file_input_) {
if (!meta_motions_.empty()) {
LOG(ERROR) << "More motions than frames. Unexpected! Remainder: "
<< meta_motions_.size();
}
}
return absl::OkStatus();
}
void MotionAnalysisCalculator::OutputMotionAnalyzedFrames(
bool flush, CalculatorContext* cc) {
std::vector<std::unique_ptr<RegionFlowFeatureList>> features;
std::vector<std::unique_ptr<CameraMotion>> camera_motions;
std::vector<std::unique_ptr<SalientPointFrame>> saliency;
const int buffer_size = timestamp_buffer_.size();
const int num_results = motion_analysis_->GetResults(
flush, &features, &camera_motions, with_saliency_ ? &saliency : nullptr);
CHECK_LE(num_results, buffer_size);
if (num_results == 0) {
return;
}
for (int k = 0; k < num_results; ++k) {
// Region flow features and camera motion for this frame.
auto& feature_list = features[k];
auto& camera_motion = camera_motions[k];
const Timestamp timestamp = timestamp_buffer_[k];
if (selection_input_ && options_.hybrid_selection_camera()) {
if (camera_motion->type() > selected_motions_.front().type()) {
// Composited type is more stable.
camera_motion->Swap(&selected_motions_.front());
}
selected_motions_.pop_front();
}
if (hybrid_meta_analysis_) {
AddMetaMotion(meta_motions_.front(), meta_features_.front(),
feature_list.get(), camera_motion.get());
meta_motions_.pop_front();
meta_features_.pop_front();
}
// Video frame for visualization.
std::unique_ptr<ImageFrame> visualization_frame;
cv::Mat visualization;
if (visualize_output_) {
// Initialize visualization frame with original frame.
visualization_frame.reset(new ImageFrame());
visualization_frame->CopyFrom(packet_buffer_[k].Get<ImageFrame>(), 16);
visualization = formats::MatView(visualization_frame.get());
motion_analysis_->RenderResults(
*feature_list, *camera_motion,
with_saliency_ ? saliency[k].get() : nullptr, &visualization);
cc->Outputs().Tag(kVizTag).Add(visualization_frame.release(), timestamp);
}
// Output dense foreground mask.
if (dense_foreground_output_) {
std::unique_ptr<ImageFrame> foreground_frame(
new ImageFrame(ImageFormat::GRAY8, frame_width_, frame_height_));
cv::Mat foreground = formats::MatView(foreground_frame.get());
motion_analysis_->ComputeDenseForeground(*feature_list, *camera_motion,
&foreground);
cc->Outputs().Tag(kDenseFgTag).Add(foreground_frame.release(), timestamp);
}
// Output flow features if requested.
if (region_flow_feature_output_) {
cc->Outputs().Tag(kFlowTag).Add(feature_list.release(), timestamp);
}
// Output camera motion.
if (camera_motion_output_) {
cc->Outputs().Tag(kCameraTag).Add(camera_motion.release(), timestamp);
}
if (video_output_) {
cc->Outputs().Tag(kVideoOutTag).AddPacket(packet_buffer_[k]);
}
// Output saliency.
if (saliency_output_) {
cc->Outputs().Tag(kSaliencyTag).Add(saliency[k].release(), timestamp);
}
}
if (hybrid_meta_analysis_) {
hybrid_meta_offset_ -= num_results;
CHECK_GE(hybrid_meta_offset_, 0);
}
timestamp_buffer_.erase(timestamp_buffer_.begin(),
timestamp_buffer_.begin() + num_results);
if (visualize_output_ || video_output_) {
packet_buffer_.erase(packet_buffer_.begin(),
packet_buffer_.begin() + num_results);
}
}
absl::Status MotionAnalysisCalculator::InitOnProcess(
InputStream* video_stream, InputStream* selection_stream) {
if (video_stream) {
frame_width_ = video_stream->Get<ImageFrame>().Width();
frame_height_ = video_stream->Get<ImageFrame>().Height();
// Ensure image options are set correctly.
auto* region_options =
options_.mutable_analysis_options()->mutable_flow_options();
// Use two possible formats to account for different channel orders.
RegionFlowComputationOptions::ImageFormat image_format;
RegionFlowComputationOptions::ImageFormat image_format2;
switch (video_stream->Get<ImageFrame>().Format()) {
case ImageFormat::GRAY8:
image_format = image_format2 =
RegionFlowComputationOptions::FORMAT_GRAYSCALE;
break;
case ImageFormat::SRGB:
image_format = RegionFlowComputationOptions::FORMAT_RGB;
image_format2 = RegionFlowComputationOptions::FORMAT_BGR;
break;
case ImageFormat::SRGBA:
image_format = RegionFlowComputationOptions::FORMAT_RGBA;
image_format2 = RegionFlowComputationOptions::FORMAT_BGRA;
break;
default:
RET_CHECK(false) << "Unsupported image format.";
}
if (region_options->image_format() != image_format &&
region_options->image_format() != image_format2) {
LOG(WARNING) << "Requested image format in RegionFlowComputation "
<< "does not match video stream format. Overriding.";
region_options->set_image_format(image_format);
}
// Account for downsampling mode DOWNSAMPLE_TO_INPUT_SIZE. In this case we
// are handed already downsampled frames, but the resulting CameraMotion
// should be computed at the higher resolution specified by the downsample
// scale.
if (region_options->downsample_mode() ==
RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE) {
const float scale = region_options->downsample_factor();
frame_width_ = static_cast<int>(std::round(frame_width_ * scale));
frame_height_ = static_cast<int>(std::round(frame_height_ * scale));
}
} else if (selection_stream) {
const auto& camera_motion =
selection_stream->Get<FrameSelectionResult>().camera_motion();
frame_width_ = camera_motion.frame_width();
frame_height_ = camera_motion.frame_height();
} else {
LOG(FATAL) << "Either VIDEO or SELECTION stream needs to be specified.";
}
// Filled by CSV file parsing.
if (!meta_homographies_.empty()) {
CHECK(csv_file_input_);
AppendCameraMotionsFromHomographies(meta_homographies_,
true, // append identity.
&meta_motions_, &meta_features_);
meta_homographies_.clear();
}
// Filter initialized IRLS weights before use in hybrid mode.
if (hybrid_meta_analysis_) {
auto* motion_options =
options_.mutable_analysis_options()->mutable_motion_options();
motion_options->set_filter_initialized_irls_weights(true);
}
return absl::OkStatus();
}
bool MotionAnalysisCalculator::ParseModelCSV(
const std::string& contents, std::deque<Homography>* homographies) {
std::vector<absl::string_view> values =
absl::StrSplit(contents, absl::ByAnyChar(",\n"));
// Trim off any trailing empty entries (e.g. from a final newline).
while (!values.empty() && values.back().empty()) {
values.pop_back();
}
// Convert to float.
std::vector<float> homog_values;
homog_values.reserve(values.size());
for (const auto& value : values) {
double value_64f;
if (!absl::SimpleAtod(value, &value_64f)) {
LOG(ERROR) << "Not a double, expected!";
return false;
}
homog_values.push_back(value_64f);
}
return HomographiesFromValues(homog_values, homographies);
}
bool MotionAnalysisCalculator::HomographiesFromValues(
const std::vector<float>& homog_values,
std::deque<Homography>* homographies) {
CHECK(homographies);
// A 3x3 homography is stored as 9 values.
constexpr int kHomographyValues = 9;
if (homog_values.size() % kHomographyValues != 0) {
LOG(ERROR) << "Contents not a multiple of " << kHomographyValues;
return false;
}
for (int k = 0; k < homog_values.size(); k += kHomographyValues) {
std::vector<double> h_vals(kHomographyValues);
for (int l = 0; l < kHomographyValues; ++l) {
h_vals[l] = homog_values[k + l];
}
// Normalize last entry to 1.
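// For example, [2,0,0, 0,2,0, 0,0,2] normalizes to the identity
// homography [1,0,0, 0,1,0, 0,0,1].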
if (h_vals[kHomographyValues - 1] == 0) {
LOG(ERROR) << "Degenerate homography, last entry is zero";
return false;
}
const double scale = 1.0 / h_vals[kHomographyValues - 1];
for (int l = 0; l < kHomographyValues; ++l) {
h_vals[l] *= scale;
}
Homography h = HomographyAdapter::FromDoublePointer(h_vals.data(), false);
homographies->push_back(h);
}
if (homographies->size() % options_.meta_models_per_frame() != 0) {
LOG(ERROR) << "Total homographies not a multiple of specified models "
<< "per frame.";
return false;
}
return true;
}
void MotionAnalysisCalculator::SubtractMetaMotion(
const CameraMotion& meta_motion, RegionFlowFeatureList* features) {
if (meta_motion.mixture_homography().model_size() > 0) {
CHECK(row_weights_ != nullptr);
RegionFlowFeatureListViaTransform(meta_motion.mixture_homography(),
features, -1.0f,
1.0f, // subtract transformed.
true, // replace feature loc.
row_weights_.get());
} else {
RegionFlowFeatureListViaTransform(meta_motion.homography(), features, -1.0f,
1.0f, // subtract transformed.
true); // replace feature loc.
}
// Clamp transformed features to the domain and handle outliers.
const float domain_diam =
std::hypot(features->frame_width(), features->frame_height());
const float motion_mag = meta_motion.average_magnitude();
// Same irls fraction as used by MODEL_MIXTURE_HOMOGRAPHY scaling in
// MotionEstimation.
const float irls_fraction = options_.analysis_options()
.motion_options()
.irls_mixture_fraction_scale() *
options_.analysis_options()
.motion_options()
.irls_motion_magnitude_fraction();
float err_scale = std::max(1.0f, motion_mag * irls_fraction);
const float max_err =
options_.meta_outlier_domain_ratio() * domain_diam * err_scale;
const float max_err_sq = max_err * max_err;
for (auto& feature : *features->mutable_feature()) {
feature.set_x(
std::max(0.0f, std::min(features->frame_width() - 1.0f, feature.x())));
feature.set_y(
std::max(0.0f, std::min(features->frame_height() - 1.0f, feature.y())));
// Label anything with large residual motion an outlier.
if (FeatureFlow(feature).Norm2() > max_err_sq) {
feature.set_irls_weight(0.0f);
}
}
}
void MotionAnalysisCalculator::AddMetaMotion(
const CameraMotion& meta_motion, const RegionFlowFeatureList& meta_features,
RegionFlowFeatureList* features, CameraMotion* motion) {
// Restore old feature location.
CHECK_EQ(meta_features.feature_size(), features->feature_size());
for (int k = 0; k < meta_features.feature_size(); ++k) {
auto feature = features->mutable_feature(k);
const auto& meta_feature = meta_features.feature(k);
feature->set_x(meta_feature.x());
feature->set_y(meta_feature.y());
feature->set_dx(meta_feature.dx());
feature->set_dy(meta_feature.dy());
}
// Composite camera motion.
*motion = ComposeCameraMotion(*motion, meta_motion);
// Restore type from metadata, i.e. do not declare motions as invalid.
motion->set_type(meta_motion.type());
motion->set_match_frame(-1);
}
void MotionAnalysisCalculator::AppendCameraMotionsFromHomographies(
const std::deque<Homography>& homographies, bool append_identity,
std::deque<CameraMotion>* camera_motions,
std::deque<RegionFlowFeatureList>* features) {
CHECK(camera_motions);
CHECK(features);
CameraMotion identity;
identity.set_frame_width(frame_width_);
identity.set_frame_height(frame_height_);
*identity.mutable_translation() = TranslationModel();
*identity.mutable_linear_similarity() = LinearSimilarityModel();
*identity.mutable_homography() = Homography();
identity.set_type(CameraMotion::VALID);
identity.set_match_frame(0);
RegionFlowFeatureList empty_list;
empty_list.set_long_tracks(true);
empty_list.set_match_frame(-1);
empty_list.set_frame_width(frame_width_);
empty_list.set_frame_height(frame_height_);
if (append_identity) {
camera_motions->push_back(identity);
features->push_back(empty_list);
}
const int models_per_frame = options_.meta_models_per_frame();
CHECK_GT(models_per_frame, 0) << "At least one model per frame is needed";
CHECK_EQ(0, homographies.size() % models_per_frame);
const int num_frames = homographies.size() / models_per_frame;
// Heuristic sigma, similar to what we use for rolling shutter removal.
const float mixture_sigma = 1.0f / models_per_frame;
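// E.g., with 4 models per frame, the row-weight sigma below
// (mixture_sigma * frame_height_) spans a quarter of the frame height.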
if (row_weights_ == nullptr) {
row_weights_.reset(new MixtureRowWeights(frame_height_,
frame_height_ / 10, // 10% margin
mixture_sigma * frame_height_,
1.0f, models_per_frame));
}
for (int f = 0; f < num_frames; ++f) {
MixtureHomography mix_homog;
const int model_start = f * models_per_frame;
for (int k = 0; k < models_per_frame; ++k) {
const Homography& homog = homographies[model_start + k];
*mix_homog.add_model() = ModelInvert(homog);
}
CameraMotion c = identity;
c.set_match_frame(-1);
if (mix_homog.model_size() > 1) {
*c.mutable_mixture_homography() = mix_homog;
c.set_mixture_row_sigma(mixture_sigma);
for (int k = 0; k < models_per_frame; ++k) {
c.add_mixture_inlier_coverage(1.0f);
}
*c.add_mixture_homography_spectrum() = mix_homog;
c.set_rolling_shutter_motion_index(0);
*c.mutable_homography() = ProjectViaFit<Homography>(
mix_homog, frame_width_, frame_height_, row_weights_.get());
} else {
// Guaranteed to exist because we check that models_per_frame > 0 above.
*c.mutable_homography() = mix_homog.model(0);
}
// Project remaining motions down.
*c.mutable_linear_similarity() = ProjectViaFit<LinearSimilarityModel>(
c.homography(), frame_width_, frame_height_);
*c.mutable_translation() = ProjectViaFit<TranslationModel>(
c.homography(), frame_width_, frame_height_);
c.set_average_magnitude(
std::hypot(c.translation().dx(), c.translation().dy()));
camera_motions->push_back(c);
features->push_back(empty_list);
}
}
} // namespace mediapipe